aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/relocation.c
diff options
context:
space:
mode:
authorYan Zheng <zheng.yan@oracle.com>2009-06-10 10:45:14 -0400
committerChris Mason <chris.mason@oracle.com>2009-06-10 11:29:46 -0400
commit5d4f98a28c7d334091c1b7744f48a1acdd2a4ae0 (patch)
treec611d7d824cbcdb777dd2d8e33e2ed1c5df8a9c6 /fs/btrfs/relocation.c
parent5c939df56c3ea018b58e5aa76181284c2053d699 (diff)
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata. Once a filesystem has been mounted with this commit, IT WILL NO LONGER BE MOUNTABLE BY OLDER KERNELS. When a tree block in subvolume tree is cow'd, the reference counts of all extents it points to are increased by one. At transaction commit time, the old root of the subvolume is recorded in a "dead root" data structure, and the btree it points to is later walked, dropping reference counts and freeing any blocks where the reference count goes to 0. The increments done during cow and decrements done after commit cancel out, and the walk is a very expensive way to go about freeing the blocks that are no longer referenced by the new btree root. This commit reduces the transaction overhead by avoiding the need for dead root records. When a non-shared tree block is cow'd, we free the old block at once, and the new block inherits old block's references. When a tree block with reference count > 1 is cow'd, we increase the reference counts of all extents the new block points to by one, and decrease the old block's reference count by one. This dead tree avoidance code removes the need to modify the reference counts of lower level extents when a non-shared tree block is cow'd. But we still need to update back ref for all pointers in the block. This is because the location of the block is recorded in the back ref item. We can solve this by introducing a new type of back ref. The new back ref provides information about pointer's key, level and in which tree the pointer lives. This information allow us to find the pointer by searching the tree. The shortcoming of the new back ref is that it only works for pointers in tree blocks referenced by their owner trees. This is mostly a problem for snapshots, where resolving one of these fuzzy back references would be O(number_of_snapshots) and quite slow. The solution used here is to use the fuzzy back references in the common case where a given tree block is only referenced by one root, and use the full back references when multiple roots have a reference on a given block. This commit adds per subvolume red-black tree to keep trace of cached inodes. The red-black tree helps the balancing code to find cached inodes whose inode numbers within a given range. This commit improves the balancing code by introducing several data structures to keep the state of balancing. The most important one is the back ref cache. It caches how the upper level tree blocks are referenced. This greatly reduce the overhead of checking back ref. The improved balancing code scales significantly better with a large number of snapshots. This is a very large commit and was written in a number of pieces. But, they depend heavily on the disk format change and were squashed together to make sure git bisect didn't end up in a bad state wrt space balancing or the format change. Signed-off-by: Yan Zheng <zheng.yan@oracle.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/relocation.c')
-rw-r--r--fs/btrfs/relocation.c3711
1 files changed, 3711 insertions, 0 deletions
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
new file mode 100644
index 000000000000..b23dc209ae10
--- /dev/null
+++ b/fs/btrfs/relocation.c
@@ -0,0 +1,3711 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include "ctree.h"
25#include "disk-io.h"
26#include "transaction.h"
27#include "volumes.h"
28#include "locking.h"
29#include "btrfs_inode.h"
30#include "async-thread.h"
31
32/*
33 * backref_node, mapping_node and tree_block start with this
34 */
35struct tree_entry {
36 struct rb_node rb_node;
37 u64 bytenr;
38};
39
40/*
41 * present a tree block in the backref cache
42 */
43struct backref_node {
44 struct rb_node rb_node;
45 u64 bytenr;
46 /* objectid tree block owner */
47 u64 owner;
48 /* list of upper level blocks reference this block */
49 struct list_head upper;
50 /* list of child blocks in the cache */
51 struct list_head lower;
52 /* NULL if this node is not tree root */
53 struct btrfs_root *root;
54 /* extent buffer got by COW the block */
55 struct extent_buffer *eb;
56 /* level of tree block */
57 unsigned int level:8;
58 /* 1 if the block is root of old snapshot */
59 unsigned int old_root:1;
60 /* 1 if no child blocks in the cache */
61 unsigned int lowest:1;
62 /* is the extent buffer locked */
63 unsigned int locked:1;
64 /* has the block been processed */
65 unsigned int processed:1;
66 /* have backrefs of this block been checked */
67 unsigned int checked:1;
68};
69
70/*
71 * present a block pointer in the backref cache
72 */
73struct backref_edge {
74 struct list_head list[2];
75 struct backref_node *node[2];
76 u64 blockptr;
77};
78
79#define LOWER 0
80#define UPPER 1
81
82struct backref_cache {
83 /* red black tree of all backref nodes in the cache */
84 struct rb_root rb_root;
85 /* list of backref nodes with no child block in the cache */
86 struct list_head pending[BTRFS_MAX_LEVEL];
87 spinlock_t lock;
88};
89
90/*
91 * map address of tree root to tree
92 */
93struct mapping_node {
94 struct rb_node rb_node;
95 u64 bytenr;
96 void *data;
97};
98
99struct mapping_tree {
100 struct rb_root rb_root;
101 spinlock_t lock;
102};
103
104/*
105 * present a tree block to process
106 */
107struct tree_block {
108 struct rb_node rb_node;
109 u64 bytenr;
110 struct btrfs_key key;
111 unsigned int level:8;
112 unsigned int key_ready:1;
113};
114
115/* inode vector */
116#define INODEVEC_SIZE 16
117
118struct inodevec {
119 struct list_head list;
120 struct inode *inode[INODEVEC_SIZE];
121 int nr;
122};
123
124struct reloc_control {
125 /* block group to relocate */
126 struct btrfs_block_group_cache *block_group;
127 /* extent tree */
128 struct btrfs_root *extent_root;
129 /* inode for moving data */
130 struct inode *data_inode;
131 struct btrfs_workers workers;
132 /* tree blocks have been processed */
133 struct extent_io_tree processed_blocks;
134 /* map start of tree root to corresponding reloc tree */
135 struct mapping_tree reloc_root_tree;
136 /* list of reloc trees */
137 struct list_head reloc_roots;
138 u64 search_start;
139 u64 extents_found;
140 u64 extents_skipped;
141 int stage;
142 int create_reloc_root;
143 unsigned int found_file_extent:1;
144 unsigned int found_old_snapshot:1;
145};
146
147/* stages of data relocation */
148#define MOVE_DATA_EXTENTS 0
149#define UPDATE_DATA_PTRS 1
150
151/*
152 * merge reloc tree to corresponding fs tree in worker threads
153 */
154struct async_merge {
155 struct btrfs_work work;
156 struct reloc_control *rc;
157 struct btrfs_root *root;
158 struct completion *done;
159 atomic_t *num_pending;
160};
161
162static void mapping_tree_init(struct mapping_tree *tree)
163{
164 tree->rb_root.rb_node = NULL;
165 spin_lock_init(&tree->lock);
166}
167
168static void backref_cache_init(struct backref_cache *cache)
169{
170 int i;
171 cache->rb_root.rb_node = NULL;
172 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
173 INIT_LIST_HEAD(&cache->pending[i]);
174 spin_lock_init(&cache->lock);
175}
176
177static void backref_node_init(struct backref_node *node)
178{
179 memset(node, 0, sizeof(*node));
180 INIT_LIST_HEAD(&node->upper);
181 INIT_LIST_HEAD(&node->lower);
182 RB_CLEAR_NODE(&node->rb_node);
183}
184
185static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
186 struct rb_node *node)
187{
188 struct rb_node **p = &root->rb_node;
189 struct rb_node *parent = NULL;
190 struct tree_entry *entry;
191
192 while (*p) {
193 parent = *p;
194 entry = rb_entry(parent, struct tree_entry, rb_node);
195
196 if (bytenr < entry->bytenr)
197 p = &(*p)->rb_left;
198 else if (bytenr > entry->bytenr)
199 p = &(*p)->rb_right;
200 else
201 return parent;
202 }
203
204 rb_link_node(node, parent, p);
205 rb_insert_color(node, root);
206 return NULL;
207}
208
209static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
210{
211 struct rb_node *n = root->rb_node;
212 struct tree_entry *entry;
213
214 while (n) {
215 entry = rb_entry(n, struct tree_entry, rb_node);
216
217 if (bytenr < entry->bytenr)
218 n = n->rb_left;
219 else if (bytenr > entry->bytenr)
220 n = n->rb_right;
221 else
222 return n;
223 }
224 return NULL;
225}
226
227/*
228 * walk up backref nodes until reach node presents tree root
229 */
230static struct backref_node *walk_up_backref(struct backref_node *node,
231 struct backref_edge *edges[],
232 int *index)
233{
234 struct backref_edge *edge;
235 int idx = *index;
236
237 while (!list_empty(&node->upper)) {
238 edge = list_entry(node->upper.next,
239 struct backref_edge, list[LOWER]);
240 edges[idx++] = edge;
241 node = edge->node[UPPER];
242 }
243 *index = idx;
244 return node;
245}
246
247/*
248 * walk down backref nodes to find start of next reference path
249 */
250static struct backref_node *walk_down_backref(struct backref_edge *edges[],
251 int *index)
252{
253 struct backref_edge *edge;
254 struct backref_node *lower;
255 int idx = *index;
256
257 while (idx > 0) {
258 edge = edges[idx - 1];
259 lower = edge->node[LOWER];
260 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
261 idx--;
262 continue;
263 }
264 edge = list_entry(edge->list[LOWER].next,
265 struct backref_edge, list[LOWER]);
266 edges[idx - 1] = edge;
267 *index = idx;
268 return edge->node[UPPER];
269 }
270 *index = 0;
271 return NULL;
272}
273
274static void drop_node_buffer(struct backref_node *node)
275{
276 if (node->eb) {
277 if (node->locked) {
278 btrfs_tree_unlock(node->eb);
279 node->locked = 0;
280 }
281 free_extent_buffer(node->eb);
282 node->eb = NULL;
283 }
284}
285
286static void drop_backref_node(struct backref_cache *tree,
287 struct backref_node *node)
288{
289 BUG_ON(!node->lowest);
290 BUG_ON(!list_empty(&node->upper));
291
292 drop_node_buffer(node);
293 list_del(&node->lower);
294
295 rb_erase(&node->rb_node, &tree->rb_root);
296 kfree(node);
297}
298
299/*
300 * remove a backref node from the backref cache
301 */
302static void remove_backref_node(struct backref_cache *cache,
303 struct backref_node *node)
304{
305 struct backref_node *upper;
306 struct backref_edge *edge;
307
308 if (!node)
309 return;
310
311 BUG_ON(!node->lowest);
312 while (!list_empty(&node->upper)) {
313 edge = list_entry(node->upper.next, struct backref_edge,
314 list[LOWER]);
315 upper = edge->node[UPPER];
316 list_del(&edge->list[LOWER]);
317 list_del(&edge->list[UPPER]);
318 kfree(edge);
319 /*
320 * add the node to pending list if no other
321 * child block cached.
322 */
323 if (list_empty(&upper->lower)) {
324 list_add_tail(&upper->lower,
325 &cache->pending[upper->level]);
326 upper->lowest = 1;
327 }
328 }
329 drop_backref_node(cache, node);
330}
331
332/*
333 * find reloc tree by address of tree root
334 */
335static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
336 u64 bytenr)
337{
338 struct rb_node *rb_node;
339 struct mapping_node *node;
340 struct btrfs_root *root = NULL;
341
342 spin_lock(&rc->reloc_root_tree.lock);
343 rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
344 if (rb_node) {
345 node = rb_entry(rb_node, struct mapping_node, rb_node);
346 root = (struct btrfs_root *)node->data;
347 }
348 spin_unlock(&rc->reloc_root_tree.lock);
349 return root;
350}
351
352static int is_cowonly_root(u64 root_objectid)
353{
354 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
355 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
356 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
357 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
358 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
359 root_objectid == BTRFS_CSUM_TREE_OBJECTID)
360 return 1;
361 return 0;
362}
363
364static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
365 u64 root_objectid)
366{
367 struct btrfs_key key;
368
369 key.objectid = root_objectid;
370 key.type = BTRFS_ROOT_ITEM_KEY;
371 if (is_cowonly_root(root_objectid))
372 key.offset = 0;
373 else
374 key.offset = (u64)-1;
375
376 return btrfs_read_fs_root_no_name(fs_info, &key);
377}
378
379#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
380static noinline_for_stack
381struct btrfs_root *find_tree_root(struct reloc_control *rc,
382 struct extent_buffer *leaf,
383 struct btrfs_extent_ref_v0 *ref0)
384{
385 struct btrfs_root *root;
386 u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
387 u64 generation = btrfs_ref_generation_v0(leaf, ref0);
388
389 BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
390
391 root = read_fs_root(rc->extent_root->fs_info, root_objectid);
392 BUG_ON(IS_ERR(root));
393
394 if (root->ref_cows &&
395 generation != btrfs_root_generation(&root->root_item))
396 return NULL;
397
398 return root;
399}
400#endif
401
402static noinline_for_stack
403int find_inline_backref(struct extent_buffer *leaf, int slot,
404 unsigned long *ptr, unsigned long *end)
405{
406 struct btrfs_extent_item *ei;
407 struct btrfs_tree_block_info *bi;
408 u32 item_size;
409
410 item_size = btrfs_item_size_nr(leaf, slot);
411#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
412 if (item_size < sizeof(*ei)) {
413 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
414 return 1;
415 }
416#endif
417 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
418 WARN_ON(!(btrfs_extent_flags(leaf, ei) &
419 BTRFS_EXTENT_FLAG_TREE_BLOCK));
420
421 if (item_size <= sizeof(*ei) + sizeof(*bi)) {
422 WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
423 return 1;
424 }
425
426 bi = (struct btrfs_tree_block_info *)(ei + 1);
427 *ptr = (unsigned long)(bi + 1);
428 *end = (unsigned long)ei + item_size;
429 return 0;
430}
431
432/*
433 * build backref tree for a given tree block. root of the backref tree
434 * corresponds the tree block, leaves of the backref tree correspond
435 * roots of b-trees that reference the tree block.
436 *
437 * the basic idea of this function is check backrefs of a given block
438 * to find upper level blocks that refernece the block, and then check
439 * bakcrefs of these upper level blocks recursively. the recursion stop
440 * when tree root is reached or backrefs for the block is cached.
441 *
442 * NOTE: if we find backrefs for a block are cached, we know backrefs
443 * for all upper level blocks that directly/indirectly reference the
444 * block are also cached.
445 */
446static struct backref_node *build_backref_tree(struct reloc_control *rc,
447 struct backref_cache *cache,
448 struct btrfs_key *node_key,
449 int level, u64 bytenr)
450{
451 struct btrfs_path *path1;
452 struct btrfs_path *path2;
453 struct extent_buffer *eb;
454 struct btrfs_root *root;
455 struct backref_node *cur;
456 struct backref_node *upper;
457 struct backref_node *lower;
458 struct backref_node *node = NULL;
459 struct backref_node *exist = NULL;
460 struct backref_edge *edge;
461 struct rb_node *rb_node;
462 struct btrfs_key key;
463 unsigned long end;
464 unsigned long ptr;
465 LIST_HEAD(list);
466 int ret;
467 int err = 0;
468
469 path1 = btrfs_alloc_path();
470 path2 = btrfs_alloc_path();
471 if (!path1 || !path2) {
472 err = -ENOMEM;
473 goto out;
474 }
475
476 node = kmalloc(sizeof(*node), GFP_NOFS);
477 if (!node) {
478 err = -ENOMEM;
479 goto out;
480 }
481
482 backref_node_init(node);
483 node->bytenr = bytenr;
484 node->owner = 0;
485 node->level = level;
486 node->lowest = 1;
487 cur = node;
488again:
489 end = 0;
490 ptr = 0;
491 key.objectid = cur->bytenr;
492 key.type = BTRFS_EXTENT_ITEM_KEY;
493 key.offset = (u64)-1;
494
495 path1->search_commit_root = 1;
496 path1->skip_locking = 1;
497 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
498 0, 0);
499 if (ret < 0) {
500 err = ret;
501 goto out;
502 }
503 BUG_ON(!ret || !path1->slots[0]);
504
505 path1->slots[0]--;
506
507 WARN_ON(cur->checked);
508 if (!list_empty(&cur->upper)) {
509 /*
510 * the backref was added previously when processsing
511 * backref of type BTRFS_TREE_BLOCK_REF_KEY
512 */
513 BUG_ON(!list_is_singular(&cur->upper));
514 edge = list_entry(cur->upper.next, struct backref_edge,
515 list[LOWER]);
516 BUG_ON(!list_empty(&edge->list[UPPER]));
517 exist = edge->node[UPPER];
518 /*
519 * add the upper level block to pending list if we need
520 * check its backrefs
521 */
522 if (!exist->checked)
523 list_add_tail(&edge->list[UPPER], &list);
524 } else {
525 exist = NULL;
526 }
527
528 while (1) {
529 cond_resched();
530 eb = path1->nodes[0];
531
532 if (ptr >= end) {
533 if (path1->slots[0] >= btrfs_header_nritems(eb)) {
534 ret = btrfs_next_leaf(rc->extent_root, path1);
535 if (ret < 0) {
536 err = ret;
537 goto out;
538 }
539 if (ret > 0)
540 break;
541 eb = path1->nodes[0];
542 }
543
544 btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
545 if (key.objectid != cur->bytenr) {
546 WARN_ON(exist);
547 break;
548 }
549
550 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
551 ret = find_inline_backref(eb, path1->slots[0],
552 &ptr, &end);
553 if (ret)
554 goto next;
555 }
556 }
557
558 if (ptr < end) {
559 /* update key for inline back ref */
560 struct btrfs_extent_inline_ref *iref;
561 iref = (struct btrfs_extent_inline_ref *)ptr;
562 key.type = btrfs_extent_inline_ref_type(eb, iref);
563 key.offset = btrfs_extent_inline_ref_offset(eb, iref);
564 WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
565 key.type != BTRFS_SHARED_BLOCK_REF_KEY);
566 }
567
568 if (exist &&
569 ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
570 exist->owner == key.offset) ||
571 (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
572 exist->bytenr == key.offset))) {
573 exist = NULL;
574 goto next;
575 }
576
577#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
578 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
579 key.type == BTRFS_EXTENT_REF_V0_KEY) {
580 if (key.objectid == key.offset &&
581 key.type == BTRFS_EXTENT_REF_V0_KEY) {
582 struct btrfs_extent_ref_v0 *ref0;
583 ref0 = btrfs_item_ptr(eb, path1->slots[0],
584 struct btrfs_extent_ref_v0);
585 root = find_tree_root(rc, eb, ref0);
586 if (root)
587 cur->root = root;
588 else
589 cur->old_root = 1;
590 break;
591 }
592#else
593 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
594 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
595#endif
596 if (key.objectid == key.offset) {
597 /*
598 * only root blocks of reloc trees use
599 * backref of this type.
600 */
601 root = find_reloc_root(rc, cur->bytenr);
602 BUG_ON(!root);
603 cur->root = root;
604 break;
605 }
606
607 edge = kzalloc(sizeof(*edge), GFP_NOFS);
608 if (!edge) {
609 err = -ENOMEM;
610 goto out;
611 }
612 rb_node = tree_search(&cache->rb_root, key.offset);
613 if (!rb_node) {
614 upper = kmalloc(sizeof(*upper), GFP_NOFS);
615 if (!upper) {
616 kfree(edge);
617 err = -ENOMEM;
618 goto out;
619 }
620 backref_node_init(upper);
621 upper->bytenr = key.offset;
622 upper->owner = 0;
623 upper->level = cur->level + 1;
624 /*
625 * backrefs for the upper level block isn't
626 * cached, add the block to pending list
627 */
628 list_add_tail(&edge->list[UPPER], &list);
629 } else {
630 upper = rb_entry(rb_node, struct backref_node,
631 rb_node);
632 INIT_LIST_HEAD(&edge->list[UPPER]);
633 }
634 list_add(&edge->list[LOWER], &cur->upper);
635 edge->node[UPPER] = upper;
636 edge->node[LOWER] = cur;
637
638 goto next;
639 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
640 goto next;
641 }
642
643 /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
644 root = read_fs_root(rc->extent_root->fs_info, key.offset);
645 if (IS_ERR(root)) {
646 err = PTR_ERR(root);
647 goto out;
648 }
649
650 if (btrfs_root_level(&root->root_item) == cur->level) {
651 /* tree root */
652 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
653 cur->bytenr);
654 cur->root = root;
655 break;
656 }
657
658 level = cur->level + 1;
659
660 /*
661 * searching the tree to find upper level blocks
662 * reference the block.
663 */
664 path2->search_commit_root = 1;
665 path2->skip_locking = 1;
666 path2->lowest_level = level;
667 ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
668 path2->lowest_level = 0;
669 if (ret < 0) {
670 err = ret;
671 goto out;
672 }
673
674 eb = path2->nodes[level];
675 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
676 cur->bytenr);
677
678 lower = cur;
679 for (; level < BTRFS_MAX_LEVEL; level++) {
680 if (!path2->nodes[level]) {
681 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
682 lower->bytenr);
683 lower->root = root;
684 break;
685 }
686
687 edge = kzalloc(sizeof(*edge), GFP_NOFS);
688 if (!edge) {
689 err = -ENOMEM;
690 goto out;
691 }
692
693 eb = path2->nodes[level];
694 rb_node = tree_search(&cache->rb_root, eb->start);
695 if (!rb_node) {
696 upper = kmalloc(sizeof(*upper), GFP_NOFS);
697 if (!upper) {
698 kfree(edge);
699 err = -ENOMEM;
700 goto out;
701 }
702 backref_node_init(upper);
703 upper->bytenr = eb->start;
704 upper->owner = btrfs_header_owner(eb);
705 upper->level = lower->level + 1;
706
707 /*
708 * if we know the block isn't shared
709 * we can void checking its backrefs.
710 */
711 if (btrfs_block_can_be_shared(root, eb))
712 upper->checked = 0;
713 else
714 upper->checked = 1;
715
716 /*
717 * add the block to pending list if we
718 * need check its backrefs. only block
719 * at 'cur->level + 1' is added to the
720 * tail of pending list. this guarantees
721 * we check backrefs from lower level
722 * blocks to upper level blocks.
723 */
724 if (!upper->checked &&
725 level == cur->level + 1) {
726 list_add_tail(&edge->list[UPPER],
727 &list);
728 } else
729 INIT_LIST_HEAD(&edge->list[UPPER]);
730 } else {
731 upper = rb_entry(rb_node, struct backref_node,
732 rb_node);
733 BUG_ON(!upper->checked);
734 INIT_LIST_HEAD(&edge->list[UPPER]);
735 }
736 list_add_tail(&edge->list[LOWER], &lower->upper);
737 edge->node[UPPER] = upper;
738 edge->node[LOWER] = lower;
739
740 if (rb_node)
741 break;
742 lower = upper;
743 upper = NULL;
744 }
745 btrfs_release_path(root, path2);
746next:
747 if (ptr < end) {
748 ptr += btrfs_extent_inline_ref_size(key.type);
749 if (ptr >= end) {
750 WARN_ON(ptr > end);
751 ptr = 0;
752 end = 0;
753 }
754 }
755 if (ptr >= end)
756 path1->slots[0]++;
757 }
758 btrfs_release_path(rc->extent_root, path1);
759
760 cur->checked = 1;
761 WARN_ON(exist);
762
763 /* the pending list isn't empty, take the first block to process */
764 if (!list_empty(&list)) {
765 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
766 list_del_init(&edge->list[UPPER]);
767 cur = edge->node[UPPER];
768 goto again;
769 }
770
771 /*
772 * everything goes well, connect backref nodes and insert backref nodes
773 * into the cache.
774 */
775 BUG_ON(!node->checked);
776 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
777 BUG_ON(rb_node);
778
779 list_for_each_entry(edge, &node->upper, list[LOWER])
780 list_add_tail(&edge->list[UPPER], &list);
781
782 while (!list_empty(&list)) {
783 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
784 list_del_init(&edge->list[UPPER]);
785 upper = edge->node[UPPER];
786
787 if (!RB_EMPTY_NODE(&upper->rb_node)) {
788 if (upper->lowest) {
789 list_del_init(&upper->lower);
790 upper->lowest = 0;
791 }
792
793 list_add_tail(&edge->list[UPPER], &upper->lower);
794 continue;
795 }
796
797 BUG_ON(!upper->checked);
798 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
799 &upper->rb_node);
800 BUG_ON(rb_node);
801
802 list_add_tail(&edge->list[UPPER], &upper->lower);
803
804 list_for_each_entry(edge, &upper->upper, list[LOWER])
805 list_add_tail(&edge->list[UPPER], &list);
806 }
807out:
808 btrfs_free_path(path1);
809 btrfs_free_path(path2);
810 if (err) {
811 INIT_LIST_HEAD(&list);
812 upper = node;
813 while (upper) {
814 if (RB_EMPTY_NODE(&upper->rb_node)) {
815 list_splice_tail(&upper->upper, &list);
816 kfree(upper);
817 }
818
819 if (list_empty(&list))
820 break;
821
822 edge = list_entry(list.next, struct backref_edge,
823 list[LOWER]);
824 upper = edge->node[UPPER];
825 kfree(edge);
826 }
827 return ERR_PTR(err);
828 }
829 return node;
830}
831
832/*
833 * helper to add 'address of tree root -> reloc tree' mapping
834 */
835static int __add_reloc_root(struct btrfs_root *root)
836{
837 struct rb_node *rb_node;
838 struct mapping_node *node;
839 struct reloc_control *rc = root->fs_info->reloc_ctl;
840
841 node = kmalloc(sizeof(*node), GFP_NOFS);
842 BUG_ON(!node);
843
844 node->bytenr = root->node->start;
845 node->data = root;
846
847 spin_lock(&rc->reloc_root_tree.lock);
848 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
849 node->bytenr, &node->rb_node);
850 spin_unlock(&rc->reloc_root_tree.lock);
851 BUG_ON(rb_node);
852
853 list_add_tail(&root->root_list, &rc->reloc_roots);
854 return 0;
855}
856
857/*
858 * helper to update/delete the 'address of tree root -> reloc tree'
859 * mapping
860 */
861static int __update_reloc_root(struct btrfs_root *root, int del)
862{
863 struct rb_node *rb_node;
864 struct mapping_node *node = NULL;
865 struct reloc_control *rc = root->fs_info->reloc_ctl;
866
867 spin_lock(&rc->reloc_root_tree.lock);
868 rb_node = tree_search(&rc->reloc_root_tree.rb_root,
869 root->commit_root->start);
870 if (rb_node) {
871 node = rb_entry(rb_node, struct mapping_node, rb_node);
872 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
873 }
874 spin_unlock(&rc->reloc_root_tree.lock);
875
876 BUG_ON((struct btrfs_root *)node->data != root);
877
878 if (!del) {
879 spin_lock(&rc->reloc_root_tree.lock);
880 node->bytenr = root->node->start;
881 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
882 node->bytenr, &node->rb_node);
883 spin_unlock(&rc->reloc_root_tree.lock);
884 BUG_ON(rb_node);
885 } else {
886 list_del_init(&root->root_list);
887 kfree(node);
888 }
889 return 0;
890}
891
892/*
893 * create reloc tree for a given fs tree. reloc tree is just a
894 * snapshot of the fs tree with special root objectid.
895 */
896int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
897 struct btrfs_root *root)
898{
899 struct btrfs_root *reloc_root;
900 struct extent_buffer *eb;
901 struct btrfs_root_item *root_item;
902 struct btrfs_key root_key;
903 int ret;
904
905 if (root->reloc_root) {
906 reloc_root = root->reloc_root;
907 reloc_root->last_trans = trans->transid;
908 return 0;
909 }
910
911 if (!root->fs_info->reloc_ctl ||
912 !root->fs_info->reloc_ctl->create_reloc_root ||
913 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
914 return 0;
915
916 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
917 BUG_ON(!root_item);
918
919 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
920 root_key.type = BTRFS_ROOT_ITEM_KEY;
921 root_key.offset = root->root_key.objectid;
922
923 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
924 BTRFS_TREE_RELOC_OBJECTID);
925 BUG_ON(ret);
926
927 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
928 memcpy(root_item, &root->root_item, sizeof(*root_item));
929 btrfs_set_root_refs(root_item, 1);
930 btrfs_set_root_bytenr(root_item, eb->start);
931 btrfs_set_root_level(root_item, btrfs_header_level(eb));
932 btrfs_set_root_generation(root_item, trans->transid);
933 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
934 root_item->drop_level = 0;
935
936 btrfs_tree_unlock(eb);
937 free_extent_buffer(eb);
938
939 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
940 &root_key, root_item);
941 BUG_ON(ret);
942 kfree(root_item);
943
944 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
945 &root_key);
946 BUG_ON(IS_ERR(reloc_root));
947 reloc_root->last_trans = trans->transid;
948
949 __add_reloc_root(reloc_root);
950 root->reloc_root = reloc_root;
951 return 0;
952}
953
954/*
955 * update root item of reloc tree
956 */
957int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
958 struct btrfs_root *root)
959{
960 struct btrfs_root *reloc_root;
961 struct btrfs_root_item *root_item;
962 int del = 0;
963 int ret;
964
965 if (!root->reloc_root)
966 return 0;
967
968 reloc_root = root->reloc_root;
969 root_item = &reloc_root->root_item;
970
971 if (btrfs_root_refs(root_item) == 0) {
972 root->reloc_root = NULL;
973 del = 1;
974 }
975
976 __update_reloc_root(reloc_root, del);
977
978 if (reloc_root->commit_root != reloc_root->node) {
979 btrfs_set_root_node(root_item, reloc_root->node);
980 free_extent_buffer(reloc_root->commit_root);
981 reloc_root->commit_root = btrfs_root_node(reloc_root);
982 }
983
984 ret = btrfs_update_root(trans, root->fs_info->tree_root,
985 &reloc_root->root_key, root_item);
986 BUG_ON(ret);
987 return 0;
988}
989
990/*
991 * helper to find first cached inode with inode number >= objectid
992 * in a subvolume
993 */
994static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
995{
996 struct rb_node *node;
997 struct rb_node *prev;
998 struct btrfs_inode *entry;
999 struct inode *inode;
1000
1001 spin_lock(&root->inode_lock);
1002again:
1003 node = root->inode_tree.rb_node;
1004 prev = NULL;
1005 while (node) {
1006 prev = node;
1007 entry = rb_entry(node, struct btrfs_inode, rb_node);
1008
1009 if (objectid < entry->vfs_inode.i_ino)
1010 node = node->rb_left;
1011 else if (objectid > entry->vfs_inode.i_ino)
1012 node = node->rb_right;
1013 else
1014 break;
1015 }
1016 if (!node) {
1017 while (prev) {
1018 entry = rb_entry(prev, struct btrfs_inode, rb_node);
1019 if (objectid <= entry->vfs_inode.i_ino) {
1020 node = prev;
1021 break;
1022 }
1023 prev = rb_next(prev);
1024 }
1025 }
1026 while (node) {
1027 entry = rb_entry(node, struct btrfs_inode, rb_node);
1028 inode = igrab(&entry->vfs_inode);
1029 if (inode) {
1030 spin_unlock(&root->inode_lock);
1031 return inode;
1032 }
1033
1034 objectid = entry->vfs_inode.i_ino + 1;
1035 if (cond_resched_lock(&root->inode_lock))
1036 goto again;
1037
1038 node = rb_next(node);
1039 }
1040 spin_unlock(&root->inode_lock);
1041 return NULL;
1042}
1043
1044static int in_block_group(u64 bytenr,
1045 struct btrfs_block_group_cache *block_group)
1046{
1047 if (bytenr >= block_group->key.objectid &&
1048 bytenr < block_group->key.objectid + block_group->key.offset)
1049 return 1;
1050 return 0;
1051}
1052
1053/*
1054 * get new location of data
1055 */
1056static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1057 u64 bytenr, u64 num_bytes)
1058{
1059 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1060 struct btrfs_path *path;
1061 struct btrfs_file_extent_item *fi;
1062 struct extent_buffer *leaf;
1063 int ret;
1064
1065 path = btrfs_alloc_path();
1066 if (!path)
1067 return -ENOMEM;
1068
1069 bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1070 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
1071 bytenr, 0);
1072 if (ret < 0)
1073 goto out;
1074 if (ret > 0) {
1075 ret = -ENOENT;
1076 goto out;
1077 }
1078
1079 leaf = path->nodes[0];
1080 fi = btrfs_item_ptr(leaf, path->slots[0],
1081 struct btrfs_file_extent_item);
1082
1083 BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1084 btrfs_file_extent_compression(leaf, fi) ||
1085 btrfs_file_extent_encryption(leaf, fi) ||
1086 btrfs_file_extent_other_encoding(leaf, fi));
1087
1088 if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1089 ret = 1;
1090 goto out;
1091 }
1092
1093 if (new_bytenr)
1094 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1095 ret = 0;
1096out:
1097 btrfs_free_path(path);
1098 return ret;
1099}
1100
1101/*
1102 * update file extent items in the tree leaf to point to
1103 * the new locations.
1104 */
1105static int replace_file_extents(struct btrfs_trans_handle *trans,
1106 struct reloc_control *rc,
1107 struct btrfs_root *root,
1108 struct extent_buffer *leaf,
1109 struct list_head *inode_list)
1110{
1111 struct btrfs_key key;
1112 struct btrfs_file_extent_item *fi;
1113 struct inode *inode = NULL;
1114 struct inodevec *ivec = NULL;
1115 u64 parent;
1116 u64 bytenr;
1117 u64 new_bytenr;
1118 u64 num_bytes;
1119 u64 end;
1120 u32 nritems;
1121 u32 i;
1122 int ret;
1123 int first = 1;
1124 int dirty = 0;
1125
1126 if (rc->stage != UPDATE_DATA_PTRS)
1127 return 0;
1128
1129 /* reloc trees always use full backref */
1130 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1131 parent = leaf->start;
1132 else
1133 parent = 0;
1134
1135 nritems = btrfs_header_nritems(leaf);
1136 for (i = 0; i < nritems; i++) {
1137 cond_resched();
1138 btrfs_item_key_to_cpu(leaf, &key, i);
1139 if (key.type != BTRFS_EXTENT_DATA_KEY)
1140 continue;
1141 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1142 if (btrfs_file_extent_type(leaf, fi) ==
1143 BTRFS_FILE_EXTENT_INLINE)
1144 continue;
1145 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1146 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1147 if (bytenr == 0)
1148 continue;
1149 if (!in_block_group(bytenr, rc->block_group))
1150 continue;
1151
1152 /*
1153 * if we are modifying block in fs tree, wait for readpage
1154 * to complete and drop the extent cache
1155 */
1156 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1157 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1158 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1159 BUG_ON(!ivec);
1160 ivec->nr = 0;
1161 list_add_tail(&ivec->list, inode_list);
1162 }
1163 if (first) {
1164 inode = find_next_inode(root, key.objectid);
1165 if (inode)
1166 ivec->inode[ivec->nr++] = inode;
1167 first = 0;
1168 } else if (inode && inode->i_ino < key.objectid) {
1169 inode = find_next_inode(root, key.objectid);
1170 if (inode)
1171 ivec->inode[ivec->nr++] = inode;
1172 }
1173 if (inode && inode->i_ino == key.objectid) {
1174 end = key.offset +
1175 btrfs_file_extent_num_bytes(leaf, fi);
1176 WARN_ON(!IS_ALIGNED(key.offset,
1177 root->sectorsize));
1178 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1179 end--;
1180 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1181 key.offset, end,
1182 GFP_NOFS);
1183 if (!ret)
1184 continue;
1185
1186 btrfs_drop_extent_cache(inode, key.offset, end,
1187 1);
1188 unlock_extent(&BTRFS_I(inode)->io_tree,
1189 key.offset, end, GFP_NOFS);
1190 }
1191 }
1192
1193 ret = get_new_location(rc->data_inode, &new_bytenr,
1194 bytenr, num_bytes);
1195 if (ret > 0)
1196 continue;
1197 BUG_ON(ret < 0);
1198
1199 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1200 dirty = 1;
1201
1202 key.offset -= btrfs_file_extent_offset(leaf, fi);
1203 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1204 num_bytes, parent,
1205 btrfs_header_owner(leaf),
1206 key.objectid, key.offset);
1207 BUG_ON(ret);
1208
1209 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1210 parent, btrfs_header_owner(leaf),
1211 key.objectid, key.offset);
1212 BUG_ON(ret);
1213 }
1214 if (dirty)
1215 btrfs_mark_buffer_dirty(leaf);
1216 return 0;
1217}
1218
1219static noinline_for_stack
1220int memcmp_node_keys(struct extent_buffer *eb, int slot,
1221 struct btrfs_path *path, int level)
1222{
1223 struct btrfs_disk_key key1;
1224 struct btrfs_disk_key key2;
1225 btrfs_node_key(eb, &key1, slot);
1226 btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1227 return memcmp(&key1, &key2, sizeof(key1));
1228}
1229
1230/*
1231 * try to replace tree blocks in fs tree with the new blocks
1232 * in reloc tree. tree blocks haven't been modified since the
1233 * reloc tree was create can be replaced.
1234 *
1235 * if a block was replaced, level of the block + 1 is returned.
1236 * if no block got replaced, 0 is returned. if there are other
1237 * errors, a negative error number is returned.
1238 */
1239static int replace_path(struct btrfs_trans_handle *trans,
1240 struct btrfs_root *dest, struct btrfs_root *src,
1241 struct btrfs_path *path, struct btrfs_key *next_key,
1242 struct extent_buffer **leaf,
1243 int lowest_level, int max_level)
1244{
1245 struct extent_buffer *eb;
1246 struct extent_buffer *parent;
1247 struct btrfs_key key;
1248 u64 old_bytenr;
1249 u64 new_bytenr;
1250 u64 old_ptr_gen;
1251 u64 new_ptr_gen;
1252 u64 last_snapshot;
1253 u32 blocksize;
1254 int level;
1255 int ret;
1256 int slot;
1257
1258 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1259 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1260 BUG_ON(lowest_level > 1 && leaf);
1261
1262 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1263
1264 slot = path->slots[lowest_level];
1265 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1266
1267 eb = btrfs_lock_root_node(dest);
1268 btrfs_set_lock_blocking(eb);
1269 level = btrfs_header_level(eb);
1270
1271 if (level < lowest_level) {
1272 btrfs_tree_unlock(eb);
1273 free_extent_buffer(eb);
1274 return 0;
1275 }
1276
1277 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1278 BUG_ON(ret);
1279 btrfs_set_lock_blocking(eb);
1280
1281 if (next_key) {
1282 next_key->objectid = (u64)-1;
1283 next_key->type = (u8)-1;
1284 next_key->offset = (u64)-1;
1285 }
1286
1287 parent = eb;
1288 while (1) {
1289 level = btrfs_header_level(parent);
1290 BUG_ON(level < lowest_level);
1291
1292 ret = btrfs_bin_search(parent, &key, level, &slot);
1293 if (ret && slot > 0)
1294 slot--;
1295
1296 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1297 btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1298
1299 old_bytenr = btrfs_node_blockptr(parent, slot);
1300 blocksize = btrfs_level_size(dest, level - 1);
1301 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1302
1303 if (level <= max_level) {
1304 eb = path->nodes[level];
1305 new_bytenr = btrfs_node_blockptr(eb,
1306 path->slots[level]);
1307 new_ptr_gen = btrfs_node_ptr_generation(eb,
1308 path->slots[level]);
1309 } else {
1310 new_bytenr = 0;
1311 new_ptr_gen = 0;
1312 }
1313
1314 if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1315 WARN_ON(1);
1316 ret = level;
1317 break;
1318 }
1319
1320 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1321 memcmp_node_keys(parent, slot, path, level)) {
1322 if (level <= lowest_level && !leaf) {
1323 ret = 0;
1324 break;
1325 }
1326
1327 eb = read_tree_block(dest, old_bytenr, blocksize,
1328 old_ptr_gen);
1329 btrfs_tree_lock(eb);
1330 ret = btrfs_cow_block(trans, dest, eb, parent,
1331 slot, &eb);
1332 BUG_ON(ret);
1333 btrfs_set_lock_blocking(eb);
1334
1335 if (level <= lowest_level) {
1336 *leaf = eb;
1337 ret = 0;
1338 break;
1339 }
1340
1341 btrfs_tree_unlock(parent);
1342 free_extent_buffer(parent);
1343
1344 parent = eb;
1345 continue;
1346 }
1347
1348 btrfs_node_key_to_cpu(path->nodes[level], &key,
1349 path->slots[level]);
1350 btrfs_release_path(src, path);
1351
1352 path->lowest_level = level;
1353 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1354 path->lowest_level = 0;
1355 BUG_ON(ret);
1356
1357 /*
1358 * swap blocks in fs tree and reloc tree.
1359 */
1360 btrfs_set_node_blockptr(parent, slot, new_bytenr);
1361 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1362 btrfs_mark_buffer_dirty(parent);
1363
1364 btrfs_set_node_blockptr(path->nodes[level],
1365 path->slots[level], old_bytenr);
1366 btrfs_set_node_ptr_generation(path->nodes[level],
1367 path->slots[level], old_ptr_gen);
1368 btrfs_mark_buffer_dirty(path->nodes[level]);
1369
1370 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1371 path->nodes[level]->start,
1372 src->root_key.objectid, level - 1, 0);
1373 BUG_ON(ret);
1374 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1375 0, dest->root_key.objectid, level - 1,
1376 0);
1377 BUG_ON(ret);
1378
1379 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1380 path->nodes[level]->start,
1381 src->root_key.objectid, level - 1, 0);
1382 BUG_ON(ret);
1383
1384 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1385 0, dest->root_key.objectid, level - 1,
1386 0);
1387 BUG_ON(ret);
1388
1389 btrfs_unlock_up_safe(path, 0);
1390
1391 ret = level;
1392 break;
1393 }
1394 btrfs_tree_unlock(parent);
1395 free_extent_buffer(parent);
1396 return ret;
1397}
1398
1399/*
1400 * helper to find next relocated block in reloc tree
1401 */
1402static noinline_for_stack
1403int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1404 int *level)
1405{
1406 struct extent_buffer *eb;
1407 int i;
1408 u64 last_snapshot;
1409 u32 nritems;
1410
1411 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1412
1413 for (i = 0; i < *level; i++) {
1414 free_extent_buffer(path->nodes[i]);
1415 path->nodes[i] = NULL;
1416 }
1417
1418 for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1419 eb = path->nodes[i];
1420 nritems = btrfs_header_nritems(eb);
1421 while (path->slots[i] + 1 < nritems) {
1422 path->slots[i]++;
1423 if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1424 last_snapshot)
1425 continue;
1426
1427 *level = i;
1428 return 0;
1429 }
1430 free_extent_buffer(path->nodes[i]);
1431 path->nodes[i] = NULL;
1432 }
1433 return 1;
1434}
1435
1436/*
1437 * walk down reloc tree to find relocated block of lowest level
1438 */
1439static noinline_for_stack
1440int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1441 int *level)
1442{
1443 struct extent_buffer *eb = NULL;
1444 int i;
1445 u64 bytenr;
1446 u64 ptr_gen = 0;
1447 u64 last_snapshot;
1448 u32 blocksize;
1449 u32 nritems;
1450
1451 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1452
1453 for (i = *level; i > 0; i--) {
1454 eb = path->nodes[i];
1455 nritems = btrfs_header_nritems(eb);
1456 while (path->slots[i] < nritems) {
1457 ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1458 if (ptr_gen > last_snapshot)
1459 break;
1460 path->slots[i]++;
1461 }
1462 if (path->slots[i] >= nritems) {
1463 if (i == *level)
1464 break;
1465 *level = i + 1;
1466 return 0;
1467 }
1468 if (i == 1) {
1469 *level = i;
1470 return 0;
1471 }
1472
1473 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1474 blocksize = btrfs_level_size(root, i - 1);
1475 eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1476 BUG_ON(btrfs_header_level(eb) != i - 1);
1477 path->nodes[i - 1] = eb;
1478 path->slots[i - 1] = 0;
1479 }
1480 return 1;
1481}
1482
1483/*
1484 * invalidate extent cache for file extents whose key in range of
1485 * [min_key, max_key)
1486 */
1487static int invalidate_extent_cache(struct btrfs_root *root,
1488 struct btrfs_key *min_key,
1489 struct btrfs_key *max_key)
1490{
1491 struct inode *inode = NULL;
1492 u64 objectid;
1493 u64 start, end;
1494
1495 objectid = min_key->objectid;
1496 while (1) {
1497 cond_resched();
1498 iput(inode);
1499
1500 if (objectid > max_key->objectid)
1501 break;
1502
1503 inode = find_next_inode(root, objectid);
1504 if (!inode)
1505 break;
1506
1507 if (inode->i_ino > max_key->objectid) {
1508 iput(inode);
1509 break;
1510 }
1511
1512 objectid = inode->i_ino + 1;
1513 if (!S_ISREG(inode->i_mode))
1514 continue;
1515
1516 if (unlikely(min_key->objectid == inode->i_ino)) {
1517 if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1518 continue;
1519 if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1520 start = 0;
1521 else {
1522 start = min_key->offset;
1523 WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1524 }
1525 } else {
1526 start = 0;
1527 }
1528
1529 if (unlikely(max_key->objectid == inode->i_ino)) {
1530 if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1531 continue;
1532 if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1533 end = (u64)-1;
1534 } else {
1535 if (max_key->offset == 0)
1536 continue;
1537 end = max_key->offset;
1538 WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1539 end--;
1540 }
1541 } else {
1542 end = (u64)-1;
1543 }
1544
1545 /* the lock_extent waits for readpage to complete */
1546 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1547 btrfs_drop_extent_cache(inode, start, end, 1);
1548 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1549 }
1550 return 0;
1551}
1552
1553static int find_next_key(struct btrfs_path *path, int level,
1554 struct btrfs_key *key)
1555
1556{
1557 while (level < BTRFS_MAX_LEVEL) {
1558 if (!path->nodes[level])
1559 break;
1560 if (path->slots[level] + 1 <
1561 btrfs_header_nritems(path->nodes[level])) {
1562 btrfs_node_key_to_cpu(path->nodes[level], key,
1563 path->slots[level] + 1);
1564 return 0;
1565 }
1566 level++;
1567 }
1568 return 1;
1569}
1570
1571/*
1572 * merge the relocated tree blocks in reloc tree with corresponding
1573 * fs tree.
1574 */
1575static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1576 struct btrfs_root *root)
1577{
1578 LIST_HEAD(inode_list);
1579 struct btrfs_key key;
1580 struct btrfs_key next_key;
1581 struct btrfs_trans_handle *trans;
1582 struct btrfs_root *reloc_root;
1583 struct btrfs_root_item *root_item;
1584 struct btrfs_path *path;
1585 struct extent_buffer *leaf = NULL;
1586 unsigned long nr;
1587 int level;
1588 int max_level;
1589 int replaced = 0;
1590 int ret;
1591 int err = 0;
1592
1593 path = btrfs_alloc_path();
1594 if (!path)
1595 return -ENOMEM;
1596
1597 reloc_root = root->reloc_root;
1598 root_item = &reloc_root->root_item;
1599
1600 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1601 level = btrfs_root_level(root_item);
1602 extent_buffer_get(reloc_root->node);
1603 path->nodes[level] = reloc_root->node;
1604 path->slots[level] = 0;
1605 } else {
1606 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1607
1608 level = root_item->drop_level;
1609 BUG_ON(level == 0);
1610 path->lowest_level = level;
1611 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1612 if (ret < 0) {
1613 btrfs_free_path(path);
1614 return ret;
1615 }
1616
1617 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1618 path->slots[level]);
1619 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1620
1621 btrfs_unlock_up_safe(path, 0);
1622 }
1623
1624 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
1625 trans = btrfs_start_transaction(root, 1);
1626
1627 leaf = path->nodes[0];
1628 btrfs_item_key_to_cpu(leaf, &key, 0);
1629 btrfs_release_path(reloc_root, path);
1630
1631 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1632 if (ret < 0) {
1633 err = ret;
1634 goto out;
1635 }
1636
1637 leaf = path->nodes[0];
1638 btrfs_unlock_up_safe(path, 1);
1639 ret = replace_file_extents(trans, rc, root, leaf,
1640 &inode_list);
1641 if (ret < 0)
1642 err = ret;
1643 goto out;
1644 }
1645
1646 memset(&next_key, 0, sizeof(next_key));
1647
1648 while (1) {
1649 leaf = NULL;
1650 replaced = 0;
1651 trans = btrfs_start_transaction(root, 1);
1652 max_level = level;
1653
1654 ret = walk_down_reloc_tree(reloc_root, path, &level);
1655 if (ret < 0) {
1656 err = ret;
1657 goto out;
1658 }
1659 if (ret > 0)
1660 break;
1661
1662 if (!find_next_key(path, level, &key) &&
1663 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1664 ret = 0;
1665 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1666 ret = replace_path(trans, root, reloc_root,
1667 path, &next_key, &leaf,
1668 level, max_level);
1669 } else {
1670 ret = replace_path(trans, root, reloc_root,
1671 path, &next_key, NULL,
1672 level, max_level);
1673 }
1674 if (ret < 0) {
1675 err = ret;
1676 goto out;
1677 }
1678
1679 if (ret > 0) {
1680 level = ret;
1681 btrfs_node_key_to_cpu(path->nodes[level], &key,
1682 path->slots[level]);
1683 replaced = 1;
1684 } else if (leaf) {
1685 /*
1686 * no block got replaced, try replacing file extents
1687 */
1688 btrfs_item_key_to_cpu(leaf, &key, 0);
1689 ret = replace_file_extents(trans, rc, root, leaf,
1690 &inode_list);
1691 btrfs_tree_unlock(leaf);
1692 free_extent_buffer(leaf);
1693 BUG_ON(ret < 0);
1694 }
1695
1696 ret = walk_up_reloc_tree(reloc_root, path, &level);
1697 if (ret > 0)
1698 break;
1699
1700 BUG_ON(level == 0);
1701 /*
1702 * save the merging progress in the drop_progress.
1703 * this is OK since root refs == 1 in this case.
1704 */
1705 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1706 path->slots[level]);
1707 root_item->drop_level = level;
1708
1709 nr = trans->blocks_used;
1710 btrfs_end_transaction(trans, root);
1711
1712 btrfs_btree_balance_dirty(root, nr);
1713
1714 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1715 invalidate_extent_cache(root, &key, &next_key);
1716 }
1717
1718 /*
1719 * handle the case only one block in the fs tree need to be
1720 * relocated and the block is tree root.
1721 */
1722 leaf = btrfs_lock_root_node(root);
1723 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
1724 btrfs_tree_unlock(leaf);
1725 free_extent_buffer(leaf);
1726 if (ret < 0)
1727 err = ret;
1728out:
1729 btrfs_free_path(path);
1730
1731 if (err == 0) {
1732 memset(&root_item->drop_progress, 0,
1733 sizeof(root_item->drop_progress));
1734 root_item->drop_level = 0;
1735 btrfs_set_root_refs(root_item, 0);
1736 }
1737
1738 nr = trans->blocks_used;
1739 btrfs_end_transaction(trans, root);
1740
1741 btrfs_btree_balance_dirty(root, nr);
1742
1743 /*
1744 * put inodes while we aren't holding the tree locks
1745 */
1746 while (!list_empty(&inode_list)) {
1747 struct inodevec *ivec;
1748 ivec = list_entry(inode_list.next, struct inodevec, list);
1749 list_del(&ivec->list);
1750 while (ivec->nr > 0) {
1751 ivec->nr--;
1752 iput(ivec->inode[ivec->nr]);
1753 }
1754 kfree(ivec);
1755 }
1756
1757 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1758 invalidate_extent_cache(root, &key, &next_key);
1759
1760 return err;
1761}
1762
1763/*
1764 * callback for the work threads.
1765 * this function merges reloc tree with corresponding fs tree,
1766 * and then drops the reloc tree.
1767 */
1768static void merge_func(struct btrfs_work *work)
1769{
1770 struct btrfs_trans_handle *trans;
1771 struct btrfs_root *root;
1772 struct btrfs_root *reloc_root;
1773 struct async_merge *async;
1774
1775 async = container_of(work, struct async_merge, work);
1776 reloc_root = async->root;
1777
1778 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1779 root = read_fs_root(reloc_root->fs_info,
1780 reloc_root->root_key.offset);
1781 BUG_ON(IS_ERR(root));
1782 BUG_ON(root->reloc_root != reloc_root);
1783
1784 merge_reloc_root(async->rc, root);
1785
1786 trans = btrfs_start_transaction(root, 1);
1787 btrfs_update_reloc_root(trans, root);
1788 btrfs_end_transaction(trans, root);
1789 }
1790
1791 btrfs_drop_dead_root(reloc_root);
1792
1793 if (atomic_dec_and_test(async->num_pending))
1794 complete(async->done);
1795
1796 kfree(async);
1797}
1798
1799static int merge_reloc_roots(struct reloc_control *rc)
1800{
1801 struct async_merge *async;
1802 struct btrfs_root *root;
1803 struct completion done;
1804 atomic_t num_pending;
1805
1806 init_completion(&done);
1807 atomic_set(&num_pending, 1);
1808
1809 while (!list_empty(&rc->reloc_roots)) {
1810 root = list_entry(rc->reloc_roots.next,
1811 struct btrfs_root, root_list);
1812 list_del_init(&root->root_list);
1813
1814 async = kmalloc(sizeof(*async), GFP_NOFS);
1815 BUG_ON(!async);
1816 async->work.func = merge_func;
1817 async->work.flags = 0;
1818 async->rc = rc;
1819 async->root = root;
1820 async->done = &done;
1821 async->num_pending = &num_pending;
1822 atomic_inc(&num_pending);
1823 btrfs_queue_worker(&rc->workers, &async->work);
1824 }
1825
1826 if (!atomic_dec_and_test(&num_pending))
1827 wait_for_completion(&done);
1828
1829 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1830 return 0;
1831}
1832
1833static void free_block_list(struct rb_root *blocks)
1834{
1835 struct tree_block *block;
1836 struct rb_node *rb_node;
1837 while ((rb_node = rb_first(blocks))) {
1838 block = rb_entry(rb_node, struct tree_block, rb_node);
1839 rb_erase(rb_node, blocks);
1840 kfree(block);
1841 }
1842}
1843
1844static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1845 struct btrfs_root *reloc_root)
1846{
1847 struct btrfs_root *root;
1848
1849 if (reloc_root->last_trans == trans->transid)
1850 return 0;
1851
1852 root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
1853 BUG_ON(IS_ERR(root));
1854 BUG_ON(root->reloc_root != reloc_root);
1855
1856 return btrfs_record_root_in_trans(trans, root);
1857}
1858
1859/*
1860 * select one tree from trees that references the block.
1861 * for blocks in refernce counted trees, we preper reloc tree.
1862 * if no reloc tree found and reloc_only is true, NULL is returned.
1863 */
1864static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1865 struct backref_node *node,
1866 struct backref_edge *edges[],
1867 int *nr, int reloc_only)
1868{
1869 struct backref_node *next;
1870 struct btrfs_root *root;
1871 int index;
1872 int loop = 0;
1873again:
1874 index = 0;
1875 next = node;
1876 while (1) {
1877 cond_resched();
1878 next = walk_up_backref(next, edges, &index);
1879 root = next->root;
1880 if (!root) {
1881 BUG_ON(!node->old_root);
1882 goto skip;
1883 }
1884
1885 /* no other choice for non-refernce counted tree */
1886 if (!root->ref_cows) {
1887 BUG_ON(reloc_only);
1888 break;
1889 }
1890
1891 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1892 record_reloc_root_in_trans(trans, root);
1893 break;
1894 }
1895
1896 if (loop) {
1897 btrfs_record_root_in_trans(trans, root);
1898 break;
1899 }
1900
1901 if (reloc_only || next != node) {
1902 if (!root->reloc_root)
1903 btrfs_record_root_in_trans(trans, root);
1904 root = root->reloc_root;
1905 /*
1906 * if the reloc tree was created in current
1907 * transation, there is no node in backref tree
1908 * corresponds to the root of the reloc tree.
1909 */
1910 if (btrfs_root_last_snapshot(&root->root_item) ==
1911 trans->transid - 1)
1912 break;
1913 }
1914skip:
1915 root = NULL;
1916 next = walk_down_backref(edges, &index);
1917 if (!next || next->level <= node->level)
1918 break;
1919 }
1920
1921 if (!root && !loop && !reloc_only) {
1922 loop = 1;
1923 goto again;
1924 }
1925
1926 if (root)
1927 *nr = index;
1928 else
1929 *nr = 0;
1930
1931 return root;
1932}
1933
1934static noinline_for_stack
1935struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1936 struct backref_node *node)
1937{
1938 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1939 int nr;
1940 return __select_one_root(trans, node, edges, &nr, 0);
1941}
1942
1943static noinline_for_stack
1944struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1945 struct backref_node *node,
1946 struct backref_edge *edges[], int *nr)
1947{
1948 return __select_one_root(trans, node, edges, nr, 1);
1949}
1950
1951static void grab_path_buffers(struct btrfs_path *path,
1952 struct backref_node *node,
1953 struct backref_edge *edges[], int nr)
1954{
1955 int i = 0;
1956 while (1) {
1957 drop_node_buffer(node);
1958 node->eb = path->nodes[node->level];
1959 BUG_ON(!node->eb);
1960 if (path->locks[node->level])
1961 node->locked = 1;
1962 path->nodes[node->level] = NULL;
1963 path->locks[node->level] = 0;
1964
1965 if (i >= nr)
1966 break;
1967
1968 edges[i]->blockptr = node->eb->start;
1969 node = edges[i]->node[UPPER];
1970 i++;
1971 }
1972}
1973
1974/*
1975 * relocate a block tree, and then update pointers in upper level
1976 * blocks that reference the block to point to the new location.
1977 *
1978 * if called by link_to_upper, the block has already been relocated.
1979 * in that case this function just updates pointers.
1980 */
1981static int do_relocation(struct btrfs_trans_handle *trans,
1982 struct backref_node *node,
1983 struct btrfs_key *key,
1984 struct btrfs_path *path, int lowest)
1985{
1986 struct backref_node *upper;
1987 struct backref_edge *edge;
1988 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1989 struct btrfs_root *root;
1990 struct extent_buffer *eb;
1991 u32 blocksize;
1992 u64 bytenr;
1993 u64 generation;
1994 int nr;
1995 int slot;
1996 int ret;
1997 int err = 0;
1998
1999 BUG_ON(lowest && node->eb);
2000
2001 path->lowest_level = node->level + 1;
2002 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2003 cond_resched();
2004 if (node->eb && node->eb->start == edge->blockptr)
2005 continue;
2006
2007 upper = edge->node[UPPER];
2008 root = select_reloc_root(trans, upper, edges, &nr);
2009 if (!root)
2010 continue;
2011
2012 if (upper->eb && !upper->locked)
2013 drop_node_buffer(upper);
2014
2015 if (!upper->eb) {
2016 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2017 if (ret < 0) {
2018 err = ret;
2019 break;
2020 }
2021 BUG_ON(ret > 0);
2022
2023 slot = path->slots[upper->level];
2024
2025 btrfs_unlock_up_safe(path, upper->level + 1);
2026 grab_path_buffers(path, upper, edges, nr);
2027
2028 btrfs_release_path(NULL, path);
2029 } else {
2030 ret = btrfs_bin_search(upper->eb, key, upper->level,
2031 &slot);
2032 BUG_ON(ret);
2033 }
2034
2035 bytenr = btrfs_node_blockptr(upper->eb, slot);
2036 if (!lowest) {
2037 if (node->eb->start == bytenr) {
2038 btrfs_tree_unlock(upper->eb);
2039 upper->locked = 0;
2040 continue;
2041 }
2042 } else {
2043 BUG_ON(node->bytenr != bytenr);
2044 }
2045
2046 blocksize = btrfs_level_size(root, node->level);
2047 generation = btrfs_node_ptr_generation(upper->eb, slot);
2048 eb = read_tree_block(root, bytenr, blocksize, generation);
2049 btrfs_tree_lock(eb);
2050 btrfs_set_lock_blocking(eb);
2051
2052 if (!node->eb) {
2053 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2054 slot, &eb);
2055 if (ret < 0) {
2056 err = ret;
2057 break;
2058 }
2059 btrfs_set_lock_blocking(eb);
2060 node->eb = eb;
2061 node->locked = 1;
2062 } else {
2063 btrfs_set_node_blockptr(upper->eb, slot,
2064 node->eb->start);
2065 btrfs_set_node_ptr_generation(upper->eb, slot,
2066 trans->transid);
2067 btrfs_mark_buffer_dirty(upper->eb);
2068
2069 ret = btrfs_inc_extent_ref(trans, root,
2070 node->eb->start, blocksize,
2071 upper->eb->start,
2072 btrfs_header_owner(upper->eb),
2073 node->level, 0);
2074 BUG_ON(ret);
2075
2076 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2077 BUG_ON(ret);
2078
2079 btrfs_tree_unlock(eb);
2080 free_extent_buffer(eb);
2081 }
2082 if (!lowest) {
2083 btrfs_tree_unlock(upper->eb);
2084 upper->locked = 0;
2085 }
2086 }
2087 path->lowest_level = 0;
2088 return err;
2089}
2090
2091static int link_to_upper(struct btrfs_trans_handle *trans,
2092 struct backref_node *node,
2093 struct btrfs_path *path)
2094{
2095 struct btrfs_key key;
2096 if (!node->eb || list_empty(&node->upper))
2097 return 0;
2098
2099 btrfs_node_key_to_cpu(node->eb, &key, 0);
2100 return do_relocation(trans, node, &key, path, 0);
2101}
2102
2103static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2104 struct backref_cache *cache,
2105 struct btrfs_path *path)
2106{
2107 struct backref_node *node;
2108 int level;
2109 int ret;
2110 int err = 0;
2111
2112 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2113 while (!list_empty(&cache->pending[level])) {
2114 node = list_entry(cache->pending[level].next,
2115 struct backref_node, lower);
2116 BUG_ON(node->level != level);
2117
2118 ret = link_to_upper(trans, node, path);
2119 if (ret < 0)
2120 err = ret;
2121 /*
2122 * this remove the node from the pending list and
2123 * may add some other nodes to the level + 1
2124 * pending list
2125 */
2126 remove_backref_node(cache, node);
2127 }
2128 }
2129 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2130 return err;
2131}
2132
2133static void mark_block_processed(struct reloc_control *rc,
2134 struct backref_node *node)
2135{
2136 u32 blocksize;
2137 if (node->level == 0 ||
2138 in_block_group(node->bytenr, rc->block_group)) {
2139 blocksize = btrfs_level_size(rc->extent_root, node->level);
2140 set_extent_bits(&rc->processed_blocks, node->bytenr,
2141 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2142 GFP_NOFS);
2143 }
2144 node->processed = 1;
2145}
2146
2147/*
2148 * mark a block and all blocks directly/indirectly reference the block
2149 * as processed.
2150 */
2151static void update_processed_blocks(struct reloc_control *rc,
2152 struct backref_node *node)
2153{
2154 struct backref_node *next = node;
2155 struct backref_edge *edge;
2156 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2157 int index = 0;
2158
2159 while (next) {
2160 cond_resched();
2161 while (1) {
2162 if (next->processed)
2163 break;
2164
2165 mark_block_processed(rc, next);
2166
2167 if (list_empty(&next->upper))
2168 break;
2169
2170 edge = list_entry(next->upper.next,
2171 struct backref_edge, list[LOWER]);
2172 edges[index++] = edge;
2173 next = edge->node[UPPER];
2174 }
2175 next = walk_down_backref(edges, &index);
2176 }
2177}
2178
2179static int tree_block_processed(u64 bytenr, u32 blocksize,
2180 struct reloc_control *rc)
2181{
2182 if (test_range_bit(&rc->processed_blocks, bytenr,
2183 bytenr + blocksize - 1, EXTENT_DIRTY, 1))
2184 return 1;
2185 return 0;
2186}
2187
2188/*
2189 * check if there are any file extent pointers in the leaf point to
2190 * data require processing
2191 */
2192static int check_file_extents(struct reloc_control *rc,
2193 u64 bytenr, u32 blocksize, u64 ptr_gen)
2194{
2195 struct btrfs_key found_key;
2196 struct btrfs_file_extent_item *fi;
2197 struct extent_buffer *leaf;
2198 u32 nritems;
2199 int i;
2200 int ret = 0;
2201
2202 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2203
2204 nritems = btrfs_header_nritems(leaf);
2205 for (i = 0; i < nritems; i++) {
2206 cond_resched();
2207 btrfs_item_key_to_cpu(leaf, &found_key, i);
2208 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2209 continue;
2210 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2211 if (btrfs_file_extent_type(leaf, fi) ==
2212 BTRFS_FILE_EXTENT_INLINE)
2213 continue;
2214 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2215 if (bytenr == 0)
2216 continue;
2217 if (in_block_group(bytenr, rc->block_group)) {
2218 ret = 1;
2219 break;
2220 }
2221 }
2222 free_extent_buffer(leaf);
2223 return ret;
2224}
2225
2226/*
2227 * scan child blocks of a given block to find blocks require processing
2228 */
2229static int add_child_blocks(struct btrfs_trans_handle *trans,
2230 struct reloc_control *rc,
2231 struct backref_node *node,
2232 struct rb_root *blocks)
2233{
2234 struct tree_block *block;
2235 struct rb_node *rb_node;
2236 u64 bytenr;
2237 u64 ptr_gen;
2238 u32 blocksize;
2239 u32 nritems;
2240 int i;
2241 int err = 0;
2242
2243 nritems = btrfs_header_nritems(node->eb);
2244 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2245 for (i = 0; i < nritems; i++) {
2246 cond_resched();
2247 bytenr = btrfs_node_blockptr(node->eb, i);
2248 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2249 if (ptr_gen == trans->transid)
2250 continue;
2251 if (!in_block_group(bytenr, rc->block_group) &&
2252 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2253 continue;
2254 if (tree_block_processed(bytenr, blocksize, rc))
2255 continue;
2256
2257 readahead_tree_block(rc->extent_root,
2258 bytenr, blocksize, ptr_gen);
2259 }
2260
2261 for (i = 0; i < nritems; i++) {
2262 cond_resched();
2263 bytenr = btrfs_node_blockptr(node->eb, i);
2264 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2265 if (ptr_gen == trans->transid)
2266 continue;
2267 if (!in_block_group(bytenr, rc->block_group) &&
2268 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2269 continue;
2270 if (tree_block_processed(bytenr, blocksize, rc))
2271 continue;
2272 if (!in_block_group(bytenr, rc->block_group) &&
2273 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2274 continue;
2275
2276 block = kmalloc(sizeof(*block), GFP_NOFS);
2277 if (!block) {
2278 err = -ENOMEM;
2279 break;
2280 }
2281 block->bytenr = bytenr;
2282 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2283 block->level = node->level - 1;
2284 block->key_ready = 1;
2285 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2286 BUG_ON(rb_node);
2287 }
2288 if (err)
2289 free_block_list(blocks);
2290 return err;
2291}
2292
2293/*
2294 * find adjacent blocks require processing
2295 */
2296static noinline_for_stack
2297int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2298 struct reloc_control *rc,
2299 struct backref_cache *cache,
2300 struct rb_root *blocks, int level,
2301 struct backref_node **upper)
2302{
2303 struct backref_node *node;
2304 int ret = 0;
2305
2306 WARN_ON(!list_empty(&cache->pending[level]));
2307
2308 if (list_empty(&cache->pending[level + 1]))
2309 return 1;
2310
2311 node = list_entry(cache->pending[level + 1].next,
2312 struct backref_node, lower);
2313 if (node->eb)
2314 ret = add_child_blocks(trans, rc, node, blocks);
2315
2316 *upper = node;
2317 return ret;
2318}
2319
2320static int get_tree_block_key(struct reloc_control *rc,
2321 struct tree_block *block)
2322{
2323 struct extent_buffer *eb;
2324
2325 BUG_ON(block->key_ready);
2326 eb = read_tree_block(rc->extent_root, block->bytenr,
2327 block->key.objectid, block->key.offset);
2328 WARN_ON(btrfs_header_level(eb) != block->level);
2329 if (block->level == 0)
2330 btrfs_item_key_to_cpu(eb, &block->key, 0);
2331 else
2332 btrfs_node_key_to_cpu(eb, &block->key, 0);
2333 free_extent_buffer(eb);
2334 block->key_ready = 1;
2335 return 0;
2336}
2337
2338static int reada_tree_block(struct reloc_control *rc,
2339 struct tree_block *block)
2340{
2341 BUG_ON(block->key_ready);
2342 readahead_tree_block(rc->extent_root, block->bytenr,
2343 block->key.objectid, block->key.offset);
2344 return 0;
2345}
2346
2347/*
2348 * helper function to relocate a tree block
2349 */
2350static int relocate_tree_block(struct btrfs_trans_handle *trans,
2351 struct reloc_control *rc,
2352 struct backref_node *node,
2353 struct btrfs_key *key,
2354 struct btrfs_path *path)
2355{
2356 struct btrfs_root *root;
2357 int ret;
2358
2359 root = select_one_root(trans, node);
2360 if (unlikely(!root)) {
2361 rc->found_old_snapshot = 1;
2362 update_processed_blocks(rc, node);
2363 return 0;
2364 }
2365
2366 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2367 ret = do_relocation(trans, node, key, path, 1);
2368 if (ret < 0)
2369 goto out;
2370 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2371 ret = replace_file_extents(trans, rc, root,
2372 node->eb, NULL);
2373 if (ret < 0)
2374 goto out;
2375 }
2376 drop_node_buffer(node);
2377 } else if (!root->ref_cows) {
2378 path->lowest_level = node->level;
2379 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2380 btrfs_release_path(root, path);
2381 if (ret < 0)
2382 goto out;
2383 } else if (root != node->root) {
2384 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2385 }
2386
2387 update_processed_blocks(rc, node);
2388 ret = 0;
2389out:
2390 drop_node_buffer(node);
2391 return ret;
2392}
2393
2394/*
2395 * relocate a list of blocks
2396 */
2397static noinline_for_stack
2398int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2399 struct reloc_control *rc, struct rb_root *blocks)
2400{
2401 struct backref_cache *cache;
2402 struct backref_node *node;
2403 struct btrfs_path *path;
2404 struct tree_block *block;
2405 struct rb_node *rb_node;
2406 int level = -1;
2407 int ret;
2408 int err = 0;
2409
2410 path = btrfs_alloc_path();
2411 if (!path)
2412 return -ENOMEM;
2413
2414 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2415 if (!cache) {
2416 btrfs_free_path(path);
2417 return -ENOMEM;
2418 }
2419
2420 backref_cache_init(cache);
2421
2422 rb_node = rb_first(blocks);
2423 while (rb_node) {
2424 block = rb_entry(rb_node, struct tree_block, rb_node);
2425 if (level == -1)
2426 level = block->level;
2427 else
2428 BUG_ON(level != block->level);
2429 if (!block->key_ready)
2430 reada_tree_block(rc, block);
2431 rb_node = rb_next(rb_node);
2432 }
2433
2434 rb_node = rb_first(blocks);
2435 while (rb_node) {
2436 block = rb_entry(rb_node, struct tree_block, rb_node);
2437 if (!block->key_ready)
2438 get_tree_block_key(rc, block);
2439 rb_node = rb_next(rb_node);
2440 }
2441
2442 rb_node = rb_first(blocks);
2443 while (rb_node) {
2444 block = rb_entry(rb_node, struct tree_block, rb_node);
2445
2446 node = build_backref_tree(rc, cache, &block->key,
2447 block->level, block->bytenr);
2448 if (IS_ERR(node)) {
2449 err = PTR_ERR(node);
2450 goto out;
2451 }
2452
2453 ret = relocate_tree_block(trans, rc, node, &block->key,
2454 path);
2455 if (ret < 0) {
2456 err = ret;
2457 goto out;
2458 }
2459 remove_backref_node(cache, node);
2460 rb_node = rb_next(rb_node);
2461 }
2462
2463 if (level > 0)
2464 goto out;
2465
2466 free_block_list(blocks);
2467
2468 /*
2469 * now backrefs of some upper level tree blocks have been cached,
2470 * try relocating blocks referenced by these upper level blocks.
2471 */
2472 while (1) {
2473 struct backref_node *upper = NULL;
2474 if (trans->transaction->in_commit ||
2475 trans->transaction->delayed_refs.flushing)
2476 break;
2477
2478 ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
2479 &upper);
2480 if (ret < 0)
2481 err = ret;
2482 if (ret != 0)
2483 break;
2484
2485 rb_node = rb_first(blocks);
2486 while (rb_node) {
2487 block = rb_entry(rb_node, struct tree_block, rb_node);
2488 if (trans->transaction->in_commit ||
2489 trans->transaction->delayed_refs.flushing)
2490 goto out;
2491 BUG_ON(!block->key_ready);
2492 node = build_backref_tree(rc, cache, &block->key,
2493 level, block->bytenr);
2494 if (IS_ERR(node)) {
2495 err = PTR_ERR(node);
2496 goto out;
2497 }
2498
2499 ret = relocate_tree_block(trans, rc, node,
2500 &block->key, path);
2501 if (ret < 0) {
2502 err = ret;
2503 goto out;
2504 }
2505 remove_backref_node(cache, node);
2506 rb_node = rb_next(rb_node);
2507 }
2508 free_block_list(blocks);
2509
2510 if (upper) {
2511 ret = link_to_upper(trans, upper, path);
2512 if (ret < 0) {
2513 err = ret;
2514 break;
2515 }
2516 remove_backref_node(cache, upper);
2517 }
2518 }
2519out:
2520 free_block_list(blocks);
2521
2522 ret = finish_pending_nodes(trans, cache, path);
2523 if (ret < 0)
2524 err = ret;
2525
2526 kfree(cache);
2527 btrfs_free_path(path);
2528 return err;
2529}
2530
2531static noinline_for_stack
2532int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
2533{
2534 u64 page_start;
2535 u64 page_end;
2536 unsigned long i;
2537 unsigned long first_index;
2538 unsigned long last_index;
2539 unsigned int total_read = 0;
2540 unsigned int total_dirty = 0;
2541 struct page *page;
2542 struct file_ra_state *ra;
2543 struct btrfs_ordered_extent *ordered;
2544 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2545 int ret = 0;
2546
2547 ra = kzalloc(sizeof(*ra), GFP_NOFS);
2548 if (!ra)
2549 return -ENOMEM;
2550
2551 mutex_lock(&inode->i_mutex);
2552 first_index = start >> PAGE_CACHE_SHIFT;
2553 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2554
2555 /* make sure the dirty trick played by the caller work */
2556 ret = invalidate_inode_pages2_range(inode->i_mapping,
2557 first_index, last_index);
2558 if (ret)
2559 goto out_unlock;
2560
2561 file_ra_state_init(ra, inode->i_mapping);
2562
2563 for (i = first_index ; i <= last_index; i++) {
2564 if (total_read % ra->ra_pages == 0) {
2565 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
2566 min(last_index, ra->ra_pages + i - 1));
2567 }
2568 total_read++;
2569again:
2570 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
2571 BUG_ON(1);
2572 page = grab_cache_page(inode->i_mapping, i);
2573 if (!page) {
2574 ret = -ENOMEM;
2575 goto out_unlock;
2576 }
2577 if (!PageUptodate(page)) {
2578 btrfs_readpage(NULL, page);
2579 lock_page(page);
2580 if (!PageUptodate(page)) {
2581 unlock_page(page);
2582 page_cache_release(page);
2583 ret = -EIO;
2584 goto out_unlock;
2585 }
2586 }
2587 wait_on_page_writeback(page);
2588
2589 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2590 page_end = page_start + PAGE_CACHE_SIZE - 1;
2591 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2592
2593 ordered = btrfs_lookup_ordered_extent(inode, page_start);
2594 if (ordered) {
2595 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2596 unlock_page(page);
2597 page_cache_release(page);
2598 btrfs_start_ordered_extent(inode, ordered, 1);
2599 btrfs_put_ordered_extent(ordered);
2600 goto again;
2601 }
2602 set_page_extent_mapped(page);
2603
2604 if (i == first_index)
2605 set_extent_bits(io_tree, page_start, page_end,
2606 EXTENT_BOUNDARY, GFP_NOFS);
2607 btrfs_set_extent_delalloc(inode, page_start, page_end);
2608
2609 set_page_dirty(page);
2610 total_dirty++;
2611
2612 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2613 unlock_page(page);
2614 page_cache_release(page);
2615 }
2616out_unlock:
2617 mutex_unlock(&inode->i_mutex);
2618 kfree(ra);
2619 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
2620 return ret;
2621}
2622
2623static noinline_for_stack
2624int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
2625{
2626 struct btrfs_root *root = BTRFS_I(inode)->root;
2627 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2628 struct extent_map *em;
2629 u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
2630 u64 end = start + extent_key->offset - 1;
2631
2632 em = alloc_extent_map(GFP_NOFS);
2633 em->start = start;
2634 em->len = extent_key->offset;
2635 em->block_len = extent_key->offset;
2636 em->block_start = extent_key->objectid;
2637 em->bdev = root->fs_info->fs_devices->latest_bdev;
2638 set_bit(EXTENT_FLAG_PINNED, &em->flags);
2639
2640 /* setup extent map to cheat btrfs_readpage */
2641 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2642 while (1) {
2643 int ret;
2644 spin_lock(&em_tree->lock);
2645 ret = add_extent_mapping(em_tree, em);
2646 spin_unlock(&em_tree->lock);
2647 if (ret != -EEXIST) {
2648 free_extent_map(em);
2649 break;
2650 }
2651 btrfs_drop_extent_cache(inode, start, end, 0);
2652 }
2653 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2654
2655 return relocate_inode_pages(inode, start, extent_key->offset);
2656}
2657
2658#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2659static int get_ref_objectid_v0(struct reloc_control *rc,
2660 struct btrfs_path *path,
2661 struct btrfs_key *extent_key,
2662 u64 *ref_objectid, int *path_change)
2663{
2664 struct btrfs_key key;
2665 struct extent_buffer *leaf;
2666 struct btrfs_extent_ref_v0 *ref0;
2667 int ret;
2668 int slot;
2669
2670 leaf = path->nodes[0];
2671 slot = path->slots[0];
2672 while (1) {
2673 if (slot >= btrfs_header_nritems(leaf)) {
2674 ret = btrfs_next_leaf(rc->extent_root, path);
2675 if (ret < 0)
2676 return ret;
2677 BUG_ON(ret > 0);
2678 leaf = path->nodes[0];
2679 slot = path->slots[0];
2680 if (path_change)
2681 *path_change = 1;
2682 }
2683 btrfs_item_key_to_cpu(leaf, &key, slot);
2684 if (key.objectid != extent_key->objectid)
2685 return -ENOENT;
2686
2687 if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
2688 slot++;
2689 continue;
2690 }
2691 ref0 = btrfs_item_ptr(leaf, slot,
2692 struct btrfs_extent_ref_v0);
2693 *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
2694 break;
2695 }
2696 return 0;
2697}
2698#endif
2699
2700/*
2701 * helper to add a tree block to the list.
2702 * the major work is getting the generation and level of the block
2703 */
2704static int add_tree_block(struct reloc_control *rc,
2705 struct btrfs_key *extent_key,
2706 struct btrfs_path *path,
2707 struct rb_root *blocks)
2708{
2709 struct extent_buffer *eb;
2710 struct btrfs_extent_item *ei;
2711 struct btrfs_tree_block_info *bi;
2712 struct tree_block *block;
2713 struct rb_node *rb_node;
2714 u32 item_size;
2715 int level = -1;
2716 int generation;
2717
2718 eb = path->nodes[0];
2719 item_size = btrfs_item_size_nr(eb, path->slots[0]);
2720
2721 if (item_size >= sizeof(*ei) + sizeof(*bi)) {
2722 ei = btrfs_item_ptr(eb, path->slots[0],
2723 struct btrfs_extent_item);
2724 bi = (struct btrfs_tree_block_info *)(ei + 1);
2725 generation = btrfs_extent_generation(eb, ei);
2726 level = btrfs_tree_block_level(eb, bi);
2727 } else {
2728#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2729 u64 ref_owner;
2730 int ret;
2731
2732 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2733 ret = get_ref_objectid_v0(rc, path, extent_key,
2734 &ref_owner, NULL);
2735 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
2736 level = (int)ref_owner;
2737 /* FIXME: get real generation */
2738 generation = 0;
2739#else
2740 BUG();
2741#endif
2742 }
2743
2744 btrfs_release_path(rc->extent_root, path);
2745
2746 BUG_ON(level == -1);
2747
2748 block = kmalloc(sizeof(*block), GFP_NOFS);
2749 if (!block)
2750 return -ENOMEM;
2751
2752 block->bytenr = extent_key->objectid;
2753 block->key.objectid = extent_key->offset;
2754 block->key.offset = generation;
2755 block->level = level;
2756 block->key_ready = 0;
2757
2758 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2759 BUG_ON(rb_node);
2760
2761 return 0;
2762}
2763
2764/*
2765 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
2766 */
2767static int __add_tree_block(struct reloc_control *rc,
2768 u64 bytenr, u32 blocksize,
2769 struct rb_root *blocks)
2770{
2771 struct btrfs_path *path;
2772 struct btrfs_key key;
2773 int ret;
2774
2775 if (tree_block_processed(bytenr, blocksize, rc))
2776 return 0;
2777
2778 if (tree_search(blocks, bytenr))
2779 return 0;
2780
2781 path = btrfs_alloc_path();
2782 if (!path)
2783 return -ENOMEM;
2784
2785 key.objectid = bytenr;
2786 key.type = BTRFS_EXTENT_ITEM_KEY;
2787 key.offset = blocksize;
2788
2789 path->search_commit_root = 1;
2790 path->skip_locking = 1;
2791 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2792 if (ret < 0)
2793 goto out;
2794 BUG_ON(ret);
2795
2796 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2797 ret = add_tree_block(rc, &key, path, blocks);
2798out:
2799 btrfs_free_path(path);
2800 return ret;
2801}
2802
2803/*
2804 * helper to check if the block use full backrefs for pointers in it
2805 */
2806static int block_use_full_backref(struct reloc_control *rc,
2807 struct extent_buffer *eb)
2808{
2809 struct btrfs_path *path;
2810 struct btrfs_extent_item *ei;
2811 struct btrfs_key key;
2812 u64 flags;
2813 int ret;
2814
2815 if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
2816 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2817 return 1;
2818
2819 path = btrfs_alloc_path();
2820 BUG_ON(!path);
2821
2822 key.objectid = eb->start;
2823 key.type = BTRFS_EXTENT_ITEM_KEY;
2824 key.offset = eb->len;
2825
2826 path->search_commit_root = 1;
2827 path->skip_locking = 1;
2828 ret = btrfs_search_slot(NULL, rc->extent_root,
2829 &key, path, 0, 0);
2830 BUG_ON(ret);
2831
2832 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2833 struct btrfs_extent_item);
2834 flags = btrfs_extent_flags(path->nodes[0], ei);
2835 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2836 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2837 ret = 1;
2838 else
2839 ret = 0;
2840 btrfs_free_path(path);
2841 return ret;
2842}
2843
2844/*
2845 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
2846 * this function scans fs tree to find blocks reference the data extent
2847 */
2848static int find_data_references(struct reloc_control *rc,
2849 struct btrfs_key *extent_key,
2850 struct extent_buffer *leaf,
2851 struct btrfs_extent_data_ref *ref,
2852 struct rb_root *blocks)
2853{
2854 struct btrfs_path *path;
2855 struct tree_block *block;
2856 struct btrfs_root *root;
2857 struct btrfs_file_extent_item *fi;
2858 struct rb_node *rb_node;
2859 struct btrfs_key key;
2860 u64 ref_root;
2861 u64 ref_objectid;
2862 u64 ref_offset;
2863 u32 ref_count;
2864 u32 nritems;
2865 int err = 0;
2866 int added = 0;
2867 int counted;
2868 int ret;
2869
2870 path = btrfs_alloc_path();
2871 if (!path)
2872 return -ENOMEM;
2873
2874 ref_root = btrfs_extent_data_ref_root(leaf, ref);
2875 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
2876 ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
2877 ref_count = btrfs_extent_data_ref_count(leaf, ref);
2878
2879 root = read_fs_root(rc->extent_root->fs_info, ref_root);
2880 if (IS_ERR(root)) {
2881 err = PTR_ERR(root);
2882 goto out;
2883 }
2884
2885 key.objectid = ref_objectid;
2886 key.offset = ref_offset;
2887 key.type = BTRFS_EXTENT_DATA_KEY;
2888
2889 path->search_commit_root = 1;
2890 path->skip_locking = 1;
2891 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2892 if (ret < 0) {
2893 err = ret;
2894 goto out;
2895 }
2896
2897 leaf = path->nodes[0];
2898 nritems = btrfs_header_nritems(leaf);
2899 /*
2900 * the references in tree blocks that use full backrefs
2901 * are not counted in
2902 */
2903 if (block_use_full_backref(rc, leaf))
2904 counted = 0;
2905 else
2906 counted = 1;
2907 rb_node = tree_search(blocks, leaf->start);
2908 if (rb_node) {
2909 if (counted)
2910 added = 1;
2911 else
2912 path->slots[0] = nritems;
2913 }
2914
2915 while (ref_count > 0) {
2916 while (path->slots[0] >= nritems) {
2917 ret = btrfs_next_leaf(root, path);
2918 if (ret < 0) {
2919 err = ret;
2920 goto out;
2921 }
2922 if (ret > 0) {
2923 WARN_ON(1);
2924 goto out;
2925 }
2926
2927 leaf = path->nodes[0];
2928 nritems = btrfs_header_nritems(leaf);
2929 added = 0;
2930
2931 if (block_use_full_backref(rc, leaf))
2932 counted = 0;
2933 else
2934 counted = 1;
2935 rb_node = tree_search(blocks, leaf->start);
2936 if (rb_node) {
2937 if (counted)
2938 added = 1;
2939 else
2940 path->slots[0] = nritems;
2941 }
2942 }
2943
2944 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2945 if (key.objectid != ref_objectid ||
2946 key.type != BTRFS_EXTENT_DATA_KEY) {
2947 WARN_ON(1);
2948 break;
2949 }
2950
2951 fi = btrfs_item_ptr(leaf, path->slots[0],
2952 struct btrfs_file_extent_item);
2953
2954 if (btrfs_file_extent_type(leaf, fi) ==
2955 BTRFS_FILE_EXTENT_INLINE)
2956 goto next;
2957
2958 if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
2959 extent_key->objectid)
2960 goto next;
2961
2962 key.offset -= btrfs_file_extent_offset(leaf, fi);
2963 if (key.offset != ref_offset)
2964 goto next;
2965
2966 if (counted)
2967 ref_count--;
2968 if (added)
2969 goto next;
2970
2971 if (!tree_block_processed(leaf->start, leaf->len, rc)) {
2972 block = kmalloc(sizeof(*block), GFP_NOFS);
2973 if (!block) {
2974 err = -ENOMEM;
2975 break;
2976 }
2977 block->bytenr = leaf->start;
2978 btrfs_item_key_to_cpu(leaf, &block->key, 0);
2979 block->level = 0;
2980 block->key_ready = 1;
2981 rb_node = tree_insert(blocks, block->bytenr,
2982 &block->rb_node);
2983 BUG_ON(rb_node);
2984 }
2985 if (counted)
2986 added = 1;
2987 else
2988 path->slots[0] = nritems;
2989next:
2990 path->slots[0]++;
2991
2992 }
2993out:
2994 btrfs_free_path(path);
2995 return err;
2996}
2997
2998/*
2999 * hepler to find all tree blocks that reference a given data extent
3000 */
3001static noinline_for_stack
3002int add_data_references(struct reloc_control *rc,
3003 struct btrfs_key *extent_key,
3004 struct btrfs_path *path,
3005 struct rb_root *blocks)
3006{
3007 struct btrfs_key key;
3008 struct extent_buffer *eb;
3009 struct btrfs_extent_data_ref *dref;
3010 struct btrfs_extent_inline_ref *iref;
3011 unsigned long ptr;
3012 unsigned long end;
3013 u32 blocksize;
3014 int ret;
3015 int err = 0;
3016
3017 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3018 extent_key->offset);
3019 BUG_ON(ret < 0);
3020 if (ret > 0) {
3021 /* the relocated data is fragmented */
3022 rc->extents_skipped++;
3023 btrfs_release_path(rc->extent_root, path);
3024 return 0;
3025 }
3026
3027 blocksize = btrfs_level_size(rc->extent_root, 0);
3028
3029 eb = path->nodes[0];
3030 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3031 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3032#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3033 if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3034 ptr = end;
3035 else
3036#endif
3037 ptr += sizeof(struct btrfs_extent_item);
3038
3039 while (ptr < end) {
3040 iref = (struct btrfs_extent_inline_ref *)ptr;
3041 key.type = btrfs_extent_inline_ref_type(eb, iref);
3042 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3043 key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3044 ret = __add_tree_block(rc, key.offset, blocksize,
3045 blocks);
3046 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3047 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3048 ret = find_data_references(rc, extent_key,
3049 eb, dref, blocks);
3050 } else {
3051 BUG();
3052 }
3053 ptr += btrfs_extent_inline_ref_size(key.type);
3054 }
3055 WARN_ON(ptr > end);
3056
3057 while (1) {
3058 cond_resched();
3059 eb = path->nodes[0];
3060 if (path->slots[0] >= btrfs_header_nritems(eb)) {
3061 ret = btrfs_next_leaf(rc->extent_root, path);
3062 if (ret < 0) {
3063 err = ret;
3064 break;
3065 }
3066 if (ret > 0)
3067 break;
3068 eb = path->nodes[0];
3069 }
3070
3071 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3072 if (key.objectid != extent_key->objectid)
3073 break;
3074
3075#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3076 if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3077 key.type == BTRFS_EXTENT_REF_V0_KEY) {
3078#else
3079 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3080 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3081#endif
3082 ret = __add_tree_block(rc, key.offset, blocksize,
3083 blocks);
3084 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3085 dref = btrfs_item_ptr(eb, path->slots[0],
3086 struct btrfs_extent_data_ref);
3087 ret = find_data_references(rc, extent_key,
3088 eb, dref, blocks);
3089 } else {
3090 ret = 0;
3091 }
3092 if (ret) {
3093 err = ret;
3094 break;
3095 }
3096 path->slots[0]++;
3097 }
3098 btrfs_release_path(rc->extent_root, path);
3099 if (err)
3100 free_block_list(blocks);
3101 return err;
3102}
3103
3104/*
3105 * hepler to find next unprocessed extent
3106 */
3107static noinline_for_stack
3108int find_next_extent(struct btrfs_trans_handle *trans,
3109 struct reloc_control *rc, struct btrfs_path *path)
3110{
3111 struct btrfs_key key;
3112 struct extent_buffer *leaf;
3113 u64 start, end, last;
3114 int ret;
3115
3116 last = rc->block_group->key.objectid + rc->block_group->key.offset;
3117 while (1) {
3118 cond_resched();
3119 if (rc->search_start >= last) {
3120 ret = 1;
3121 break;
3122 }
3123
3124 key.objectid = rc->search_start;
3125 key.type = BTRFS_EXTENT_ITEM_KEY;
3126 key.offset = 0;
3127
3128 path->search_commit_root = 1;
3129 path->skip_locking = 1;
3130 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3131 0, 0);
3132 if (ret < 0)
3133 break;
3134next:
3135 leaf = path->nodes[0];
3136 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3137 ret = btrfs_next_leaf(rc->extent_root, path);
3138 if (ret != 0)
3139 break;
3140 leaf = path->nodes[0];
3141 }
3142
3143 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3144 if (key.objectid >= last) {
3145 ret = 1;
3146 break;
3147 }
3148
3149 if (key.type != BTRFS_EXTENT_ITEM_KEY ||
3150 key.objectid + key.offset <= rc->search_start) {
3151 path->slots[0]++;
3152 goto next;
3153 }
3154
3155 ret = find_first_extent_bit(&rc->processed_blocks,
3156 key.objectid, &start, &end,
3157 EXTENT_DIRTY);
3158
3159 if (ret == 0 && start <= key.objectid) {
3160 btrfs_release_path(rc->extent_root, path);
3161 rc->search_start = end + 1;
3162 } else {
3163 rc->search_start = key.objectid + key.offset;
3164 return 0;
3165 }
3166 }
3167 btrfs_release_path(rc->extent_root, path);
3168 return ret;
3169}
3170
3171static void set_reloc_control(struct reloc_control *rc)
3172{
3173 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3174 mutex_lock(&fs_info->trans_mutex);
3175 fs_info->reloc_ctl = rc;
3176 mutex_unlock(&fs_info->trans_mutex);
3177}
3178
3179static void unset_reloc_control(struct reloc_control *rc)
3180{
3181 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3182 mutex_lock(&fs_info->trans_mutex);
3183 fs_info->reloc_ctl = NULL;
3184 mutex_unlock(&fs_info->trans_mutex);
3185}
3186
3187static int check_extent_flags(u64 flags)
3188{
3189 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3190 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3191 return 1;
3192 if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3193 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3194 return 1;
3195 if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3196 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3197 return 1;
3198 return 0;
3199}
3200
3201static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3202{
3203 struct rb_root blocks = RB_ROOT;
3204 struct btrfs_key key;
3205 struct btrfs_trans_handle *trans = NULL;
3206 struct btrfs_path *path;
3207 struct btrfs_extent_item *ei;
3208 unsigned long nr;
3209 u64 flags;
3210 u32 item_size;
3211 int ret;
3212 int err = 0;
3213
3214 path = btrfs_alloc_path();
3215 if (!path)
3216 return -ENOMEM;
3217
3218 rc->search_start = rc->block_group->key.objectid;
3219 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3220 GFP_NOFS);
3221
3222 rc->create_reloc_root = 1;
3223 set_reloc_control(rc);
3224
3225 trans = btrfs_start_transaction(rc->extent_root, 1);
3226 btrfs_commit_transaction(trans, rc->extent_root);
3227
3228 while (1) {
3229 trans = btrfs_start_transaction(rc->extent_root, 1);
3230
3231 ret = find_next_extent(trans, rc, path);
3232 if (ret < 0)
3233 err = ret;
3234 if (ret != 0)
3235 break;
3236
3237 rc->extents_found++;
3238
3239 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3240 struct btrfs_extent_item);
3241 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3242 item_size = btrfs_item_size_nr(path->nodes[0],
3243 path->slots[0]);
3244 if (item_size >= sizeof(*ei)) {
3245 flags = btrfs_extent_flags(path->nodes[0], ei);
3246 ret = check_extent_flags(flags);
3247 BUG_ON(ret);
3248
3249 } else {
3250#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3251 u64 ref_owner;
3252 int path_change = 0;
3253
3254 BUG_ON(item_size !=
3255 sizeof(struct btrfs_extent_item_v0));
3256 ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3257 &path_change);
3258 if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3259 flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3260 else
3261 flags = BTRFS_EXTENT_FLAG_DATA;
3262
3263 if (path_change) {
3264 btrfs_release_path(rc->extent_root, path);
3265
3266 path->search_commit_root = 1;
3267 path->skip_locking = 1;
3268 ret = btrfs_search_slot(NULL, rc->extent_root,
3269 &key, path, 0, 0);
3270 if (ret < 0) {
3271 err = ret;
3272 break;
3273 }
3274 BUG_ON(ret > 0);
3275 }
3276#else
3277 BUG();
3278#endif
3279 }
3280
3281 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3282 ret = add_tree_block(rc, &key, path, &blocks);
3283 } else if (rc->stage == UPDATE_DATA_PTRS &&
3284 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3285 ret = add_data_references(rc, &key, path, &blocks);
3286 } else {
3287 btrfs_release_path(rc->extent_root, path);
3288 ret = 0;
3289 }
3290 if (ret < 0) {
3291 err = 0;
3292 break;
3293 }
3294
3295 if (!RB_EMPTY_ROOT(&blocks)) {
3296 ret = relocate_tree_blocks(trans, rc, &blocks);
3297 if (ret < 0) {
3298 err = ret;
3299 break;
3300 }
3301 }
3302
3303 nr = trans->blocks_used;
3304 btrfs_end_transaction_throttle(trans, rc->extent_root);
3305 trans = NULL;
3306 btrfs_btree_balance_dirty(rc->extent_root, nr);
3307
3308 if (rc->stage == MOVE_DATA_EXTENTS &&
3309 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3310 rc->found_file_extent = 1;
3311 ret = relocate_data_extent(rc->data_inode, &key);
3312 if (ret < 0) {
3313 err = ret;
3314 break;
3315 }
3316 }
3317 }
3318 btrfs_free_path(path);
3319
3320 if (trans) {
3321 nr = trans->blocks_used;
3322 btrfs_end_transaction(trans, rc->extent_root);
3323 btrfs_btree_balance_dirty(rc->extent_root, nr);
3324 }
3325
3326 rc->create_reloc_root = 0;
3327 smp_mb();
3328
3329 if (rc->extents_found > 0) {
3330 trans = btrfs_start_transaction(rc->extent_root, 1);
3331 btrfs_commit_transaction(trans, rc->extent_root);
3332 }
3333
3334 merge_reloc_roots(rc);
3335
3336 unset_reloc_control(rc);
3337
3338 /* get rid of pinned extents */
3339 trans = btrfs_start_transaction(rc->extent_root, 1);
3340 btrfs_commit_transaction(trans, rc->extent_root);
3341
3342 return err;
3343}
3344
3345static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3346 struct btrfs_root *root,
3347 u64 objectid, u64 size)
3348{
3349 struct btrfs_path *path;
3350 struct btrfs_inode_item *item;
3351 struct extent_buffer *leaf;
3352 int ret;
3353
3354 path = btrfs_alloc_path();
3355 if (!path)
3356 return -ENOMEM;
3357
3358 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3359 if (ret)
3360 goto out;
3361
3362 leaf = path->nodes[0];
3363 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3364 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3365 btrfs_set_inode_generation(leaf, item, 1);
3366 btrfs_set_inode_size(leaf, item, size);
3367 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3368 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
3369 btrfs_mark_buffer_dirty(leaf);
3370 btrfs_release_path(root, path);
3371out:
3372 btrfs_free_path(path);
3373 return ret;
3374}
3375
3376/*
3377 * helper to create inode for data relocation.
3378 * the inode is in data relocation tree and its link count is 0
3379 */
3380static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3381 struct btrfs_block_group_cache *group)
3382{
3383 struct inode *inode = NULL;
3384 struct btrfs_trans_handle *trans;
3385 struct btrfs_root *root;
3386 struct btrfs_key key;
3387 unsigned long nr;
3388 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3389 int err = 0;
3390
3391 root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
3392 if (IS_ERR(root))
3393 return ERR_CAST(root);
3394
3395 trans = btrfs_start_transaction(root, 1);
3396 BUG_ON(!trans);
3397
3398 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3399 if (err)
3400 goto out;
3401
3402 err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
3403 BUG_ON(err);
3404
3405 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
3406 group->key.offset, 0, group->key.offset,
3407 0, 0, 0);
3408 BUG_ON(err);
3409
3410 key.objectid = objectid;
3411 key.type = BTRFS_INODE_ITEM_KEY;
3412 key.offset = 0;
3413 inode = btrfs_iget(root->fs_info->sb, &key, root);
3414 BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3415 BTRFS_I(inode)->index_cnt = group->key.objectid;
3416
3417 err = btrfs_orphan_add(trans, inode);
3418out:
3419 nr = trans->blocks_used;
3420 btrfs_end_transaction(trans, root);
3421
3422 btrfs_btree_balance_dirty(root, nr);
3423 if (err) {
3424 if (inode)
3425 iput(inode);
3426 inode = ERR_PTR(err);
3427 }
3428 return inode;
3429}
3430
3431/*
3432 * function to relocate all extents in a block group.
3433 */
3434int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3435{
3436 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3437 struct reloc_control *rc;
3438 int ret;
3439 int err = 0;
3440
3441 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3442 if (!rc)
3443 return -ENOMEM;
3444
3445 mapping_tree_init(&rc->reloc_root_tree);
3446 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3447 INIT_LIST_HEAD(&rc->reloc_roots);
3448
3449 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3450 BUG_ON(!rc->block_group);
3451
3452 btrfs_init_workers(&rc->workers, "relocate",
3453 fs_info->thread_pool_size);
3454
3455 rc->extent_root = extent_root;
3456 btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
3457
3458 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3459 if (IS_ERR(rc->data_inode)) {
3460 err = PTR_ERR(rc->data_inode);
3461 rc->data_inode = NULL;
3462 goto out;
3463 }
3464
3465 printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
3466 (unsigned long long)rc->block_group->key.objectid,
3467 (unsigned long long)rc->block_group->flags);
3468
3469 btrfs_start_delalloc_inodes(fs_info->tree_root);
3470 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
3471
3472 while (1) {
3473 mutex_lock(&fs_info->cleaner_mutex);
3474 btrfs_clean_old_snapshots(fs_info->tree_root);
3475 mutex_unlock(&fs_info->cleaner_mutex);
3476
3477 rc->extents_found = 0;
3478 rc->extents_skipped = 0;
3479
3480 ret = relocate_block_group(rc);
3481 if (ret < 0) {
3482 err = ret;
3483 break;
3484 }
3485
3486 if (rc->extents_found == 0)
3487 break;
3488
3489 printk(KERN_INFO "btrfs: found %llu extents\n",
3490 (unsigned long long)rc->extents_found);
3491
3492 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
3493 btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
3494 invalidate_mapping_pages(rc->data_inode->i_mapping,
3495 0, -1);
3496 rc->stage = UPDATE_DATA_PTRS;
3497 } else if (rc->stage == UPDATE_DATA_PTRS &&
3498 rc->extents_skipped >= rc->extents_found) {
3499 iput(rc->data_inode);
3500 rc->data_inode = create_reloc_inode(fs_info,
3501 rc->block_group);
3502 if (IS_ERR(rc->data_inode)) {
3503 err = PTR_ERR(rc->data_inode);
3504 rc->data_inode = NULL;
3505 break;
3506 }
3507 rc->stage = MOVE_DATA_EXTENTS;
3508 rc->found_file_extent = 0;
3509 }
3510 }
3511
3512 filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
3513 rc->block_group->key.objectid,
3514 rc->block_group->key.objectid +
3515 rc->block_group->key.offset - 1);
3516
3517 WARN_ON(rc->block_group->pinned > 0);
3518 WARN_ON(rc->block_group->reserved > 0);
3519 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3520out:
3521 iput(rc->data_inode);
3522 btrfs_stop_workers(&rc->workers);
3523 btrfs_put_block_group(rc->block_group);
3524 kfree(rc);
3525 return err;
3526}
3527
3528/*
3529 * recover relocation interrupted by system crash.
3530 *
3531 * this function resumes merging reloc trees with corresponding fs trees.
3532 * this is important for keeping the sharing of tree blocks
3533 */
3534int btrfs_recover_relocation(struct btrfs_root *root)
3535{
3536 LIST_HEAD(reloc_roots);
3537 struct btrfs_key key;
3538 struct btrfs_root *fs_root;
3539 struct btrfs_root *reloc_root;
3540 struct btrfs_path *path;
3541 struct extent_buffer *leaf;
3542 struct reloc_control *rc = NULL;
3543 struct btrfs_trans_handle *trans;
3544 int ret;
3545 int err = 0;
3546
3547 path = btrfs_alloc_path();
3548 if (!path)
3549 return -ENOMEM;
3550
3551 key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3552 key.type = BTRFS_ROOT_ITEM_KEY;
3553 key.offset = (u64)-1;
3554
3555 while (1) {
3556 ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
3557 path, 0, 0);
3558 if (ret < 0) {
3559 err = ret;
3560 goto out;
3561 }
3562 if (ret > 0) {
3563 if (path->slots[0] == 0)
3564 break;
3565 path->slots[0]--;
3566 }
3567 leaf = path->nodes[0];
3568 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3569 btrfs_release_path(root->fs_info->tree_root, path);
3570
3571 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3572 key.type != BTRFS_ROOT_ITEM_KEY)
3573 break;
3574
3575 reloc_root = btrfs_read_fs_root_no_radix(root, &key);
3576 if (IS_ERR(reloc_root)) {
3577 err = PTR_ERR(reloc_root);
3578 goto out;
3579 }
3580
3581 list_add(&reloc_root->root_list, &reloc_roots);
3582
3583 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3584 fs_root = read_fs_root(root->fs_info,
3585 reloc_root->root_key.offset);
3586 if (IS_ERR(fs_root)) {
3587 err = PTR_ERR(fs_root);
3588 goto out;
3589 }
3590 }
3591
3592 if (key.offset == 0)
3593 break;
3594
3595 key.offset--;
3596 }
3597 btrfs_release_path(root->fs_info->tree_root, path);
3598
3599 if (list_empty(&reloc_roots))
3600 goto out;
3601
3602 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3603 if (!rc) {
3604 err = -ENOMEM;
3605 goto out;
3606 }
3607
3608 mapping_tree_init(&rc->reloc_root_tree);
3609 INIT_LIST_HEAD(&rc->reloc_roots);
3610 btrfs_init_workers(&rc->workers, "relocate",
3611 root->fs_info->thread_pool_size);
3612 rc->extent_root = root->fs_info->extent_root;
3613
3614 set_reloc_control(rc);
3615
3616 while (!list_empty(&reloc_roots)) {
3617 reloc_root = list_entry(reloc_roots.next,
3618 struct btrfs_root, root_list);
3619 list_del(&reloc_root->root_list);
3620
3621 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3622 list_add_tail(&reloc_root->root_list,
3623 &rc->reloc_roots);
3624 continue;
3625 }
3626
3627 fs_root = read_fs_root(root->fs_info,
3628 reloc_root->root_key.offset);
3629 BUG_ON(IS_ERR(fs_root));
3630
3631 __add_reloc_root(reloc_root);
3632 fs_root->reloc_root = reloc_root;
3633 }
3634
3635 trans = btrfs_start_transaction(rc->extent_root, 1);
3636 btrfs_commit_transaction(trans, rc->extent_root);
3637
3638 merge_reloc_roots(rc);
3639
3640 unset_reloc_control(rc);
3641
3642 trans = btrfs_start_transaction(rc->extent_root, 1);
3643 btrfs_commit_transaction(trans, rc->extent_root);
3644out:
3645 if (rc) {
3646 btrfs_stop_workers(&rc->workers);
3647 kfree(rc);
3648 }
3649 while (!list_empty(&reloc_roots)) {
3650 reloc_root = list_entry(reloc_roots.next,
3651 struct btrfs_root, root_list);
3652 list_del(&reloc_root->root_list);
3653 free_extent_buffer(reloc_root->node);
3654 free_extent_buffer(reloc_root->commit_root);
3655 kfree(reloc_root);
3656 }
3657 btrfs_free_path(path);
3658
3659 if (err == 0) {
3660 /* cleanup orphan inode in data relocation tree */
3661 fs_root = read_fs_root(root->fs_info,
3662 BTRFS_DATA_RELOC_TREE_OBJECTID);
3663 if (IS_ERR(fs_root))
3664 err = PTR_ERR(fs_root);
3665 }
3666 return err;
3667}
3668
3669/*
3670 * helper to add ordered checksum for data relocation.
3671 *
3672 * cloning checksum properly handles the nodatasum extents.
3673 * it also saves CPU time to re-calculate the checksum.
3674 */
3675int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3676{
3677 struct btrfs_ordered_sum *sums;
3678 struct btrfs_sector_sum *sector_sum;
3679 struct btrfs_ordered_extent *ordered;
3680 struct btrfs_root *root = BTRFS_I(inode)->root;
3681 size_t offset;
3682 int ret;
3683 u64 disk_bytenr;
3684 LIST_HEAD(list);
3685
3686 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
3687 BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
3688
3689 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
3690 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
3691 disk_bytenr + len - 1, &list);
3692
3693 while (!list_empty(&list)) {
3694 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3695 list_del_init(&sums->list);
3696
3697 sector_sum = sums->sums;
3698 sums->bytenr = ordered->start;
3699
3700 offset = 0;
3701 while (offset < sums->len) {
3702 sector_sum->bytenr += ordered->start - disk_bytenr;
3703 sector_sum++;
3704 offset += root->sectorsize;
3705 }
3706
3707 btrfs_add_ordered_sum(inode, ordered, sums);
3708 }
3709 btrfs_put_ordered_extent(ordered);
3710 return 0;
3711}