aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2012-05-31 16:50:28 -0400
committerChris Mason <chris.mason@oracle.com>2012-05-31 16:49:53 -0400
commit1e20932a23578bb1ec59107843574e259b96193f (patch)
tree844ae54293c4414fc4c232a36d0e4d4939dc35aa
parentcfc442b69696b593cb442f09997dcb4cb5748171 (diff)
parentc31931088fd6cf953bd0868a2647b6c3928e6c96 (diff)
Merge branch 'for-chris' of git://git.jan-o-sch.net/btrfs-unstable into for-linus
Conflicts: fs/btrfs/ulist.h Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/btrfs/backref.c495
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/ctree.c849
-rw-r--r--fs/btrfs/ctree.h34
-rw-r--r--fs/btrfs/delayed-ref.c10
-rw-r--r--fs/btrfs/delayed-ref.h24
-rw-r--r--fs/btrfs/disk-io.c7
-rw-r--r--fs/btrfs/extent-tree.c10
-rw-r--r--fs/btrfs/extent_io.c80
-rw-r--r--fs/btrfs/extent_io.h3
-rw-r--r--fs/btrfs/ioctl.c2
-rw-r--r--fs/btrfs/transaction.c55
-rw-r--r--fs/btrfs/ulist.c34
-rw-r--r--fs/btrfs/ulist.h14
14 files changed, 1368 insertions, 252 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index bcec06750232..3f75895c919b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -24,22 +24,135 @@
24#include "delayed-ref.h" 24#include "delayed-ref.h"
25#include "locking.h" 25#include "locking.h"
26 26
27struct extent_inode_elem {
28 u64 inum;
29 u64 offset;
30 struct extent_inode_elem *next;
31};
32
33static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
34 struct btrfs_file_extent_item *fi,
35 u64 extent_item_pos,
36 struct extent_inode_elem **eie)
37{
38 u64 data_offset;
39 u64 data_len;
40 struct extent_inode_elem *e;
41
42 data_offset = btrfs_file_extent_offset(eb, fi);
43 data_len = btrfs_file_extent_num_bytes(eb, fi);
44
45 if (extent_item_pos < data_offset ||
46 extent_item_pos >= data_offset + data_len)
47 return 1;
48
49 e = kmalloc(sizeof(*e), GFP_NOFS);
50 if (!e)
51 return -ENOMEM;
52
53 e->next = *eie;
54 e->inum = key->objectid;
55 e->offset = key->offset + (extent_item_pos - data_offset);
56 *eie = e;
57
58 return 0;
59}
60
61static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
62 u64 extent_item_pos,
63 struct extent_inode_elem **eie)
64{
65 u64 disk_byte;
66 struct btrfs_key key;
67 struct btrfs_file_extent_item *fi;
68 int slot;
69 int nritems;
70 int extent_type;
71 int ret;
72
73 /*
74 * from the shared data ref, we only have the leaf but we need
75 * the key. thus, we must look into all items and see that we
76 * find one (some) with a reference to our extent item.
77 */
78 nritems = btrfs_header_nritems(eb);
79 for (slot = 0; slot < nritems; ++slot) {
80 btrfs_item_key_to_cpu(eb, &key, slot);
81 if (key.type != BTRFS_EXTENT_DATA_KEY)
82 continue;
83 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
84 extent_type = btrfs_file_extent_type(eb, fi);
85 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
86 continue;
87 /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
88 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
89 if (disk_byte != wanted_disk_byte)
90 continue;
91
92 ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
93 if (ret < 0)
94 return ret;
95 }
96
97 return 0;
98}
99
27/* 100/*
28 * this structure records all encountered refs on the way up to the root 101 * this structure records all encountered refs on the way up to the root
29 */ 102 */
30struct __prelim_ref { 103struct __prelim_ref {
31 struct list_head list; 104 struct list_head list;
32 u64 root_id; 105 u64 root_id;
33 struct btrfs_key key; 106 struct btrfs_key key_for_search;
34 int level; 107 int level;
35 int count; 108 int count;
109 struct extent_inode_elem *inode_list;
36 u64 parent; 110 u64 parent;
37 u64 wanted_disk_byte; 111 u64 wanted_disk_byte;
38}; 112};
39 113
114/*
115 * the rules for all callers of this function are:
116 * - obtaining the parent is the goal
117 * - if you add a key, you must know that it is a correct key
118 * - if you cannot add the parent or a correct key, then we will look into the
119 * block later to set a correct key
120 *
121 * delayed refs
122 * ============
123 * backref type | shared | indirect | shared | indirect
124 * information | tree | tree | data | data
125 * --------------------+--------+----------+--------+----------
126 * parent logical | y | - | - | -
127 * key to resolve | - | y | y | y
128 * tree block logical | - | - | - | -
129 * root for resolving | y | y | y | y
130 *
131 * - column 1: we've the parent -> done
132 * - column 2, 3, 4: we use the key to find the parent
133 *
134 * on disk refs (inline or keyed)
135 * ==============================
136 * backref type | shared | indirect | shared | indirect
137 * information | tree | tree | data | data
138 * --------------------+--------+----------+--------+----------
139 * parent logical | y | - | y | -
140 * key to resolve | - | - | - | y
141 * tree block logical | y | y | y | y
142 * root for resolving | - | y | y | y
143 *
144 * - column 1, 3: we've the parent -> done
145 * - column 2: we take the first key from the block to find the parent
146 * (see __add_missing_keys)
147 * - column 4: we use the key to find the parent
148 *
149 * additional information that's available but not required to find the parent
150 * block might help in merging entries to gain some speed.
151 */
152
40static int __add_prelim_ref(struct list_head *head, u64 root_id, 153static int __add_prelim_ref(struct list_head *head, u64 root_id,
41 struct btrfs_key *key, int level, u64 parent, 154 struct btrfs_key *key, int level,
42 u64 wanted_disk_byte, int count) 155 u64 parent, u64 wanted_disk_byte, int count)
43{ 156{
44 struct __prelim_ref *ref; 157 struct __prelim_ref *ref;
45 158
@@ -50,10 +163,11 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
50 163
51 ref->root_id = root_id; 164 ref->root_id = root_id;
52 if (key) 165 if (key)
53 ref->key = *key; 166 ref->key_for_search = *key;
54 else 167 else
55 memset(&ref->key, 0, sizeof(ref->key)); 168 memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
56 169
170 ref->inode_list = NULL;
57 ref->level = level; 171 ref->level = level;
58 ref->count = count; 172 ref->count = count;
59 ref->parent = parent; 173 ref->parent = parent;
@@ -64,18 +178,26 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
64} 178}
65 179
66static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, 180static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
67 struct ulist *parents, 181 struct ulist *parents, int level,
68 struct extent_buffer *eb, int level, 182 struct btrfs_key *key, u64 wanted_disk_byte,
69 u64 wanted_objectid, u64 wanted_disk_byte) 183 const u64 *extent_item_pos)
70{ 184{
71 int ret; 185 int ret;
72 int slot; 186 int slot = path->slots[level];
187 struct extent_buffer *eb = path->nodes[level];
73 struct btrfs_file_extent_item *fi; 188 struct btrfs_file_extent_item *fi;
74 struct btrfs_key key; 189 struct extent_inode_elem *eie = NULL;
75 u64 disk_byte; 190 u64 disk_byte;
191 u64 wanted_objectid = key->objectid;
76 192
77add_parent: 193add_parent:
78 ret = ulist_add(parents, eb->start, 0, GFP_NOFS); 194 if (level == 0 && extent_item_pos) {
195 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
196 ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie);
197 if (ret < 0)
198 return ret;
199 }
200 ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS);
79 if (ret < 0) 201 if (ret < 0)
80 return ret; 202 return ret;
81 203
@@ -89,6 +211,7 @@ add_parent:
89 * repeat this until we don't find any additional EXTENT_DATA items. 211 * repeat this until we don't find any additional EXTENT_DATA items.
90 */ 212 */
91 while (1) { 213 while (1) {
214 eie = NULL;
92 ret = btrfs_next_leaf(root, path); 215 ret = btrfs_next_leaf(root, path);
93 if (ret < 0) 216 if (ret < 0)
94 return ret; 217 return ret;
@@ -97,9 +220,9 @@ add_parent:
97 220
98 eb = path->nodes[0]; 221 eb = path->nodes[0];
99 for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { 222 for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
100 btrfs_item_key_to_cpu(eb, &key, slot); 223 btrfs_item_key_to_cpu(eb, key, slot);
101 if (key.objectid != wanted_objectid || 224 if (key->objectid != wanted_objectid ||
102 key.type != BTRFS_EXTENT_DATA_KEY) 225 key->type != BTRFS_EXTENT_DATA_KEY)
103 return 0; 226 return 0;
104 fi = btrfs_item_ptr(eb, slot, 227 fi = btrfs_item_ptr(eb, slot,
105 struct btrfs_file_extent_item); 228 struct btrfs_file_extent_item);
@@ -118,8 +241,10 @@ add_parent:
118 */ 241 */
119static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, 242static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
120 int search_commit_root, 243 int search_commit_root,
244 u64 time_seq,
121 struct __prelim_ref *ref, 245 struct __prelim_ref *ref,
122 struct ulist *parents) 246 struct ulist *parents,
247 const u64 *extent_item_pos)
123{ 248{
124 struct btrfs_path *path; 249 struct btrfs_path *path;
125 struct btrfs_root *root; 250 struct btrfs_root *root;
@@ -152,12 +277,13 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
152 goto out; 277 goto out;
153 278
154 path->lowest_level = level; 279 path->lowest_level = level;
155 ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); 280 ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
156 pr_debug("search slot in root %llu (level %d, ref count %d) returned " 281 pr_debug("search slot in root %llu (level %d, ref count %d) returned "
157 "%d for key (%llu %u %llu)\n", 282 "%d for key (%llu %u %llu)\n",
158 (unsigned long long)ref->root_id, level, ref->count, ret, 283 (unsigned long long)ref->root_id, level, ref->count, ret,
159 (unsigned long long)ref->key.objectid, ref->key.type, 284 (unsigned long long)ref->key_for_search.objectid,
160 (unsigned long long)ref->key.offset); 285 ref->key_for_search.type,
286 (unsigned long long)ref->key_for_search.offset);
161 if (ret < 0) 287 if (ret < 0)
162 goto out; 288 goto out;
163 289
@@ -179,9 +305,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
179 btrfs_item_key_to_cpu(eb, &key, path->slots[0]); 305 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
180 } 306 }
181 307
182 /* the last two parameters will only be used for level == 0 */ 308 ret = add_all_parents(root, path, parents, level, &key,
183 ret = add_all_parents(root, path, parents, eb, level, key.objectid, 309 ref->wanted_disk_byte, extent_item_pos);
184 ref->wanted_disk_byte);
185out: 310out:
186 btrfs_free_path(path); 311 btrfs_free_path(path);
187 return ret; 312 return ret;
@@ -191,8 +316,9 @@ out:
191 * resolve all indirect backrefs from the list 316 * resolve all indirect backrefs from the list
192 */ 317 */
193static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, 318static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
194 int search_commit_root, 319 int search_commit_root, u64 time_seq,
195 struct list_head *head) 320 struct list_head *head,
321 const u64 *extent_item_pos)
196{ 322{
197 int err; 323 int err;
198 int ret = 0; 324 int ret = 0;
@@ -201,6 +327,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
201 struct __prelim_ref *new_ref; 327 struct __prelim_ref *new_ref;
202 struct ulist *parents; 328 struct ulist *parents;
203 struct ulist_node *node; 329 struct ulist_node *node;
330 struct ulist_iterator uiter;
204 331
205 parents = ulist_alloc(GFP_NOFS); 332 parents = ulist_alloc(GFP_NOFS);
206 if (!parents) 333 if (!parents)
@@ -217,7 +344,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
217 if (ref->count == 0) 344 if (ref->count == 0)
218 continue; 345 continue;
219 err = __resolve_indirect_ref(fs_info, search_commit_root, 346 err = __resolve_indirect_ref(fs_info, search_commit_root,
220 ref, parents); 347 time_seq, ref, parents,
348 extent_item_pos);
221 if (err) { 349 if (err) {
222 if (ret == 0) 350 if (ret == 0)
223 ret = err; 351 ret = err;
@@ -225,11 +353,14 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
225 } 353 }
226 354
227 /* we put the first parent into the ref at hand */ 355 /* we put the first parent into the ref at hand */
228 node = ulist_next(parents, NULL); 356 ULIST_ITER_INIT(&uiter);
357 node = ulist_next(parents, &uiter);
229 ref->parent = node ? node->val : 0; 358 ref->parent = node ? node->val : 0;
359 ref->inode_list =
360 node ? (struct extent_inode_elem *)node->aux : 0;
230 361
231 /* additional parents require new refs being added here */ 362 /* additional parents require new refs being added here */
232 while ((node = ulist_next(parents, node))) { 363 while ((node = ulist_next(parents, &uiter))) {
233 new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); 364 new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
234 if (!new_ref) { 365 if (!new_ref) {
235 ret = -ENOMEM; 366 ret = -ENOMEM;
@@ -237,6 +368,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
237 } 368 }
238 memcpy(new_ref, ref, sizeof(*ref)); 369 memcpy(new_ref, ref, sizeof(*ref));
239 new_ref->parent = node->val; 370 new_ref->parent = node->val;
371 new_ref->inode_list =
372 (struct extent_inode_elem *)node->aux;
240 list_add(&new_ref->list, &ref->list); 373 list_add(&new_ref->list, &ref->list);
241 } 374 }
242 ulist_reinit(parents); 375 ulist_reinit(parents);
@@ -246,10 +379,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
246 return ret; 379 return ret;
247} 380}
248 381
382static inline int ref_for_same_block(struct __prelim_ref *ref1,
383 struct __prelim_ref *ref2)
384{
385 if (ref1->level != ref2->level)
386 return 0;
387 if (ref1->root_id != ref2->root_id)
388 return 0;
389 if (ref1->key_for_search.type != ref2->key_for_search.type)
390 return 0;
391 if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
392 return 0;
393 if (ref1->key_for_search.offset != ref2->key_for_search.offset)
394 return 0;
395 if (ref1->parent != ref2->parent)
396 return 0;
397
398 return 1;
399}
400
401/*
402 * read tree blocks and add keys where required.
403 */
404static int __add_missing_keys(struct btrfs_fs_info *fs_info,
405 struct list_head *head)
406{
407 struct list_head *pos;
408 struct extent_buffer *eb;
409
410 list_for_each(pos, head) {
411 struct __prelim_ref *ref;
412 ref = list_entry(pos, struct __prelim_ref, list);
413
414 if (ref->parent)
415 continue;
416 if (ref->key_for_search.type)
417 continue;
418 BUG_ON(!ref->wanted_disk_byte);
419 eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
420 fs_info->tree_root->leafsize, 0);
421 BUG_ON(!eb);
422 btrfs_tree_read_lock(eb);
423 if (btrfs_header_level(eb) == 0)
424 btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
425 else
426 btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
427 btrfs_tree_read_unlock(eb);
428 free_extent_buffer(eb);
429 }
430 return 0;
431}
432
249/* 433/*
250 * merge two lists of backrefs and adjust counts accordingly 434 * merge two lists of backrefs and adjust counts accordingly
251 * 435 *
252 * mode = 1: merge identical keys, if key is set 436 * mode = 1: merge identical keys, if key is set
437 * FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
438 * additionally, we could even add a key range for the blocks we
439 * looked into to merge even more (-> replace unresolved refs by those
440 * having a parent).
253 * mode = 2: merge identical parents 441 * mode = 2: merge identical parents
254 */ 442 */
255static int __merge_refs(struct list_head *head, int mode) 443static int __merge_refs(struct list_head *head, int mode)
@@ -263,20 +451,21 @@ static int __merge_refs(struct list_head *head, int mode)
263 451
264 ref1 = list_entry(pos1, struct __prelim_ref, list); 452 ref1 = list_entry(pos1, struct __prelim_ref, list);
265 453
266 if (mode == 1 && ref1->key.type == 0)
267 continue;
268 for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; 454 for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
269 pos2 = n2, n2 = pos2->next) { 455 pos2 = n2, n2 = pos2->next) {
270 struct __prelim_ref *ref2; 456 struct __prelim_ref *ref2;
457 struct __prelim_ref *xchg;
271 458
272 ref2 = list_entry(pos2, struct __prelim_ref, list); 459 ref2 = list_entry(pos2, struct __prelim_ref, list);
273 460
274 if (mode == 1) { 461 if (mode == 1) {
275 if (memcmp(&ref1->key, &ref2->key, 462 if (!ref_for_same_block(ref1, ref2))
276 sizeof(ref1->key)) ||
277 ref1->level != ref2->level ||
278 ref1->root_id != ref2->root_id)
279 continue; 463 continue;
464 if (!ref1->parent && ref2->parent) {
465 xchg = ref1;
466 ref1 = ref2;
467 ref2 = xchg;
468 }
280 ref1->count += ref2->count; 469 ref1->count += ref2->count;
281 } else { 470 } else {
282 if (ref1->parent != ref2->parent) 471 if (ref1->parent != ref2->parent)
@@ -296,16 +485,17 @@ static int __merge_refs(struct list_head *head, int mode)
296 * smaller or equal that seq to the list 485 * smaller or equal that seq to the list
297 */ 486 */
298static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, 487static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
299 struct btrfs_key *info_key,
300 struct list_head *prefs) 488 struct list_head *prefs)
301{ 489{
302 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 490 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
303 struct rb_node *n = &head->node.rb_node; 491 struct rb_node *n = &head->node.rb_node;
492 struct btrfs_key key;
493 struct btrfs_key op_key = {0};
304 int sgn; 494 int sgn;
305 int ret = 0; 495 int ret = 0;
306 496
307 if (extent_op && extent_op->update_key) 497 if (extent_op && extent_op->update_key)
308 btrfs_disk_key_to_cpu(info_key, &extent_op->key); 498 btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
309 499
310 while ((n = rb_prev(n))) { 500 while ((n = rb_prev(n))) {
311 struct btrfs_delayed_ref_node *node; 501 struct btrfs_delayed_ref_node *node;
@@ -337,7 +527,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
337 struct btrfs_delayed_tree_ref *ref; 527 struct btrfs_delayed_tree_ref *ref;
338 528
339 ref = btrfs_delayed_node_to_tree_ref(node); 529 ref = btrfs_delayed_node_to_tree_ref(node);
340 ret = __add_prelim_ref(prefs, ref->root, info_key, 530 ret = __add_prelim_ref(prefs, ref->root, &op_key,
341 ref->level + 1, 0, node->bytenr, 531 ref->level + 1, 0, node->bytenr,
342 node->ref_mod * sgn); 532 node->ref_mod * sgn);
343 break; 533 break;
@@ -346,7 +536,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
346 struct btrfs_delayed_tree_ref *ref; 536 struct btrfs_delayed_tree_ref *ref;
347 537
348 ref = btrfs_delayed_node_to_tree_ref(node); 538 ref = btrfs_delayed_node_to_tree_ref(node);
349 ret = __add_prelim_ref(prefs, ref->root, info_key, 539 ret = __add_prelim_ref(prefs, ref->root, NULL,
350 ref->level + 1, ref->parent, 540 ref->level + 1, ref->parent,
351 node->bytenr, 541 node->bytenr,
352 node->ref_mod * sgn); 542 node->ref_mod * sgn);
@@ -354,8 +544,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
354 } 544 }
355 case BTRFS_EXTENT_DATA_REF_KEY: { 545 case BTRFS_EXTENT_DATA_REF_KEY: {
356 struct btrfs_delayed_data_ref *ref; 546 struct btrfs_delayed_data_ref *ref;
357 struct btrfs_key key;
358
359 ref = btrfs_delayed_node_to_data_ref(node); 547 ref = btrfs_delayed_node_to_data_ref(node);
360 548
361 key.objectid = ref->objectid; 549 key.objectid = ref->objectid;
@@ -368,7 +556,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
368 } 556 }
369 case BTRFS_SHARED_DATA_REF_KEY: { 557 case BTRFS_SHARED_DATA_REF_KEY: {
370 struct btrfs_delayed_data_ref *ref; 558 struct btrfs_delayed_data_ref *ref;
371 struct btrfs_key key;
372 559
373 ref = btrfs_delayed_node_to_data_ref(node); 560 ref = btrfs_delayed_node_to_data_ref(node);
374 561
@@ -394,8 +581,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
394 */ 581 */
395static int __add_inline_refs(struct btrfs_fs_info *fs_info, 582static int __add_inline_refs(struct btrfs_fs_info *fs_info,
396 struct btrfs_path *path, u64 bytenr, 583 struct btrfs_path *path, u64 bytenr,
397 struct btrfs_key *info_key, int *info_level, 584 int *info_level, struct list_head *prefs)
398 struct list_head *prefs)
399{ 585{
400 int ret = 0; 586 int ret = 0;
401 int slot; 587 int slot;
@@ -411,7 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
411 * enumerate all inline refs 597 * enumerate all inline refs
412 */ 598 */
413 leaf = path->nodes[0]; 599 leaf = path->nodes[0];
414 slot = path->slots[0] - 1; 600 slot = path->slots[0];
415 601
416 item_size = btrfs_item_size_nr(leaf, slot); 602 item_size = btrfs_item_size_nr(leaf, slot);
417 BUG_ON(item_size < sizeof(*ei)); 603 BUG_ON(item_size < sizeof(*ei));
@@ -424,12 +610,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
424 610
425 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 611 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
426 struct btrfs_tree_block_info *info; 612 struct btrfs_tree_block_info *info;
427 struct btrfs_disk_key disk_key;
428 613
429 info = (struct btrfs_tree_block_info *)ptr; 614 info = (struct btrfs_tree_block_info *)ptr;
430 *info_level = btrfs_tree_block_level(leaf, info); 615 *info_level = btrfs_tree_block_level(leaf, info);
431 btrfs_tree_block_key(leaf, info, &disk_key);
432 btrfs_disk_key_to_cpu(info_key, &disk_key);
433 ptr += sizeof(struct btrfs_tree_block_info); 616 ptr += sizeof(struct btrfs_tree_block_info);
434 BUG_ON(ptr > end); 617 BUG_ON(ptr > end);
435 } else { 618 } else {
@@ -447,7 +630,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
447 630
448 switch (type) { 631 switch (type) {
449 case BTRFS_SHARED_BLOCK_REF_KEY: 632 case BTRFS_SHARED_BLOCK_REF_KEY:
450 ret = __add_prelim_ref(prefs, 0, info_key, 633 ret = __add_prelim_ref(prefs, 0, NULL,
451 *info_level + 1, offset, 634 *info_level + 1, offset,
452 bytenr, 1); 635 bytenr, 1);
453 break; 636 break;
@@ -462,8 +645,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
462 break; 645 break;
463 } 646 }
464 case BTRFS_TREE_BLOCK_REF_KEY: 647 case BTRFS_TREE_BLOCK_REF_KEY:
465 ret = __add_prelim_ref(prefs, offset, info_key, 648 ret = __add_prelim_ref(prefs, offset, NULL,
466 *info_level + 1, 0, bytenr, 1); 649 *info_level + 1, 0,
650 bytenr, 1);
467 break; 651 break;
468 case BTRFS_EXTENT_DATA_REF_KEY: { 652 case BTRFS_EXTENT_DATA_REF_KEY: {
469 struct btrfs_extent_data_ref *dref; 653 struct btrfs_extent_data_ref *dref;
@@ -477,8 +661,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
477 key.type = BTRFS_EXTENT_DATA_KEY; 661 key.type = BTRFS_EXTENT_DATA_KEY;
478 key.offset = btrfs_extent_data_ref_offset(leaf, dref); 662 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
479 root = btrfs_extent_data_ref_root(leaf, dref); 663 root = btrfs_extent_data_ref_root(leaf, dref);
480 ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, 664 ret = __add_prelim_ref(prefs, root, &key, 0, 0,
481 count); 665 bytenr, count);
482 break; 666 break;
483 } 667 }
484 default: 668 default:
@@ -496,8 +680,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
496 */ 680 */
497static int __add_keyed_refs(struct btrfs_fs_info *fs_info, 681static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
498 struct btrfs_path *path, u64 bytenr, 682 struct btrfs_path *path, u64 bytenr,
499 struct btrfs_key *info_key, int info_level, 683 int info_level, struct list_head *prefs)
500 struct list_head *prefs)
501{ 684{
502 struct btrfs_root *extent_root = fs_info->extent_root; 685 struct btrfs_root *extent_root = fs_info->extent_root;
503 int ret; 686 int ret;
@@ -527,7 +710,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
527 710
528 switch (key.type) { 711 switch (key.type) {
529 case BTRFS_SHARED_BLOCK_REF_KEY: 712 case BTRFS_SHARED_BLOCK_REF_KEY:
530 ret = __add_prelim_ref(prefs, 0, info_key, 713 ret = __add_prelim_ref(prefs, 0, NULL,
531 info_level + 1, key.offset, 714 info_level + 1, key.offset,
532 bytenr, 1); 715 bytenr, 1);
533 break; 716 break;
@@ -543,8 +726,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
543 break; 726 break;
544 } 727 }
545 case BTRFS_TREE_BLOCK_REF_KEY: 728 case BTRFS_TREE_BLOCK_REF_KEY:
546 ret = __add_prelim_ref(prefs, key.offset, info_key, 729 ret = __add_prelim_ref(prefs, key.offset, NULL,
547 info_level + 1, 0, bytenr, 1); 730 info_level + 1, 0,
731 bytenr, 1);
548 break; 732 break;
549 case BTRFS_EXTENT_DATA_REF_KEY: { 733 case BTRFS_EXTENT_DATA_REF_KEY: {
550 struct btrfs_extent_data_ref *dref; 734 struct btrfs_extent_data_ref *dref;
@@ -560,7 +744,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
560 key.offset = btrfs_extent_data_ref_offset(leaf, dref); 744 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
561 root = btrfs_extent_data_ref_root(leaf, dref); 745 root = btrfs_extent_data_ref_root(leaf, dref);
562 ret = __add_prelim_ref(prefs, root, &key, 0, 0, 746 ret = __add_prelim_ref(prefs, root, &key, 0, 0,
563 bytenr, count); 747 bytenr, count);
564 break; 748 break;
565 } 749 }
566 default: 750 default:
@@ -582,11 +766,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
582 */ 766 */
583static int find_parent_nodes(struct btrfs_trans_handle *trans, 767static int find_parent_nodes(struct btrfs_trans_handle *trans,
584 struct btrfs_fs_info *fs_info, u64 bytenr, 768 struct btrfs_fs_info *fs_info, u64 bytenr,
585 u64 seq, struct ulist *refs, struct ulist *roots) 769 u64 delayed_ref_seq, u64 time_seq,
770 struct ulist *refs, struct ulist *roots,
771 const u64 *extent_item_pos)
586{ 772{
587 struct btrfs_key key; 773 struct btrfs_key key;
588 struct btrfs_path *path; 774 struct btrfs_path *path;
589 struct btrfs_key info_key = { 0 };
590 struct btrfs_delayed_ref_root *delayed_refs = NULL; 775 struct btrfs_delayed_ref_root *delayed_refs = NULL;
591 struct btrfs_delayed_ref_head *head; 776 struct btrfs_delayed_ref_head *head;
592 int info_level = 0; 777 int info_level = 0;
@@ -645,7 +830,7 @@ again:
645 btrfs_put_delayed_ref(&head->node); 830 btrfs_put_delayed_ref(&head->node);
646 goto again; 831 goto again;
647 } 832 }
648 ret = __add_delayed_refs(head, seq, &info_key, 833 ret = __add_delayed_refs(head, delayed_ref_seq,
649 &prefs_delayed); 834 &prefs_delayed);
650 if (ret) { 835 if (ret) {
651 spin_unlock(&delayed_refs->lock); 836 spin_unlock(&delayed_refs->lock);
@@ -659,16 +844,17 @@ again:
659 struct extent_buffer *leaf; 844 struct extent_buffer *leaf;
660 int slot; 845 int slot;
661 846
847 path->slots[0]--;
662 leaf = path->nodes[0]; 848 leaf = path->nodes[0];
663 slot = path->slots[0] - 1; 849 slot = path->slots[0];
664 btrfs_item_key_to_cpu(leaf, &key, slot); 850 btrfs_item_key_to_cpu(leaf, &key, slot);
665 if (key.objectid == bytenr && 851 if (key.objectid == bytenr &&
666 key.type == BTRFS_EXTENT_ITEM_KEY) { 852 key.type == BTRFS_EXTENT_ITEM_KEY) {
667 ret = __add_inline_refs(fs_info, path, bytenr, 853 ret = __add_inline_refs(fs_info, path, bytenr,
668 &info_key, &info_level, &prefs); 854 &info_level, &prefs);
669 if (ret) 855 if (ret)
670 goto out; 856 goto out;
671 ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, 857 ret = __add_keyed_refs(fs_info, path, bytenr,
672 info_level, &prefs); 858 info_level, &prefs);
673 if (ret) 859 if (ret)
674 goto out; 860 goto out;
@@ -676,21 +862,18 @@ again:
676 } 862 }
677 btrfs_release_path(path); 863 btrfs_release_path(path);
678 864
679 /*
680 * when adding the delayed refs above, the info_key might not have
681 * been known yet. Go over the list and replace the missing keys
682 */
683 list_for_each_entry(ref, &prefs_delayed, list) {
684 if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
685 memcpy(&ref->key, &info_key, sizeof(ref->key));
686 }
687 list_splice_init(&prefs_delayed, &prefs); 865 list_splice_init(&prefs_delayed, &prefs);
688 866
867 ret = __add_missing_keys(fs_info, &prefs);
868 if (ret)
869 goto out;
870
689 ret = __merge_refs(&prefs, 1); 871 ret = __merge_refs(&prefs, 1);
690 if (ret) 872 if (ret)
691 goto out; 873 goto out;
692 874
693 ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs); 875 ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
876 &prefs, extent_item_pos);
694 if (ret) 877 if (ret)
695 goto out; 878 goto out;
696 879
@@ -709,7 +892,33 @@ again:
709 BUG_ON(ret < 0); 892 BUG_ON(ret < 0);
710 } 893 }
711 if (ref->count && ref->parent) { 894 if (ref->count && ref->parent) {
712 ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); 895 struct extent_inode_elem *eie = NULL;
896 if (extent_item_pos && !ref->inode_list) {
897 u32 bsz;
898 struct extent_buffer *eb;
899 bsz = btrfs_level_size(fs_info->extent_root,
900 info_level);
901 eb = read_tree_block(fs_info->extent_root,
902 ref->parent, bsz, 0);
903 BUG_ON(!eb);
904 ret = find_extent_in_eb(eb, bytenr,
905 *extent_item_pos, &eie);
906 ref->inode_list = eie;
907 free_extent_buffer(eb);
908 }
909 ret = ulist_add_merge(refs, ref->parent,
910 (unsigned long)ref->inode_list,
911 (unsigned long *)&eie, GFP_NOFS);
912 if (!ret && extent_item_pos) {
913 /*
914 * we've recorded that parent, so we must extend
915 * its inode list here
916 */
917 BUG_ON(!eie);
918 while (eie->next)
919 eie = eie->next;
920 eie->next = ref->inode_list;
921 }
713 BUG_ON(ret < 0); 922 BUG_ON(ret < 0);
714 } 923 }
715 kfree(ref); 924 kfree(ref);
@@ -734,6 +943,28 @@ out:
734 return ret; 943 return ret;
735} 944}
736 945
946static void free_leaf_list(struct ulist *blocks)
947{
948 struct ulist_node *node = NULL;
949 struct extent_inode_elem *eie;
950 struct extent_inode_elem *eie_next;
951 struct ulist_iterator uiter;
952
953 ULIST_ITER_INIT(&uiter);
954 while ((node = ulist_next(blocks, &uiter))) {
955 if (!node->aux)
956 continue;
957 eie = (struct extent_inode_elem *)node->aux;
958 for (; eie; eie = eie_next) {
959 eie_next = eie->next;
960 kfree(eie);
961 }
962 node->aux = 0;
963 }
964
965 ulist_free(blocks);
966}
967
737/* 968/*
738 * Finds all leafs with a reference to the specified combination of bytenr and 969 * Finds all leafs with a reference to the specified combination of bytenr and
739 * offset. key_list_head will point to a list of corresponding keys (caller must 970 * offset. key_list_head will point to a list of corresponding keys (caller must
@@ -744,7 +975,9 @@ out:
744 */ 975 */
745static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, 976static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
746 struct btrfs_fs_info *fs_info, u64 bytenr, 977 struct btrfs_fs_info *fs_info, u64 bytenr,
747 u64 num_bytes, u64 seq, struct ulist **leafs) 978 u64 delayed_ref_seq, u64 time_seq,
979 struct ulist **leafs,
980 const u64 *extent_item_pos)
748{ 981{
749 struct ulist *tmp; 982 struct ulist *tmp;
750 int ret; 983 int ret;
@@ -758,11 +991,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
758 return -ENOMEM; 991 return -ENOMEM;
759 } 992 }
760 993
761 ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); 994 ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
995 time_seq, *leafs, tmp, extent_item_pos);
762 ulist_free(tmp); 996 ulist_free(tmp);
763 997
764 if (ret < 0 && ret != -ENOENT) { 998 if (ret < 0 && ret != -ENOENT) {
765 ulist_free(*leafs); 999 free_leaf_list(*leafs);
766 return ret; 1000 return ret;
767 } 1001 }
768 1002
@@ -784,10 +1018,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
784 */ 1018 */
785int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 1019int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
786 struct btrfs_fs_info *fs_info, u64 bytenr, 1020 struct btrfs_fs_info *fs_info, u64 bytenr,
787 u64 num_bytes, u64 seq, struct ulist **roots) 1021 u64 delayed_ref_seq, u64 time_seq,
1022 struct ulist **roots)
788{ 1023{
789 struct ulist *tmp; 1024 struct ulist *tmp;
790 struct ulist_node *node = NULL; 1025 struct ulist_node *node = NULL;
1026 struct ulist_iterator uiter;
791 int ret; 1027 int ret;
792 1028
793 tmp = ulist_alloc(GFP_NOFS); 1029 tmp = ulist_alloc(GFP_NOFS);
@@ -799,15 +1035,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
799 return -ENOMEM; 1035 return -ENOMEM;
800 } 1036 }
801 1037
1038 ULIST_ITER_INIT(&uiter);
802 while (1) { 1039 while (1) {
803 ret = find_parent_nodes(trans, fs_info, bytenr, seq, 1040 ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
804 tmp, *roots); 1041 time_seq, tmp, *roots, NULL);
805 if (ret < 0 && ret != -ENOENT) { 1042 if (ret < 0 && ret != -ENOENT) {
806 ulist_free(tmp); 1043 ulist_free(tmp);
807 ulist_free(*roots); 1044 ulist_free(*roots);
808 return ret; 1045 return ret;
809 } 1046 }
810 node = ulist_next(tmp, node); 1047 node = ulist_next(tmp, &uiter);
811 if (!node) 1048 if (!node)
812 break; 1049 break;
813 bytenr = node->val; 1050 bytenr = node->val;
@@ -1093,67 +1330,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
1093 return 0; 1330 return 0;
1094} 1331}
1095 1332
1096static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical, 1333static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
1097 u64 orig_extent_item_objectid, 1334 u64 root, u64 extent_item_objectid,
1098 u64 extent_item_pos, u64 root,
1099 iterate_extent_inodes_t *iterate, void *ctx) 1335 iterate_extent_inodes_t *iterate, void *ctx)
1100{ 1336{
1101 u64 disk_byte; 1337 struct extent_inode_elem *eie;
1102 struct btrfs_key key;
1103 struct btrfs_file_extent_item *fi;
1104 struct extent_buffer *eb;
1105 int slot;
1106 int nritems;
1107 int ret = 0; 1338 int ret = 0;
1108 int extent_type;
1109 u64 data_offset;
1110 u64 data_len;
1111
1112 eb = read_tree_block(fs_info->tree_root, logical,
1113 fs_info->tree_root->leafsize, 0);
1114 if (!eb)
1115 return -EIO;
1116
1117 /*
1118 * from the shared data ref, we only have the leaf but we need
1119 * the key. thus, we must look into all items and see that we
1120 * find one (some) with a reference to our extent item.
1121 */
1122 nritems = btrfs_header_nritems(eb);
1123 for (slot = 0; slot < nritems; ++slot) {
1124 btrfs_item_key_to_cpu(eb, &key, slot);
1125 if (key.type != BTRFS_EXTENT_DATA_KEY)
1126 continue;
1127 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
1128 extent_type = btrfs_file_extent_type(eb, fi);
1129 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1130 continue;
1131 /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
1132 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
1133 if (disk_byte != orig_extent_item_objectid)
1134 continue;
1135
1136 data_offset = btrfs_file_extent_offset(eb, fi);
1137 data_len = btrfs_file_extent_num_bytes(eb, fi);
1138
1139 if (extent_item_pos < data_offset ||
1140 extent_item_pos >= data_offset + data_len)
1141 continue;
1142 1339
1340 for (eie = inode_list; eie; eie = eie->next) {
1143 pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " 1341 pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
1144 "root %llu\n", orig_extent_item_objectid, 1342 "root %llu\n", extent_item_objectid,
1145 key.objectid, key.offset, root); 1343 eie->inum, eie->offset, root);
1146 ret = iterate(key.objectid, 1344 ret = iterate(eie->inum, eie->offset, root, ctx);
1147 key.offset + (extent_item_pos - data_offset),
1148 root, ctx);
1149 if (ret) { 1345 if (ret) {
1150 pr_debug("stopping iteration because ret=%d\n", ret); 1346 pr_debug("stopping iteration for %llu due to ret=%d\n",
1347 extent_item_objectid, ret);
1151 break; 1348 break;
1152 } 1349 }
1153 } 1350 }
1154 1351
1155 free_extent_buffer(eb);
1156
1157 return ret; 1352 return ret;
1158} 1353}
1159 1354
@@ -1175,7 +1370,10 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1175 struct ulist *roots = NULL; 1370 struct ulist *roots = NULL;
1176 struct ulist_node *ref_node = NULL; 1371 struct ulist_node *ref_node = NULL;
1177 struct ulist_node *root_node = NULL; 1372 struct ulist_node *root_node = NULL;
1178 struct seq_list seq_elem; 1373 struct seq_list seq_elem = {};
1374 struct seq_list tree_mod_seq_elem = {};
1375 struct ulist_iterator ref_uiter;
1376 struct ulist_iterator root_uiter;
1179 struct btrfs_delayed_ref_root *delayed_refs = NULL; 1377 struct btrfs_delayed_ref_root *delayed_refs = NULL;
1180 1378
1181 pr_debug("resolving all inodes for extent %llu\n", 1379 pr_debug("resolving all inodes for extent %llu\n",
@@ -1192,34 +1390,41 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1192 spin_lock(&delayed_refs->lock); 1390 spin_lock(&delayed_refs->lock);
1193 btrfs_get_delayed_seq(delayed_refs, &seq_elem); 1391 btrfs_get_delayed_seq(delayed_refs, &seq_elem);
1194 spin_unlock(&delayed_refs->lock); 1392 spin_unlock(&delayed_refs->lock);
1393 btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1195 } 1394 }
1196 1395
1197 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, 1396 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
1198 extent_item_pos, seq_elem.seq, 1397 seq_elem.seq, tree_mod_seq_elem.seq, &refs,
1199 &refs); 1398 &extent_item_pos);
1200
1201 if (ret) 1399 if (ret)
1202 goto out; 1400 goto out;
1203 1401
1204 while (!ret && (ref_node = ulist_next(refs, ref_node))) { 1402 ULIST_ITER_INIT(&ref_uiter);
1205 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, 1403 while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
1206 seq_elem.seq, &roots); 1404 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
1405 seq_elem.seq,
1406 tree_mod_seq_elem.seq, &roots);
1207 if (ret) 1407 if (ret)
1208 break; 1408 break;
1209 while (!ret && (root_node = ulist_next(roots, root_node))) { 1409 ULIST_ITER_INIT(&root_uiter);
1210 pr_debug("root %llu references leaf %llu\n", 1410 while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
1211 root_node->val, ref_node->val); 1411 pr_debug("root %llu references leaf %llu, data list "
1212 ret = iterate_leaf_refs(fs_info, ref_node->val, 1412 "%#lx\n", root_node->val, ref_node->val,
1213 extent_item_objectid, 1413 ref_node->aux);
1214 extent_item_pos, root_node->val, 1414 ret = iterate_leaf_refs(
1215 iterate, ctx); 1415 (struct extent_inode_elem *)ref_node->aux,
1416 root_node->val, extent_item_objectid,
1417 iterate, ctx);
1216 } 1418 }
1419 ulist_free(roots);
1420 roots = NULL;
1217 } 1421 }
1218 1422
1219 ulist_free(refs); 1423 free_leaf_list(refs);
1220 ulist_free(roots); 1424 ulist_free(roots);
1221out: 1425out:
1222 if (!search_commit_root) { 1426 if (!search_commit_root) {
1427 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1223 btrfs_put_delayed_seq(delayed_refs, &seq_elem); 1428 btrfs_put_delayed_seq(delayed_refs, &seq_elem);
1224 btrfs_end_transaction(trans, fs_info->extent_root); 1429 btrfs_end_transaction(trans, fs_info->extent_root);
1225 } 1430 }
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 57ea2e959e4d..c18d8ac7b795 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
58 58
59int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 59int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
60 struct btrfs_fs_info *fs_info, u64 bytenr, 60 struct btrfs_fs_info *fs_info, u64 bytenr,
61 u64 num_bytes, u64 seq, struct ulist **roots); 61 u64 delayed_ref_seq, u64 time_seq,
62 struct ulist **roots);
62 63
63struct btrfs_data_container *init_data_container(u32 total_bytes); 64struct btrfs_data_container *init_data_container(u32 total_bytes);
64struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, 65struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 99fcad631a21..d7a96cfdc50a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/rbtree.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "transaction.h" 24#include "transaction.h"
@@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
37 struct extent_buffer *dst_buf, 38 struct extent_buffer *dst_buf,
38 struct extent_buffer *src_buf); 39 struct extent_buffer *src_buf);
39static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
40 struct btrfs_path *path, int level, int slot); 41 struct btrfs_path *path, int level, int slot,
42 int tree_mod_log);
43static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
44 struct extent_buffer *eb);
45struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
46 u32 blocksize, u64 parent_transid,
47 u64 time_seq);
48struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
49 u64 bytenr, u32 blocksize,
50 u64 time_seq);
41 51
42struct btrfs_path *btrfs_alloc_path(void) 52struct btrfs_path *btrfs_alloc_path(void)
43{ 53{
@@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
255 265
256 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 266 cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
257 new_root_objectid, &disk_key, level, 267 new_root_objectid, &disk_key, level,
258 buf->start, 0, 1); 268 buf->start, 0);
259 if (IS_ERR(cow)) 269 if (IS_ERR(cow))
260 return PTR_ERR(cow); 270 return PTR_ERR(cow);
261 271
@@ -288,6 +298,434 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
288 return 0; 298 return 0;
289} 299}
290 300
301enum mod_log_op {
302 MOD_LOG_KEY_REPLACE,
303 MOD_LOG_KEY_ADD,
304 MOD_LOG_KEY_REMOVE,
305 MOD_LOG_KEY_REMOVE_WHILE_FREEING,
306 MOD_LOG_KEY_REMOVE_WHILE_MOVING,
307 MOD_LOG_MOVE_KEYS,
308 MOD_LOG_ROOT_REPLACE,
309};
310
311struct tree_mod_move {
312 int dst_slot;
313 int nr_items;
314};
315
316struct tree_mod_root {
317 u64 logical;
318 u8 level;
319};
320
321struct tree_mod_elem {
322 struct rb_node node;
323 u64 index; /* shifted logical */
324 struct seq_list elem;
325 enum mod_log_op op;
326
327 /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
328 int slot;
329
330 /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
331 u64 generation;
332
333 /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
334 struct btrfs_disk_key key;
335 u64 blockptr;
336
337 /* this is used for op == MOD_LOG_MOVE_KEYS */
338 struct tree_mod_move move;
339
340 /* this is used for op == MOD_LOG_ROOT_REPLACE */
341 struct tree_mod_root old_root;
342};
343
344static inline void
345__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
346{
347 elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
348 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
349}
350
351void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
352 struct seq_list *elem)
353{
354 elem->flags = 1;
355 spin_lock(&fs_info->tree_mod_seq_lock);
356 __get_tree_mod_seq(fs_info, elem);
357 spin_unlock(&fs_info->tree_mod_seq_lock);
358}
359
360void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
361 struct seq_list *elem)
362{
363 struct rb_root *tm_root;
364 struct rb_node *node;
365 struct rb_node *next;
366 struct seq_list *cur_elem;
367 struct tree_mod_elem *tm;
368 u64 min_seq = (u64)-1;
369 u64 seq_putting = elem->seq;
370
371 if (!seq_putting)
372 return;
373
374 BUG_ON(!(elem->flags & 1));
375 spin_lock(&fs_info->tree_mod_seq_lock);
376 list_del(&elem->list);
377
378 list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
379 if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
380 if (seq_putting > cur_elem->seq) {
381 /*
382 * blocker with lower sequence number exists, we
383 * cannot remove anything from the log
384 */
385 goto out;
386 }
387 min_seq = cur_elem->seq;
388 }
389 }
390
391 /*
392 * anything that's lower than the lowest existing (read: blocked)
393 * sequence number can be removed from the tree.
394 */
395 write_lock(&fs_info->tree_mod_log_lock);
396 tm_root = &fs_info->tree_mod_log;
397 for (node = rb_first(tm_root); node; node = next) {
398 next = rb_next(node);
399 tm = container_of(node, struct tree_mod_elem, node);
400 if (tm->elem.seq > min_seq)
401 continue;
402 rb_erase(node, tm_root);
403 list_del(&tm->elem.list);
404 kfree(tm);
405 }
406 write_unlock(&fs_info->tree_mod_log_lock);
407out:
408 spin_unlock(&fs_info->tree_mod_seq_lock);
409}
410
411/*
412 * key order of the log:
413 * index -> sequence
414 *
415 * the index is the shifted logical of the *new* root node for root replace
416 * operations, or the shifted logical of the affected block for all other
417 * operations.
418 */
419static noinline int
420__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
421{
422 struct rb_root *tm_root;
423 struct rb_node **new;
424 struct rb_node *parent = NULL;
425 struct tree_mod_elem *cur;
426 int ret = 0;
427
428 BUG_ON(!tm || !tm->elem.seq);
429
430 write_lock(&fs_info->tree_mod_log_lock);
431 tm_root = &fs_info->tree_mod_log;
432 new = &tm_root->rb_node;
433 while (*new) {
434 cur = container_of(*new, struct tree_mod_elem, node);
435 parent = *new;
436 if (cur->index < tm->index)
437 new = &((*new)->rb_left);
438 else if (cur->index > tm->index)
439 new = &((*new)->rb_right);
440 else if (cur->elem.seq < tm->elem.seq)
441 new = &((*new)->rb_left);
442 else if (cur->elem.seq > tm->elem.seq)
443 new = &((*new)->rb_right);
444 else {
445 kfree(tm);
446 ret = -EEXIST;
447 goto unlock;
448 }
449 }
450
451 rb_link_node(&tm->node, parent, new);
452 rb_insert_color(&tm->node, tm_root);
453unlock:
454 write_unlock(&fs_info->tree_mod_log_lock);
455 return ret;
456}
457
458static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
459 struct extent_buffer *eb) {
460 smp_mb();
461 if (list_empty(&(fs_info)->tree_mod_seq_list))
462 return 1;
463 if (!eb)
464 return 0;
465 if (btrfs_header_level(eb) == 0)
466 return 1;
467 return 0;
468}
469
470static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
471 struct tree_mod_elem **tm_ret)
472{
473 struct tree_mod_elem *tm;
474 int seq;
475
476 if (tree_mod_dont_log(fs_info, NULL))
477 return 0;
478
479 tm = *tm_ret = kzalloc(sizeof(*tm), flags);
480 if (!tm)
481 return -ENOMEM;
482
483 tm->elem.flags = 0;
484 spin_lock(&fs_info->tree_mod_seq_lock);
485 if (list_empty(&fs_info->tree_mod_seq_list)) {
486 /*
487 * someone emptied the list while we were waiting for the lock.
488 * we must not add to the list, because no blocker exists. items
489 * are removed from the list only when the existing blocker is
490 * removed from the list.
491 */
492 kfree(tm);
493 seq = 0;
494 } else {
495 __get_tree_mod_seq(fs_info, &tm->elem);
496 seq = tm->elem.seq;
497 }
498 spin_unlock(&fs_info->tree_mod_seq_lock);
499
500 return seq;
501}
502
503static noinline int
504tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
505 struct extent_buffer *eb, int slot,
506 enum mod_log_op op, gfp_t flags)
507{
508 struct tree_mod_elem *tm;
509 int ret;
510
511 ret = tree_mod_alloc(fs_info, flags, &tm);
512 if (ret <= 0)
513 return ret;
514
515 tm->index = eb->start >> PAGE_CACHE_SHIFT;
516 if (op != MOD_LOG_KEY_ADD) {
517 btrfs_node_key(eb, &tm->key, slot);
518 tm->blockptr = btrfs_node_blockptr(eb, slot);
519 }
520 tm->op = op;
521 tm->slot = slot;
522 tm->generation = btrfs_node_ptr_generation(eb, slot);
523
524 return __tree_mod_log_insert(fs_info, tm);
525}
526
527static noinline int
528tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
529 int slot, enum mod_log_op op)
530{
531 return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
532}
533
534static noinline int
535tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
536 struct extent_buffer *eb, int dst_slot, int src_slot,
537 int nr_items, gfp_t flags)
538{
539 struct tree_mod_elem *tm;
540 int ret;
541 int i;
542
543 if (tree_mod_dont_log(fs_info, eb))
544 return 0;
545
546 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
547 ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
548 MOD_LOG_KEY_REMOVE_WHILE_MOVING);
549 BUG_ON(ret < 0);
550 }
551
552 ret = tree_mod_alloc(fs_info, flags, &tm);
553 if (ret <= 0)
554 return ret;
555
556 tm->index = eb->start >> PAGE_CACHE_SHIFT;
557 tm->slot = src_slot;
558 tm->move.dst_slot = dst_slot;
559 tm->move.nr_items = nr_items;
560 tm->op = MOD_LOG_MOVE_KEYS;
561
562 return __tree_mod_log_insert(fs_info, tm);
563}
564
565static noinline int
566tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
567 struct extent_buffer *old_root,
568 struct extent_buffer *new_root, gfp_t flags)
569{
570 struct tree_mod_elem *tm;
571 int ret;
572
573 ret = tree_mod_alloc(fs_info, flags, &tm);
574 if (ret <= 0)
575 return ret;
576
577 tm->index = new_root->start >> PAGE_CACHE_SHIFT;
578 tm->old_root.logical = old_root->start;
579 tm->old_root.level = btrfs_header_level(old_root);
580 tm->generation = btrfs_header_generation(old_root);
581 tm->op = MOD_LOG_ROOT_REPLACE;
582
583 return __tree_mod_log_insert(fs_info, tm);
584}
585
586static struct tree_mod_elem *
587__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
588 int smallest)
589{
590 struct rb_root *tm_root;
591 struct rb_node *node;
592 struct tree_mod_elem *cur = NULL;
593 struct tree_mod_elem *found = NULL;
594 u64 index = start >> PAGE_CACHE_SHIFT;
595
596 read_lock(&fs_info->tree_mod_log_lock);
597 tm_root = &fs_info->tree_mod_log;
598 node = tm_root->rb_node;
599 while (node) {
600 cur = container_of(node, struct tree_mod_elem, node);
601 if (cur->index < index) {
602 node = node->rb_left;
603 } else if (cur->index > index) {
604 node = node->rb_right;
605 } else if (cur->elem.seq < min_seq) {
606 node = node->rb_left;
607 } else if (!smallest) {
608 /* we want the node with the highest seq */
609 if (found)
610 BUG_ON(found->elem.seq > cur->elem.seq);
611 found = cur;
612 node = node->rb_left;
613 } else if (cur->elem.seq > min_seq) {
614 /* we want the node with the smallest seq */
615 if (found)
616 BUG_ON(found->elem.seq < cur->elem.seq);
617 found = cur;
618 node = node->rb_right;
619 } else {
620 found = cur;
621 break;
622 }
623 }
624 read_unlock(&fs_info->tree_mod_log_lock);
625
626 return found;
627}
628
629/*
630 * this returns the element from the log with the smallest time sequence
631 * value that's in the log (the oldest log item). any element with a time
632 * sequence lower than min_seq will be ignored.
633 */
634static struct tree_mod_elem *
635tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
636 u64 min_seq)
637{
638 return __tree_mod_log_search(fs_info, start, min_seq, 1);
639}
640
641/*
642 * this returns the element from the log with the largest time sequence
643 * value that's in the log (the most recent log item). any element with
644 * a time sequence lower than min_seq will be ignored.
645 */
646static struct tree_mod_elem *
647tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
648{
649 return __tree_mod_log_search(fs_info, start, min_seq, 0);
650}
651
652static inline void
653tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
654 struct extent_buffer *src, unsigned long dst_offset,
655 unsigned long src_offset, int nr_items)
656{
657 int ret;
658 int i;
659
660 if (tree_mod_dont_log(fs_info, NULL))
661 return;
662
663 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
664 return;
665
666 /* speed this up by single seq for all operations? */
667 for (i = 0; i < nr_items; i++) {
668 ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
669 MOD_LOG_KEY_REMOVE);
670 BUG_ON(ret < 0);
671 ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
672 MOD_LOG_KEY_ADD);
673 BUG_ON(ret < 0);
674 }
675}
676
677static inline void
678tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
679 int dst_offset, int src_offset, int nr_items)
680{
681 int ret;
682 ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
683 nr_items, GFP_NOFS);
684 BUG_ON(ret < 0);
685}
686
687static inline void
688tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
689 struct extent_buffer *eb,
690 struct btrfs_disk_key *disk_key, int slot, int atomic)
691{
692 int ret;
693
694 ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
695 MOD_LOG_KEY_REPLACE,
696 atomic ? GFP_ATOMIC : GFP_NOFS);
697 BUG_ON(ret < 0);
698}
699
700static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
701 struct extent_buffer *eb)
702{
703 int i;
704 int ret;
705 u32 nritems;
706
707 if (tree_mod_dont_log(fs_info, eb))
708 return;
709
710 nritems = btrfs_header_nritems(eb);
711 for (i = nritems - 1; i >= 0; i--) {
712 ret = tree_mod_log_insert_key(fs_info, eb, i,
713 MOD_LOG_KEY_REMOVE_WHILE_FREEING);
714 BUG_ON(ret < 0);
715 }
716}
717
718static inline void
719tree_mod_log_set_root_pointer(struct btrfs_root *root,
720 struct extent_buffer *new_root_node)
721{
722 int ret;
723 tree_mod_log_free_eb(root->fs_info, root->node);
724 ret = tree_mod_log_insert_root(root->fs_info, root->node,
725 new_root_node, GFP_NOFS);
726 BUG_ON(ret < 0);
727}
728
291/* 729/*
292 * check if the tree block can be shared by multiple trees 730 * check if the tree block can be shared by multiple trees
293 */ 731 */
@@ -409,6 +847,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
409 ret = btrfs_dec_ref(trans, root, buf, 1, 1); 847 ret = btrfs_dec_ref(trans, root, buf, 1, 1);
410 BUG_ON(ret); /* -ENOMEM */ 848 BUG_ON(ret); /* -ENOMEM */
411 } 849 }
850 /*
851 * don't log freeing in case we're freeing the root node, this
852 * is done by tree_mod_log_set_root_pointer later
853 */
854 if (buf != root->node && btrfs_header_level(buf) != 0)
855 tree_mod_log_free_eb(root->fs_info, buf);
412 clean_tree_block(trans, root, buf); 856 clean_tree_block(trans, root, buf);
413 *last_ref = 1; 857 *last_ref = 1;
414 } 858 }
@@ -467,7 +911,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
467 911
468 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 912 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
469 root->root_key.objectid, &disk_key, 913 root->root_key.objectid, &disk_key,
470 level, search_start, empty_size, 1); 914 level, search_start, empty_size);
471 if (IS_ERR(cow)) 915 if (IS_ERR(cow))
472 return PTR_ERR(cow); 916 return PTR_ERR(cow);
473 917
@@ -506,10 +950,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
506 parent_start = 0; 950 parent_start = 0;
507 951
508 extent_buffer_get(cow); 952 extent_buffer_get(cow);
953 tree_mod_log_set_root_pointer(root, cow);
509 rcu_assign_pointer(root->node, cow); 954 rcu_assign_pointer(root->node, cow);
510 955
511 btrfs_free_tree_block(trans, root, buf, parent_start, 956 btrfs_free_tree_block(trans, root, buf, parent_start,
512 last_ref, 1); 957 last_ref);
513 free_extent_buffer(buf); 958 free_extent_buffer(buf);
514 add_root_to_dirty_list(root); 959 add_root_to_dirty_list(root);
515 } else { 960 } else {
@@ -519,13 +964,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
519 parent_start = 0; 964 parent_start = 0;
520 965
521 WARN_ON(trans->transid != btrfs_header_generation(parent)); 966 WARN_ON(trans->transid != btrfs_header_generation(parent));
967 tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
968 MOD_LOG_KEY_REPLACE);
522 btrfs_set_node_blockptr(parent, parent_slot, 969 btrfs_set_node_blockptr(parent, parent_slot,
523 cow->start); 970 cow->start);
524 btrfs_set_node_ptr_generation(parent, parent_slot, 971 btrfs_set_node_ptr_generation(parent, parent_slot,
525 trans->transid); 972 trans->transid);
526 btrfs_mark_buffer_dirty(parent); 973 btrfs_mark_buffer_dirty(parent);
527 btrfs_free_tree_block(trans, root, buf, parent_start, 974 btrfs_free_tree_block(trans, root, buf, parent_start,
528 last_ref, 1); 975 last_ref);
529 } 976 }
530 if (unlock_orig) 977 if (unlock_orig)
531 btrfs_tree_unlock(buf); 978 btrfs_tree_unlock(buf);
@@ -535,6 +982,210 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
535 return 0; 982 return 0;
536} 983}
537 984
985/*
986 * returns the logical address of the oldest predecessor of the given root.
987 * entries older than time_seq are ignored.
988 */
989static struct tree_mod_elem *
990__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
991 struct btrfs_root *root, u64 time_seq)
992{
993 struct tree_mod_elem *tm;
994 struct tree_mod_elem *found = NULL;
995 u64 root_logical = root->node->start;
996 int looped = 0;
997
998 if (!time_seq)
999 return 0;
1000
1001 /*
1002 * the very last operation that's logged for a root is the replacement
1003 * operation (if it is replaced at all). this has the index of the *new*
1004 * root, making it the very first operation that's logged for this root.
1005 */
1006 while (1) {
1007 tm = tree_mod_log_search_oldest(fs_info, root_logical,
1008 time_seq);
1009 if (!looped && !tm)
1010 return 0;
1011 /*
1012 * we must have key remove operations in the log before the
1013 * replace operation.
1014 */
1015 BUG_ON(!tm);
1016
1017 if (tm->op != MOD_LOG_ROOT_REPLACE)
1018 break;
1019
1020 found = tm;
1021 root_logical = tm->old_root.logical;
1022 BUG_ON(root_logical == root->node->start);
1023 looped = 1;
1024 }
1025
1026 return found;
1027}
1028
1029/*
1030 * tm is a pointer to the first operation to rewind within eb. then, all
1031 * previous operations will be rewinded (until we reach something older than
1032 * time_seq).
1033 */
1034static void
1035__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1036 struct tree_mod_elem *first_tm)
1037{
1038 u32 n;
1039 struct rb_node *next;
1040 struct tree_mod_elem *tm = first_tm;
1041 unsigned long o_dst;
1042 unsigned long o_src;
1043 unsigned long p_size = sizeof(struct btrfs_key_ptr);
1044
1045 n = btrfs_header_nritems(eb);
1046 while (tm && tm->elem.seq >= time_seq) {
1047 /*
1048 * all the operations are recorded with the operator used for
1049 * the modification. as we're going backwards, we do the
1050 * opposite of each operation here.
1051 */
1052 switch (tm->op) {
1053 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1054 BUG_ON(tm->slot < n);
1055 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1056 case MOD_LOG_KEY_REMOVE:
1057 btrfs_set_node_key(eb, &tm->key, tm->slot);
1058 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1059 btrfs_set_node_ptr_generation(eb, tm->slot,
1060 tm->generation);
1061 n++;
1062 break;
1063 case MOD_LOG_KEY_REPLACE:
1064 BUG_ON(tm->slot >= n);
1065 btrfs_set_node_key(eb, &tm->key, tm->slot);
1066 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1067 btrfs_set_node_ptr_generation(eb, tm->slot,
1068 tm->generation);
1069 break;
1070 case MOD_LOG_KEY_ADD:
1071 if (tm->slot != n - 1) {
1072 o_dst = btrfs_node_key_ptr_offset(tm->slot);
1073 o_src = btrfs_node_key_ptr_offset(tm->slot + 1);
1074 memmove_extent_buffer(eb, o_dst, o_src, p_size);
1075 }
1076 n--;
1077 break;
1078 case MOD_LOG_MOVE_KEYS:
1079 o_dst = btrfs_node_key_ptr_offset(tm->slot);
1080 o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1081 memmove_extent_buffer(eb, o_dst, o_src,
1082 tm->move.nr_items * p_size);
1083 break;
1084 case MOD_LOG_ROOT_REPLACE:
1085 /*
1086 * this operation is special. for roots, this must be
1087 * handled explicitly before rewinding.
1088 * for non-roots, this operation may exist if the node
1089 * was a root: root A -> child B; then A gets empty and
1090 * B is promoted to the new root. in the mod log, we'll
1091 * have a root-replace operation for B, a tree block
1092 * that is no root. we simply ignore that operation.
1093 */
1094 break;
1095 }
1096 next = rb_next(&tm->node);
1097 if (!next)
1098 break;
1099 tm = container_of(next, struct tree_mod_elem, node);
1100 if (tm->index != first_tm->index)
1101 break;
1102 }
1103 btrfs_set_header_nritems(eb, n);
1104}
1105
1106static struct extent_buffer *
1107tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1108 u64 time_seq)
1109{
1110 struct extent_buffer *eb_rewin;
1111 struct tree_mod_elem *tm;
1112
1113 if (!time_seq)
1114 return eb;
1115
1116 if (btrfs_header_level(eb) == 0)
1117 return eb;
1118
1119 tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1120 if (!tm)
1121 return eb;
1122
1123 if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1124 BUG_ON(tm->slot != 0);
1125 eb_rewin = alloc_dummy_extent_buffer(eb->start,
1126 fs_info->tree_root->nodesize);
1127 BUG_ON(!eb_rewin);
1128 btrfs_set_header_bytenr(eb_rewin, eb->start);
1129 btrfs_set_header_backref_rev(eb_rewin,
1130 btrfs_header_backref_rev(eb));
1131 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1132 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1133 } else {
1134 eb_rewin = btrfs_clone_extent_buffer(eb);
1135 BUG_ON(!eb_rewin);
1136 }
1137
1138 extent_buffer_get(eb_rewin);
1139 free_extent_buffer(eb);
1140
1141 __tree_mod_log_rewind(eb_rewin, time_seq, tm);
1142
1143 return eb_rewin;
1144}
1145
1146static inline struct extent_buffer *
1147get_old_root(struct btrfs_root *root, u64 time_seq)
1148{
1149 struct tree_mod_elem *tm;
1150 struct extent_buffer *eb;
1151 struct tree_mod_root *old_root;
1152 u64 old_generation;
1153
1154 tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
1155 if (!tm)
1156 return root->node;
1157
1158 old_root = &tm->old_root;
1159 old_generation = tm->generation;
1160
1161 tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq);
1162 /*
1163 * there was an item in the log when __tree_mod_log_oldest_root
1164 * returned. this one must not go away, because the time_seq passed to
1165 * us must be blocking its removal.
1166 */
1167 BUG_ON(!tm);
1168
1169 if (old_root->logical == root->node->start) {
1170 /* there are logged operations for the current root */
1171 eb = btrfs_clone_extent_buffer(root->node);
1172 } else {
1173 /* there's a root replace operation for the current root */
1174 eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT,
1175 root->nodesize);
1176 btrfs_set_header_bytenr(eb, eb->start);
1177 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1178 btrfs_set_header_owner(eb, root->root_key.objectid);
1179 }
1180 if (!eb)
1181 return NULL;
1182 btrfs_set_header_level(eb, old_root->level);
1183 btrfs_set_header_generation(eb, old_generation);
1184 __tree_mod_log_rewind(eb, time_seq, tm);
1185
1186 return eb;
1187}
1188
538static inline int should_cow_block(struct btrfs_trans_handle *trans, 1189static inline int should_cow_block(struct btrfs_trans_handle *trans,
539 struct btrfs_root *root, 1190 struct btrfs_root *root,
540 struct extent_buffer *buf) 1191 struct extent_buffer *buf)
@@ -976,6 +1627,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
976 goto enospc; 1627 goto enospc;
977 } 1628 }
978 1629
1630 tree_mod_log_set_root_pointer(root, child);
979 rcu_assign_pointer(root->node, child); 1631 rcu_assign_pointer(root->node, child);
980 1632
981 add_root_to_dirty_list(root); 1633 add_root_to_dirty_list(root);
@@ -989,7 +1641,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
989 free_extent_buffer(mid); 1641 free_extent_buffer(mid);
990 1642
991 root_sub_used(root, mid->len); 1643 root_sub_used(root, mid->len);
992 btrfs_free_tree_block(trans, root, mid, 0, 1, 0); 1644 btrfs_free_tree_block(trans, root, mid, 0, 1);
993 /* once for the root ptr */ 1645 /* once for the root ptr */
994 free_extent_buffer_stale(mid); 1646 free_extent_buffer_stale(mid);
995 return 0; 1647 return 0;
@@ -1042,14 +1694,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1042 if (btrfs_header_nritems(right) == 0) { 1694 if (btrfs_header_nritems(right) == 0) {
1043 clean_tree_block(trans, root, right); 1695 clean_tree_block(trans, root, right);
1044 btrfs_tree_unlock(right); 1696 btrfs_tree_unlock(right);
1045 del_ptr(trans, root, path, level + 1, pslot + 1); 1697 del_ptr(trans, root, path, level + 1, pslot + 1, 1);
1046 root_sub_used(root, right->len); 1698 root_sub_used(root, right->len);
1047 btrfs_free_tree_block(trans, root, right, 0, 1, 0); 1699 btrfs_free_tree_block(trans, root, right, 0, 1);
1048 free_extent_buffer_stale(right); 1700 free_extent_buffer_stale(right);
1049 right = NULL; 1701 right = NULL;
1050 } else { 1702 } else {
1051 struct btrfs_disk_key right_key; 1703 struct btrfs_disk_key right_key;
1052 btrfs_node_key(right, &right_key, 0); 1704 btrfs_node_key(right, &right_key, 0);
1705 tree_mod_log_set_node_key(root->fs_info, parent,
1706 &right_key, pslot + 1, 0);
1053 btrfs_set_node_key(parent, &right_key, pslot + 1); 1707 btrfs_set_node_key(parent, &right_key, pslot + 1);
1054 btrfs_mark_buffer_dirty(parent); 1708 btrfs_mark_buffer_dirty(parent);
1055 } 1709 }
@@ -1084,15 +1738,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1084 if (btrfs_header_nritems(mid) == 0) { 1738 if (btrfs_header_nritems(mid) == 0) {
1085 clean_tree_block(trans, root, mid); 1739 clean_tree_block(trans, root, mid);
1086 btrfs_tree_unlock(mid); 1740 btrfs_tree_unlock(mid);
1087 del_ptr(trans, root, path, level + 1, pslot); 1741 del_ptr(trans, root, path, level + 1, pslot, 1);
1088 root_sub_used(root, mid->len); 1742 root_sub_used(root, mid->len);
1089 btrfs_free_tree_block(trans, root, mid, 0, 1, 0); 1743 btrfs_free_tree_block(trans, root, mid, 0, 1);
1090 free_extent_buffer_stale(mid); 1744 free_extent_buffer_stale(mid);
1091 mid = NULL; 1745 mid = NULL;
1092 } else { 1746 } else {
1093 /* update the parent key to reflect our changes */ 1747 /* update the parent key to reflect our changes */
1094 struct btrfs_disk_key mid_key; 1748 struct btrfs_disk_key mid_key;
1095 btrfs_node_key(mid, &mid_key, 0); 1749 btrfs_node_key(mid, &mid_key, 0);
1750 tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
1751 pslot, 0);
1096 btrfs_set_node_key(parent, &mid_key, pslot); 1752 btrfs_set_node_key(parent, &mid_key, pslot);
1097 btrfs_mark_buffer_dirty(parent); 1753 btrfs_mark_buffer_dirty(parent);
1098 } 1754 }
@@ -1190,6 +1846,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1190 struct btrfs_disk_key disk_key; 1846 struct btrfs_disk_key disk_key;
1191 orig_slot += left_nr; 1847 orig_slot += left_nr;
1192 btrfs_node_key(mid, &disk_key, 0); 1848 btrfs_node_key(mid, &disk_key, 0);
1849 tree_mod_log_set_node_key(root->fs_info, parent,
1850 &disk_key, pslot, 0);
1193 btrfs_set_node_key(parent, &disk_key, pslot); 1851 btrfs_set_node_key(parent, &disk_key, pslot);
1194 btrfs_mark_buffer_dirty(parent); 1852 btrfs_mark_buffer_dirty(parent);
1195 if (btrfs_header_nritems(left) > orig_slot) { 1853 if (btrfs_header_nritems(left) > orig_slot) {
@@ -1241,6 +1899,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1241 struct btrfs_disk_key disk_key; 1899 struct btrfs_disk_key disk_key;
1242 1900
1243 btrfs_node_key(right, &disk_key, 0); 1901 btrfs_node_key(right, &disk_key, 0);
1902 tree_mod_log_set_node_key(root->fs_info, parent,
1903 &disk_key, pslot + 1, 0);
1244 btrfs_set_node_key(parent, &disk_key, pslot + 1); 1904 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1245 btrfs_mark_buffer_dirty(parent); 1905 btrfs_mark_buffer_dirty(parent);
1246 1906
@@ -1498,7 +2158,7 @@ static int
1498read_block_for_search(struct btrfs_trans_handle *trans, 2158read_block_for_search(struct btrfs_trans_handle *trans,
1499 struct btrfs_root *root, struct btrfs_path *p, 2159 struct btrfs_root *root, struct btrfs_path *p,
1500 struct extent_buffer **eb_ret, int level, int slot, 2160 struct extent_buffer **eb_ret, int level, int slot,
1501 struct btrfs_key *key) 2161 struct btrfs_key *key, u64 time_seq)
1502{ 2162{
1503 u64 blocknr; 2163 u64 blocknr;
1504 u64 gen; 2164 u64 gen;
@@ -1852,7 +2512,7 @@ cow_done:
1852 } 2512 }
1853 2513
1854 err = read_block_for_search(trans, root, p, 2514 err = read_block_for_search(trans, root, p,
1855 &b, level, slot, key); 2515 &b, level, slot, key, 0);
1856 if (err == -EAGAIN) 2516 if (err == -EAGAIN)
1857 goto again; 2517 goto again;
1858 if (err) { 2518 if (err) {
@@ -1924,6 +2584,115 @@ done:
1924} 2584}
1925 2585
1926/* 2586/*
2587 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2588 * current state of the tree together with the operations recorded in the tree
2589 * modification log to search for the key in a previous version of this tree, as
2590 * denoted by the time_seq parameter.
2591 *
2592 * Naturally, there is no support for insert, delete or cow operations.
2593 *
2594 * The resulting path and return value will be set up as if we called
2595 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2596 */
2597int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2598 struct btrfs_path *p, u64 time_seq)
2599{
2600 struct extent_buffer *b;
2601 int slot;
2602 int ret;
2603 int err;
2604 int level;
2605 int lowest_unlock = 1;
2606 u8 lowest_level = 0;
2607
2608 lowest_level = p->lowest_level;
2609 WARN_ON(p->nodes[0] != NULL);
2610
2611 if (p->search_commit_root) {
2612 BUG_ON(time_seq);
2613 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2614 }
2615
2616again:
2617 b = get_old_root(root, time_seq);
2618 extent_buffer_get(b);
2619 level = btrfs_header_level(b);
2620 btrfs_tree_read_lock(b);
2621 p->locks[level] = BTRFS_READ_LOCK;
2622
2623 while (b) {
2624 level = btrfs_header_level(b);
2625 p->nodes[level] = b;
2626 btrfs_clear_path_blocking(p, NULL, 0);
2627
2628 /*
2629 * we have a lock on b and as long as we aren't changing
2630 * the tree, there is no way to for the items in b to change.
2631 * It is safe to drop the lock on our parent before we
2632 * go through the expensive btree search on b.
2633 */
2634 btrfs_unlock_up_safe(p, level + 1);
2635
2636 ret = bin_search(b, key, level, &slot);
2637
2638 if (level != 0) {
2639 int dec = 0;
2640 if (ret && slot > 0) {
2641 dec = 1;
2642 slot -= 1;
2643 }
2644 p->slots[level] = slot;
2645 unlock_up(p, level, lowest_unlock, 0, NULL);
2646
2647 if (level == lowest_level) {
2648 if (dec)
2649 p->slots[level]++;
2650 goto done;
2651 }
2652
2653 err = read_block_for_search(NULL, root, p, &b, level,
2654 slot, key, time_seq);
2655 if (err == -EAGAIN)
2656 goto again;
2657 if (err) {
2658 ret = err;
2659 goto done;
2660 }
2661
2662 level = btrfs_header_level(b);
2663 err = btrfs_try_tree_read_lock(b);
2664 if (!err) {
2665 btrfs_set_path_blocking(p);
2666 btrfs_tree_read_lock(b);
2667 btrfs_clear_path_blocking(p, b,
2668 BTRFS_READ_LOCK);
2669 }
2670 p->locks[level] = BTRFS_READ_LOCK;
2671 p->nodes[level] = b;
2672 b = tree_mod_log_rewind(root->fs_info, b, time_seq);
2673 if (b != p->nodes[level]) {
2674 btrfs_tree_unlock_rw(p->nodes[level],
2675 p->locks[level]);
2676 p->locks[level] = 0;
2677 p->nodes[level] = b;
2678 }
2679 } else {
2680 p->slots[level] = slot;
2681 unlock_up(p, level, lowest_unlock, 0, NULL);
2682 goto done;
2683 }
2684 }
2685 ret = 1;
2686done:
2687 if (!p->leave_spinning)
2688 btrfs_set_path_blocking(p);
2689 if (ret < 0)
2690 btrfs_release_path(p);
2691
2692 return ret;
2693}
2694
2695/*
1927 * adjust the pointers going up the tree, starting at level 2696 * adjust the pointers going up the tree, starting at level
1928 * making sure the right key of each node is points to 'key'. 2697 * making sure the right key of each node is points to 'key'.
1929 * This is used after shifting pointers to the left, so it stops 2698 * This is used after shifting pointers to the left, so it stops
@@ -1943,6 +2712,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
1943 if (!path->nodes[i]) 2712 if (!path->nodes[i])
1944 break; 2713 break;
1945 t = path->nodes[i]; 2714 t = path->nodes[i];
2715 tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
1946 btrfs_set_node_key(t, key, tslot); 2716 btrfs_set_node_key(t, key, tslot);
1947 btrfs_mark_buffer_dirty(path->nodes[i]); 2717 btrfs_mark_buffer_dirty(path->nodes[i]);
1948 if (tslot != 0) 2718 if (tslot != 0)
@@ -2025,12 +2795,16 @@ static int push_node_left(struct btrfs_trans_handle *trans,
2025 } else 2795 } else
2026 push_items = min(src_nritems - 8, push_items); 2796 push_items = min(src_nritems - 8, push_items);
2027 2797
2798 tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
2799 push_items);
2028 copy_extent_buffer(dst, src, 2800 copy_extent_buffer(dst, src,
2029 btrfs_node_key_ptr_offset(dst_nritems), 2801 btrfs_node_key_ptr_offset(dst_nritems),
2030 btrfs_node_key_ptr_offset(0), 2802 btrfs_node_key_ptr_offset(0),
2031 push_items * sizeof(struct btrfs_key_ptr)); 2803 push_items * sizeof(struct btrfs_key_ptr));
2032 2804
2033 if (push_items < src_nritems) { 2805 if (push_items < src_nritems) {
2806 tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
2807 src_nritems - push_items);
2034 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), 2808 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2035 btrfs_node_key_ptr_offset(push_items), 2809 btrfs_node_key_ptr_offset(push_items),
2036 (src_nritems - push_items) * 2810 (src_nritems - push_items) *
@@ -2084,11 +2858,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
2084 if (max_push < push_items) 2858 if (max_push < push_items)
2085 push_items = max_push; 2859 push_items = max_push;
2086 2860
2861 tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
2087 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), 2862 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2088 btrfs_node_key_ptr_offset(0), 2863 btrfs_node_key_ptr_offset(0),
2089 (dst_nritems) * 2864 (dst_nritems) *
2090 sizeof(struct btrfs_key_ptr)); 2865 sizeof(struct btrfs_key_ptr));
2091 2866
2867 tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
2868 src_nritems - push_items, push_items);
2092 copy_extent_buffer(dst, src, 2869 copy_extent_buffer(dst, src,
2093 btrfs_node_key_ptr_offset(0), 2870 btrfs_node_key_ptr_offset(0),
2094 btrfs_node_key_ptr_offset(src_nritems - push_items), 2871 btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -2131,7 +2908,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2131 2908
2132 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2909 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2133 root->root_key.objectid, &lower_key, 2910 root->root_key.objectid, &lower_key,
2134 level, root->node->start, 0, 0); 2911 level, root->node->start, 0);
2135 if (IS_ERR(c)) 2912 if (IS_ERR(c))
2136 return PTR_ERR(c); 2913 return PTR_ERR(c);
2137 2914
@@ -2163,6 +2940,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2163 btrfs_mark_buffer_dirty(c); 2940 btrfs_mark_buffer_dirty(c);
2164 2941
2165 old = root->node; 2942 old = root->node;
2943 tree_mod_log_set_root_pointer(root, c);
2166 rcu_assign_pointer(root->node, c); 2944 rcu_assign_pointer(root->node, c);
2167 2945
2168 /* the super has an extra ref to root->node */ 2946 /* the super has an extra ref to root->node */
@@ -2186,10 +2964,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2186static void insert_ptr(struct btrfs_trans_handle *trans, 2964static void insert_ptr(struct btrfs_trans_handle *trans,
2187 struct btrfs_root *root, struct btrfs_path *path, 2965 struct btrfs_root *root, struct btrfs_path *path,
2188 struct btrfs_disk_key *key, u64 bytenr, 2966 struct btrfs_disk_key *key, u64 bytenr,
2189 int slot, int level) 2967 int slot, int level, int tree_mod_log)
2190{ 2968{
2191 struct extent_buffer *lower; 2969 struct extent_buffer *lower;
2192 int nritems; 2970 int nritems;
2971 int ret;
2193 2972
2194 BUG_ON(!path->nodes[level]); 2973 BUG_ON(!path->nodes[level]);
2195 btrfs_assert_tree_locked(path->nodes[level]); 2974 btrfs_assert_tree_locked(path->nodes[level]);
@@ -2198,11 +2977,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
2198 BUG_ON(slot > nritems); 2977 BUG_ON(slot > nritems);
2199 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); 2978 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
2200 if (slot != nritems) { 2979 if (slot != nritems) {
2980 if (tree_mod_log && level)
2981 tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
2982 slot, nritems - slot);
2201 memmove_extent_buffer(lower, 2983 memmove_extent_buffer(lower,
2202 btrfs_node_key_ptr_offset(slot + 1), 2984 btrfs_node_key_ptr_offset(slot + 1),
2203 btrfs_node_key_ptr_offset(slot), 2985 btrfs_node_key_ptr_offset(slot),
2204 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2986 (nritems - slot) * sizeof(struct btrfs_key_ptr));
2205 } 2987 }
2988 if (tree_mod_log && level) {
2989 ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
2990 MOD_LOG_KEY_ADD);
2991 BUG_ON(ret < 0);
2992 }
2206 btrfs_set_node_key(lower, key, slot); 2993 btrfs_set_node_key(lower, key, slot);
2207 btrfs_set_node_blockptr(lower, slot, bytenr); 2994 btrfs_set_node_blockptr(lower, slot, bytenr);
2208 WARN_ON(trans->transid == 0); 2995 WARN_ON(trans->transid == 0);
@@ -2254,7 +3041,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2254 3041
2255 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3042 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2256 root->root_key.objectid, 3043 root->root_key.objectid,
2257 &disk_key, level, c->start, 0, 0); 3044 &disk_key, level, c->start, 0);
2258 if (IS_ERR(split)) 3045 if (IS_ERR(split))
2259 return PTR_ERR(split); 3046 return PTR_ERR(split);
2260 3047
@@ -2273,7 +3060,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2273 (unsigned long)btrfs_header_chunk_tree_uuid(split), 3060 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2274 BTRFS_UUID_SIZE); 3061 BTRFS_UUID_SIZE);
2275 3062
2276 3063 tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
2277 copy_extent_buffer(split, c, 3064 copy_extent_buffer(split, c,
2278 btrfs_node_key_ptr_offset(0), 3065 btrfs_node_key_ptr_offset(0),
2279 btrfs_node_key_ptr_offset(mid), 3066 btrfs_node_key_ptr_offset(mid),
@@ -2286,7 +3073,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2286 btrfs_mark_buffer_dirty(split); 3073 btrfs_mark_buffer_dirty(split);
2287 3074
2288 insert_ptr(trans, root, path, &disk_key, split->start, 3075 insert_ptr(trans, root, path, &disk_key, split->start,
2289 path->slots[level + 1] + 1, level + 1); 3076 path->slots[level + 1] + 1, level + 1, 1);
2290 3077
2291 if (path->slots[level] >= mid) { 3078 if (path->slots[level] >= mid) {
2292 path->slots[level] -= mid; 3079 path->slots[level] -= mid;
@@ -2823,7 +3610,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
2823 btrfs_set_header_nritems(l, mid); 3610 btrfs_set_header_nritems(l, mid);
2824 btrfs_item_key(right, &disk_key, 0); 3611 btrfs_item_key(right, &disk_key, 0);
2825 insert_ptr(trans, root, path, &disk_key, right->start, 3612 insert_ptr(trans, root, path, &disk_key, right->start,
2826 path->slots[1] + 1, 1); 3613 path->slots[1] + 1, 1, 0);
2827 3614
2828 btrfs_mark_buffer_dirty(right); 3615 btrfs_mark_buffer_dirty(right);
2829 btrfs_mark_buffer_dirty(l); 3616 btrfs_mark_buffer_dirty(l);
@@ -3006,7 +3793,7 @@ again:
3006 3793
3007 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 3794 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
3008 root->root_key.objectid, 3795 root->root_key.objectid,
3009 &disk_key, 0, l->start, 0, 0); 3796 &disk_key, 0, l->start, 0);
3010 if (IS_ERR(right)) 3797 if (IS_ERR(right))
3011 return PTR_ERR(right); 3798 return PTR_ERR(right);
3012 3799
@@ -3030,7 +3817,7 @@ again:
3030 if (mid <= slot) { 3817 if (mid <= slot) {
3031 btrfs_set_header_nritems(right, 0); 3818 btrfs_set_header_nritems(right, 0);
3032 insert_ptr(trans, root, path, &disk_key, right->start, 3819 insert_ptr(trans, root, path, &disk_key, right->start,
3033 path->slots[1] + 1, 1); 3820 path->slots[1] + 1, 1, 0);
3034 btrfs_tree_unlock(path->nodes[0]); 3821 btrfs_tree_unlock(path->nodes[0]);
3035 free_extent_buffer(path->nodes[0]); 3822 free_extent_buffer(path->nodes[0]);
3036 path->nodes[0] = right; 3823 path->nodes[0] = right;
@@ -3039,7 +3826,7 @@ again:
3039 } else { 3826 } else {
3040 btrfs_set_header_nritems(right, 0); 3827 btrfs_set_header_nritems(right, 0);
3041 insert_ptr(trans, root, path, &disk_key, right->start, 3828 insert_ptr(trans, root, path, &disk_key, right->start,
3042 path->slots[1], 1); 3829 path->slots[1], 1, 0);
3043 btrfs_tree_unlock(path->nodes[0]); 3830 btrfs_tree_unlock(path->nodes[0]);
3044 free_extent_buffer(path->nodes[0]); 3831 free_extent_buffer(path->nodes[0]);
3045 path->nodes[0] = right; 3832 path->nodes[0] = right;
@@ -3751,19 +4538,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3751 * empty a node. 4538 * empty a node.
3752 */ 4539 */
3753static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4540static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3754 struct btrfs_path *path, int level, int slot) 4541 struct btrfs_path *path, int level, int slot,
4542 int tree_mod_log)
3755{ 4543{
3756 struct extent_buffer *parent = path->nodes[level]; 4544 struct extent_buffer *parent = path->nodes[level];
3757 u32 nritems; 4545 u32 nritems;
4546 int ret;
3758 4547
3759 nritems = btrfs_header_nritems(parent); 4548 nritems = btrfs_header_nritems(parent);
3760 if (slot != nritems - 1) { 4549 if (slot != nritems - 1) {
4550 if (tree_mod_log && level)
4551 tree_mod_log_eb_move(root->fs_info, parent, slot,
4552 slot + 1, nritems - slot - 1);
3761 memmove_extent_buffer(parent, 4553 memmove_extent_buffer(parent,
3762 btrfs_node_key_ptr_offset(slot), 4554 btrfs_node_key_ptr_offset(slot),
3763 btrfs_node_key_ptr_offset(slot + 1), 4555 btrfs_node_key_ptr_offset(slot + 1),
3764 sizeof(struct btrfs_key_ptr) * 4556 sizeof(struct btrfs_key_ptr) *
3765 (nritems - slot - 1)); 4557 (nritems - slot - 1));
4558 } else if (tree_mod_log && level) {
4559 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4560 MOD_LOG_KEY_REMOVE);
4561 BUG_ON(ret < 0);
3766 } 4562 }
4563
3767 nritems--; 4564 nritems--;
3768 btrfs_set_header_nritems(parent, nritems); 4565 btrfs_set_header_nritems(parent, nritems);
3769 if (nritems == 0 && parent == root->node) { 4566 if (nritems == 0 && parent == root->node) {
@@ -3795,7 +4592,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
3795 struct extent_buffer *leaf) 4592 struct extent_buffer *leaf)
3796{ 4593{
3797 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4594 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
3798 del_ptr(trans, root, path, 1, path->slots[1]); 4595 del_ptr(trans, root, path, 1, path->slots[1], 1);
3799 4596
3800 /* 4597 /*
3801 * btrfs_free_extent is expensive, we want to make sure we 4598 * btrfs_free_extent is expensive, we want to make sure we
@@ -3806,7 +4603,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
3806 root_sub_used(root, leaf->len); 4603 root_sub_used(root, leaf->len);
3807 4604
3808 extent_buffer_get(leaf); 4605 extent_buffer_get(leaf);
3809 btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); 4606 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3810 free_extent_buffer_stale(leaf); 4607 free_extent_buffer_stale(leaf);
3811} 4608}
3812/* 4609/*
@@ -4273,7 +5070,7 @@ again:
4273 next = c; 5070 next = c;
4274 next_rw_lock = path->locks[level]; 5071 next_rw_lock = path->locks[level];
4275 ret = read_block_for_search(NULL, root, path, &next, level, 5072 ret = read_block_for_search(NULL, root, path, &next, level,
4276 slot, &key); 5073 slot, &key, 0);
4277 if (ret == -EAGAIN) 5074 if (ret == -EAGAIN)
4278 goto again; 5075 goto again;
4279 5076
@@ -4310,7 +5107,7 @@ again:
4310 break; 5107 break;
4311 5108
4312 ret = read_block_for_search(NULL, root, path, &next, level, 5109 ret = read_block_for_search(NULL, root, path, &next, level,
4313 0, &key); 5110 0, &key, 0);
4314 if (ret == -EAGAIN) 5111 if (ret == -EAGAIN)
4315 goto again; 5112 goto again;
4316 5113
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1c665ebe47e0..0151ca1ac657 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1140,6 +1140,15 @@ struct btrfs_fs_info {
1140 spinlock_t delayed_iput_lock; 1140 spinlock_t delayed_iput_lock;
1141 struct list_head delayed_iputs; 1141 struct list_head delayed_iputs;
1142 1142
1143 /* this protects tree_mod_seq_list */
1144 spinlock_t tree_mod_seq_lock;
1145 atomic_t tree_mod_seq;
1146 struct list_head tree_mod_seq_list;
1147
1148 /* this protects tree_mod_log */
1149 rwlock_t tree_mod_log_lock;
1150 struct rb_root tree_mod_log;
1151
1143 atomic_t nr_async_submits; 1152 atomic_t nr_async_submits;
1144 atomic_t async_submit_draining; 1153 atomic_t async_submit_draining;
1145 atomic_t nr_async_bios; 1154 atomic_t nr_async_bios;
@@ -2537,11 +2546,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2537 struct btrfs_root *root, u32 blocksize, 2546 struct btrfs_root *root, u32 blocksize,
2538 u64 parent, u64 root_objectid, 2547 u64 parent, u64 root_objectid,
2539 struct btrfs_disk_key *key, int level, 2548 struct btrfs_disk_key *key, int level,
2540 u64 hint, u64 empty_size, int for_cow); 2549 u64 hint, u64 empty_size);
2541void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2550void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
2542 struct btrfs_root *root, 2551 struct btrfs_root *root,
2543 struct extent_buffer *buf, 2552 struct extent_buffer *buf,
2544 u64 parent, int last_ref, int for_cow); 2553 u64 parent, int last_ref);
2545struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2554struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2546 struct btrfs_root *root, 2555 struct btrfs_root *root,
2547 u64 bytenr, u32 blocksize, 2556 u64 bytenr, u32 blocksize,
@@ -2700,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
2700int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root 2709int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2701 *root, struct btrfs_key *key, struct btrfs_path *p, int 2710 *root, struct btrfs_key *key, struct btrfs_path *p, int
2702 ins_len, int cow); 2711 ins_len, int cow);
2712int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2713 struct btrfs_path *p, u64 time_seq);
2703int btrfs_realloc_node(struct btrfs_trans_handle *trans, 2714int btrfs_realloc_node(struct btrfs_trans_handle *trans,
2704 struct btrfs_root *root, struct extent_buffer *parent, 2715 struct btrfs_root *root, struct extent_buffer *parent,
2705 int start_slot, int cache_only, u64 *last_ret, 2716 int start_slot, int cache_only, u64 *last_ret,
@@ -3139,4 +3150,23 @@ void btrfs_reada_detach(void *handle);
3139int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 3150int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
3140 u64 start, int err); 3151 u64 start, int err);
3141 3152
3153/* delayed seq elem */
3154struct seq_list {
3155 struct list_head list;
3156 u64 seq;
3157 u32 flags;
3158};
3159
3160void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
3161 struct seq_list *elem);
3162void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
3163 struct seq_list *elem);
3164
3165static inline int is_fstree(u64 rootid)
3166{
3167 if (rootid == BTRFS_FS_TREE_OBJECTID ||
3168 (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
3169 return 1;
3170 return 0;
3171}
3142#endif 3172#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 69f22e3ab3bc..13ae7b04790e 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
525 ref->is_head = 0; 525 ref->is_head = 0;
526 ref->in_tree = 1; 526 ref->in_tree = 1;
527 527
528 if (need_ref_seq(for_cow, ref_root)) 528 if (is_fstree(ref_root))
529 seq = inc_delayed_seq(delayed_refs); 529 seq = inc_delayed_seq(delayed_refs);
530 ref->seq = seq; 530 ref->seq = seq;
531 531
@@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
584 ref->is_head = 0; 584 ref->is_head = 0;
585 ref->in_tree = 1; 585 ref->in_tree = 1;
586 586
587 if (need_ref_seq(for_cow, ref_root)) 587 if (is_fstree(ref_root))
588 seq = inc_delayed_seq(delayed_refs); 588 seq = inc_delayed_seq(delayed_refs);
589 ref->seq = seq; 589 ref->seq = seq;
590 590
@@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
658 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, 658 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
659 num_bytes, parent, ref_root, level, action, 659 num_bytes, parent, ref_root, level, action,
660 for_cow); 660 for_cow);
661 if (!need_ref_seq(for_cow, ref_root) && 661 if (!is_fstree(ref_root) &&
662 waitqueue_active(&delayed_refs->seq_wait)) 662 waitqueue_active(&delayed_refs->seq_wait))
663 wake_up(&delayed_refs->seq_wait); 663 wake_up(&delayed_refs->seq_wait);
664 spin_unlock(&delayed_refs->lock); 664 spin_unlock(&delayed_refs->lock);
665
665 return 0; 666 return 0;
666} 667}
667 668
@@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
706 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, 707 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
707 num_bytes, parent, ref_root, owner, offset, 708 num_bytes, parent, ref_root, owner, offset,
708 action, for_cow); 709 action, for_cow);
709 if (!need_ref_seq(for_cow, ref_root) && 710 if (!is_fstree(ref_root) &&
710 waitqueue_active(&delayed_refs->seq_wait)) 711 waitqueue_active(&delayed_refs->seq_wait))
711 wake_up(&delayed_refs->seq_wait); 712 wake_up(&delayed_refs->seq_wait);
712 spin_unlock(&delayed_refs->lock); 713 spin_unlock(&delayed_refs->lock);
714
713 return 0; 715 return 0;
714} 716}
715 717
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d8f244d94925..413927fb9957 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
196 struct list_head *cluster, u64 search_start); 196 struct list_head *cluster, u64 search_start);
197 197
198struct seq_list {
199 struct list_head list;
200 u64 seq;
201};
202
203static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) 198static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
204{ 199{
205 assert_spin_locked(&delayed_refs->lock); 200 assert_spin_locked(&delayed_refs->lock);
@@ -230,25 +225,6 @@ int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
230 u64 seq); 225 u64 seq);
231 226
232/* 227/*
233 * delayed refs with a ref_seq > 0 must be held back during backref walking.
234 * this only applies to items in one of the fs-trees. for_cow items never need
235 * to be held back, so they won't get a ref_seq number.
236 */
237static inline int need_ref_seq(int for_cow, u64 rootid)
238{
239 if (for_cow)
240 return 0;
241
242 if (rootid == BTRFS_FS_TREE_OBJECTID)
243 return 1;
244
245 if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
246 return 1;
247
248 return 0;
249}
250
251/*
252 * a node might live in a head or a regular ref, this lets you 228 * a node might live in a head or a regular ref, this lets you
253 * test for the proper type to use. 229 * test for the proper type to use.
254 */ 230 */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0d49e21b0b1..b99d5127ba18 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1252,7 +1252,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1252 1252
1253 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1253 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1254 BTRFS_TREE_LOG_OBJECTID, NULL, 1254 BTRFS_TREE_LOG_OBJECTID, NULL,
1255 0, 0, 0, 0); 1255 0, 0, 0);
1256 if (IS_ERR(leaf)) { 1256 if (IS_ERR(leaf)) {
1257 kfree(root); 1257 kfree(root);
1258 return ERR_CAST(leaf); 1258 return ERR_CAST(leaf);
@@ -1914,11 +1914,14 @@ int open_ctree(struct super_block *sb,
1914 spin_lock_init(&fs_info->delayed_iput_lock); 1914 spin_lock_init(&fs_info->delayed_iput_lock);
1915 spin_lock_init(&fs_info->defrag_inodes_lock); 1915 spin_lock_init(&fs_info->defrag_inodes_lock);
1916 spin_lock_init(&fs_info->free_chunk_lock); 1916 spin_lock_init(&fs_info->free_chunk_lock);
1917 spin_lock_init(&fs_info->tree_mod_seq_lock);
1918 rwlock_init(&fs_info->tree_mod_log_lock);
1917 mutex_init(&fs_info->reloc_mutex); 1919 mutex_init(&fs_info->reloc_mutex);
1918 1920
1919 init_completion(&fs_info->kobj_unregister); 1921 init_completion(&fs_info->kobj_unregister);
1920 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1922 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1921 INIT_LIST_HEAD(&fs_info->space_info); 1923 INIT_LIST_HEAD(&fs_info->space_info);
1924 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
1922 btrfs_mapping_init(&fs_info->mapping_tree); 1925 btrfs_mapping_init(&fs_info->mapping_tree);
1923 btrfs_init_block_rsv(&fs_info->global_block_rsv); 1926 btrfs_init_block_rsv(&fs_info->global_block_rsv);
1924 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); 1927 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
@@ -1931,12 +1934,14 @@ int open_ctree(struct super_block *sb,
1931 atomic_set(&fs_info->async_submit_draining, 0); 1934 atomic_set(&fs_info->async_submit_draining, 0);
1932 atomic_set(&fs_info->nr_async_bios, 0); 1935 atomic_set(&fs_info->nr_async_bios, 0);
1933 atomic_set(&fs_info->defrag_running, 0); 1936 atomic_set(&fs_info->defrag_running, 0);
1937 atomic_set(&fs_info->tree_mod_seq, 0);
1934 fs_info->sb = sb; 1938 fs_info->sb = sb;
1935 fs_info->max_inline = 8192 * 1024; 1939 fs_info->max_inline = 8192 * 1024;
1936 fs_info->metadata_ratio = 0; 1940 fs_info->metadata_ratio = 0;
1937 fs_info->defrag_inodes = RB_ROOT; 1941 fs_info->defrag_inodes = RB_ROOT;
1938 fs_info->trans_no_join = 0; 1942 fs_info->trans_no_join = 0;
1939 fs_info->free_chunk_space = 0; 1943 fs_info->free_chunk_space = 0;
1944 fs_info->tree_mod_log = RB_ROOT;
1940 1945
1941 /* readahead state */ 1946 /* readahead state */
1942 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); 1947 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1902726fa70a..4b5a1e1bdefb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5218,7 +5218,7 @@ out:
5218void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5218void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5219 struct btrfs_root *root, 5219 struct btrfs_root *root,
5220 struct extent_buffer *buf, 5220 struct extent_buffer *buf,
5221 u64 parent, int last_ref, int for_cow) 5221 u64 parent, int last_ref)
5222{ 5222{
5223 struct btrfs_block_group_cache *cache = NULL; 5223 struct btrfs_block_group_cache *cache = NULL;
5224 int ret; 5224 int ret;
@@ -5228,7 +5228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5228 buf->start, buf->len, 5228 buf->start, buf->len,
5229 parent, root->root_key.objectid, 5229 parent, root->root_key.objectid,
5230 btrfs_header_level(buf), 5230 btrfs_header_level(buf),
5231 BTRFS_DROP_DELAYED_REF, NULL, for_cow); 5231 BTRFS_DROP_DELAYED_REF, NULL, 0);
5232 BUG_ON(ret); /* -ENOMEM */ 5232 BUG_ON(ret); /* -ENOMEM */
5233 } 5233 }
5234 5234
@@ -6250,7 +6250,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6250 struct btrfs_root *root, u32 blocksize, 6250 struct btrfs_root *root, u32 blocksize,
6251 u64 parent, u64 root_objectid, 6251 u64 parent, u64 root_objectid,
6252 struct btrfs_disk_key *key, int level, 6252 struct btrfs_disk_key *key, int level,
6253 u64 hint, u64 empty_size, int for_cow) 6253 u64 hint, u64 empty_size)
6254{ 6254{
6255 struct btrfs_key ins; 6255 struct btrfs_key ins;
6256 struct btrfs_block_rsv *block_rsv; 6256 struct btrfs_block_rsv *block_rsv;
@@ -6298,7 +6298,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6298 ins.objectid, 6298 ins.objectid,
6299 ins.offset, parent, root_objectid, 6299 ins.offset, parent, root_objectid,
6300 level, BTRFS_ADD_DELAYED_EXTENT, 6300 level, BTRFS_ADD_DELAYED_EXTENT,
6301 extent_op, for_cow); 6301 extent_op, 0);
6302 BUG_ON(ret); /* -ENOMEM */ 6302 BUG_ON(ret); /* -ENOMEM */
6303 } 6303 }
6304 return buf; 6304 return buf;
@@ -6716,7 +6716,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6716 btrfs_header_owner(path->nodes[level + 1])); 6716 btrfs_header_owner(path->nodes[level + 1]));
6717 } 6717 }
6718 6718
6719 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); 6719 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6720out: 6720out:
6721 wc->refs[level] = 0; 6721 wc->refs[level] = 0;
6722 wc->flags[level] = 0; 6722 wc->flags[level] = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b3692c1373aa..2c8f7b204617 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3924,6 +3924,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3924 eb->start = start; 3924 eb->start = start;
3925 eb->len = len; 3925 eb->len = len;
3926 eb->tree = tree; 3926 eb->tree = tree;
3927 eb->bflags = 0;
3927 rwlock_init(&eb->lock); 3928 rwlock_init(&eb->lock);
3928 atomic_set(&eb->write_locks, 0); 3929 atomic_set(&eb->write_locks, 0);
3929 atomic_set(&eb->read_locks, 0); 3930 atomic_set(&eb->read_locks, 0);
@@ -3961,6 +3962,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3961 return eb; 3962 return eb;
3962} 3963}
3963 3964
3965struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
3966{
3967 unsigned long i;
3968 struct page *p;
3969 struct extent_buffer *new;
3970 unsigned long num_pages = num_extent_pages(src->start, src->len);
3971
3972 new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
3973 if (new == NULL)
3974 return NULL;
3975
3976 for (i = 0; i < num_pages; i++) {
3977 p = alloc_page(GFP_ATOMIC);
3978 BUG_ON(!p);
3979 attach_extent_buffer_page(new, p);
3980 WARN_ON(PageDirty(p));
3981 SetPageUptodate(p);
3982 new->pages[i] = p;
3983 }
3984
3985 copy_extent_buffer(new, src, 0, 0, src->len);
3986 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
3987 set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
3988
3989 return new;
3990}
3991
3992struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
3993{
3994 struct extent_buffer *eb;
3995 unsigned long num_pages = num_extent_pages(0, len);
3996 unsigned long i;
3997
3998 eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
3999 if (!eb)
4000 return NULL;
4001
4002 for (i = 0; i < num_pages; i++) {
4003 eb->pages[i] = alloc_page(GFP_ATOMIC);
4004 if (!eb->pages[i])
4005 goto err;
4006 }
4007 set_extent_buffer_uptodate(eb);
4008 btrfs_set_header_nritems(eb, 0);
4009 set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4010
4011 return eb;
4012err:
4013 for (i--; i > 0; i--)
4014 __free_page(eb->pages[i]);
4015 __free_extent_buffer(eb);
4016 return NULL;
4017}
4018
3964static int extent_buffer_under_io(struct extent_buffer *eb) 4019static int extent_buffer_under_io(struct extent_buffer *eb)
3965{ 4020{
3966 return (atomic_read(&eb->io_pages) || 4021 return (atomic_read(&eb->io_pages) ||
@@ -3977,6 +4032,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3977 unsigned long index; 4032 unsigned long index;
3978 unsigned long num_pages; 4033 unsigned long num_pages;
3979 struct page *page; 4034 struct page *page;
4035 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
3980 4036
3981 BUG_ON(extent_buffer_under_io(eb)); 4037 BUG_ON(extent_buffer_under_io(eb));
3982 4038
@@ -3988,7 +4044,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3988 do { 4044 do {
3989 index--; 4045 index--;
3990 page = extent_buffer_page(eb, index); 4046 page = extent_buffer_page(eb, index);
3991 if (page) { 4047 if (page && mapped) {
3992 spin_lock(&page->mapping->private_lock); 4048 spin_lock(&page->mapping->private_lock);
3993 /* 4049 /*
3994 * We do this since we'll remove the pages after we've 4050 * We do this since we'll remove the pages after we've
@@ -4013,6 +4069,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4013 } 4069 }
4014 spin_unlock(&page->mapping->private_lock); 4070 spin_unlock(&page->mapping->private_lock);
4015 4071
4072 }
4073 if (page) {
4016 /* One for when we alloced the page */ 4074 /* One for when we alloced the page */
4017 page_cache_release(page); 4075 page_cache_release(page);
4018 } 4076 }
@@ -4231,14 +4289,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4231{ 4289{
4232 WARN_ON(atomic_read(&eb->refs) == 0); 4290 WARN_ON(atomic_read(&eb->refs) == 0);
4233 if (atomic_dec_and_test(&eb->refs)) { 4291 if (atomic_dec_and_test(&eb->refs)) {
4234 struct extent_io_tree *tree = eb->tree; 4292 if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
4293 spin_unlock(&eb->refs_lock);
4294 } else {
4295 struct extent_io_tree *tree = eb->tree;
4235 4296
4236 spin_unlock(&eb->refs_lock); 4297 spin_unlock(&eb->refs_lock);
4237 4298
4238 spin_lock(&tree->buffer_lock); 4299 spin_lock(&tree->buffer_lock);
4239 radix_tree_delete(&tree->buffer, 4300 radix_tree_delete(&tree->buffer,
4240 eb->start >> PAGE_CACHE_SHIFT); 4301 eb->start >> PAGE_CACHE_SHIFT);
4241 spin_unlock(&tree->buffer_lock); 4302 spin_unlock(&tree->buffer_lock);
4303 }
4242 4304
4243 /* Should be safe to release our pages at this point */ 4305 /* Should be safe to release our pages at this point */
4244 btrfs_release_extent_buffer_page(eb, 0); 4306 btrfs_release_extent_buffer_page(eb, 0);
@@ -4256,6 +4318,10 @@ void free_extent_buffer(struct extent_buffer *eb)
4256 4318
4257 spin_lock(&eb->refs_lock); 4319 spin_lock(&eb->refs_lock);
4258 if (atomic_read(&eb->refs) == 2 && 4320 if (atomic_read(&eb->refs) == 2 &&
4321 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
4322 atomic_dec(&eb->refs);
4323
4324 if (atomic_read(&eb->refs) == 2 &&
4259 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 4325 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
4260 !extent_buffer_under_io(eb) && 4326 !extent_buffer_under_io(eb) &&
4261 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4327 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4d8124b64577..25900af5b15d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -39,6 +39,7 @@
39#define EXTENT_BUFFER_STALE 6 39#define EXTENT_BUFFER_STALE 6
40#define EXTENT_BUFFER_WRITEBACK 7 40#define EXTENT_BUFFER_WRITEBACK 7
41#define EXTENT_BUFFER_IOERR 8 41#define EXTENT_BUFFER_IOERR 8
42#define EXTENT_BUFFER_DUMMY 9
42 43
43/* these are flags for extent_clear_unlock_delalloc */ 44/* these are flags for extent_clear_unlock_delalloc */
44#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 45#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -264,6 +265,8 @@ void set_page_extent_mapped(struct page *page);
264 265
265struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 266struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
266 u64 start, unsigned long len); 267 u64 start, unsigned long len);
268struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
269struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
267struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 270struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
268 u64 start, unsigned long len); 271 u64 start, unsigned long len);
269void free_extent_buffer(struct extent_buffer *eb); 272void free_extent_buffer(struct extent_buffer *eb);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0f8c354c4c76..24b776c08d99 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -368,7 +368,7 @@ static noinline int create_subvol(struct btrfs_root *root,
368 return PTR_ERR(trans); 368 return PTR_ERR(trans);
369 369
370 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 370 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
371 0, objectid, NULL, 0, 0, 0, 0); 371 0, objectid, NULL, 0, 0, 0);
372 if (IS_ERR(leaf)) { 372 if (IS_ERR(leaf)) {
373 ret = PTR_ERR(leaf); 373 ret = PTR_ERR(leaf);
374 goto fail; 374 goto fail;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 82b03afcbd92..1791c6e3d834 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -56,48 +56,49 @@ static noinline void switch_commit_root(struct btrfs_root *root)
56static noinline int join_transaction(struct btrfs_root *root, int nofail) 56static noinline int join_transaction(struct btrfs_root *root, int nofail)
57{ 57{
58 struct btrfs_transaction *cur_trans; 58 struct btrfs_transaction *cur_trans;
59 struct btrfs_fs_info *fs_info = root->fs_info;
59 60
60 spin_lock(&root->fs_info->trans_lock); 61 spin_lock(&fs_info->trans_lock);
61loop: 62loop:
62 /* The file system has been taken offline. No new transactions. */ 63 /* The file system has been taken offline. No new transactions. */
63 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 64 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
64 spin_unlock(&root->fs_info->trans_lock); 65 spin_unlock(&fs_info->trans_lock);
65 return -EROFS; 66 return -EROFS;
66 } 67 }
67 68
68 if (root->fs_info->trans_no_join) { 69 if (fs_info->trans_no_join) {
69 if (!nofail) { 70 if (!nofail) {
70 spin_unlock(&root->fs_info->trans_lock); 71 spin_unlock(&fs_info->trans_lock);
71 return -EBUSY; 72 return -EBUSY;
72 } 73 }
73 } 74 }
74 75
75 cur_trans = root->fs_info->running_transaction; 76 cur_trans = fs_info->running_transaction;
76 if (cur_trans) { 77 if (cur_trans) {
77 if (cur_trans->aborted) { 78 if (cur_trans->aborted) {
78 spin_unlock(&root->fs_info->trans_lock); 79 spin_unlock(&fs_info->trans_lock);
79 return cur_trans->aborted; 80 return cur_trans->aborted;
80 } 81 }
81 atomic_inc(&cur_trans->use_count); 82 atomic_inc(&cur_trans->use_count);
82 atomic_inc(&cur_trans->num_writers); 83 atomic_inc(&cur_trans->num_writers);
83 cur_trans->num_joined++; 84 cur_trans->num_joined++;
84 spin_unlock(&root->fs_info->trans_lock); 85 spin_unlock(&fs_info->trans_lock);
85 return 0; 86 return 0;
86 } 87 }
87 spin_unlock(&root->fs_info->trans_lock); 88 spin_unlock(&fs_info->trans_lock);
88 89
89 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 90 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
90 if (!cur_trans) 91 if (!cur_trans)
91 return -ENOMEM; 92 return -ENOMEM;
92 93
93 spin_lock(&root->fs_info->trans_lock); 94 spin_lock(&fs_info->trans_lock);
94 if (root->fs_info->running_transaction) { 95 if (fs_info->running_transaction) {
95 /* 96 /*
96 * someone started a transaction after we unlocked. Make sure 97 * someone started a transaction after we unlocked. Make sure
97 * to redo the trans_no_join checks above 98 * to redo the trans_no_join checks above
98 */ 99 */
99 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 100 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
100 cur_trans = root->fs_info->running_transaction; 101 cur_trans = fs_info->running_transaction;
101 goto loop; 102 goto loop;
102 } 103 }
103 104
@@ -122,20 +123,38 @@ loop:
122 cur_trans->delayed_refs.flushing = 0; 123 cur_trans->delayed_refs.flushing = 0;
123 cur_trans->delayed_refs.run_delayed_start = 0; 124 cur_trans->delayed_refs.run_delayed_start = 0;
124 cur_trans->delayed_refs.seq = 1; 125 cur_trans->delayed_refs.seq = 1;
126
127 /*
128 * although the tree mod log is per file system and not per transaction,
129 * the log must never go across transaction boundaries.
130 */
131 smp_mb();
132 if (!list_empty(&fs_info->tree_mod_seq_list)) {
133 printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
134 "creating a fresh transaction\n");
135 WARN_ON(1);
136 }
137 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
138 printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
139 "creating a fresh transaction\n");
140 WARN_ON(1);
141 }
142 atomic_set(&fs_info->tree_mod_seq, 0);
143
125 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); 144 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
126 spin_lock_init(&cur_trans->commit_lock); 145 spin_lock_init(&cur_trans->commit_lock);
127 spin_lock_init(&cur_trans->delayed_refs.lock); 146 spin_lock_init(&cur_trans->delayed_refs.lock);
128 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); 147 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
129 148
130 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 149 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
131 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 150 list_add_tail(&cur_trans->list, &fs_info->trans_list);
132 extent_io_tree_init(&cur_trans->dirty_pages, 151 extent_io_tree_init(&cur_trans->dirty_pages,
133 root->fs_info->btree_inode->i_mapping); 152 fs_info->btree_inode->i_mapping);
134 root->fs_info->generation++; 153 fs_info->generation++;
135 cur_trans->transid = root->fs_info->generation; 154 cur_trans->transid = fs_info->generation;
136 root->fs_info->running_transaction = cur_trans; 155 fs_info->running_transaction = cur_trans;
137 cur_trans->aborted = 0; 156 cur_trans->aborted = 0;
138 spin_unlock(&root->fs_info->trans_lock); 157 spin_unlock(&fs_info->trans_lock);
139 158
140 return 0; 159 return 0;
141} 160}
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index ad993bc2df93..ab942f46b3dd 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -23,9 +23,9 @@
23 * 23 *
24 * ulist = ulist_alloc(); 24 * ulist = ulist_alloc();
25 * ulist_add(ulist, root); 25 * ulist_add(ulist, root);
26 * elem = NULL; 26 * ULIST_ITER_INIT(&uiter);
27 * 27 *
28 * while ((elem = ulist_next(ulist, elem)) { 28 * while ((elem = ulist_next(ulist, &uiter)) {
29 * for (all child nodes n in elem) 29 * for (all child nodes n in elem)
30 * ulist_add(ulist, n); 30 * ulist_add(ulist, n);
31 * do something useful with the node; 31 * do something useful with the node;
@@ -146,11 +146,20 @@ EXPORT_SYMBOL(ulist_free);
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
147 gfp_t gfp_mask) 147 gfp_t gfp_mask)
148{ 148{
149 return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
150}
151
152int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
153 unsigned long *old_aux, gfp_t gfp_mask)
154{
149 int i; 155 int i;
150 156
151 for (i = 0; i < ulist->nnodes; ++i) { 157 for (i = 0; i < ulist->nnodes; ++i) {
152 if (ulist->nodes[i].val == val) 158 if (ulist->nodes[i].val == val) {
159 if (old_aux)
160 *old_aux = ulist->nodes[i].aux;
153 return 0; 161 return 0;
162 }
154 } 163 }
155 164
156 if (ulist->nnodes >= ulist->nodes_alloced) { 165 if (ulist->nnodes >= ulist->nodes_alloced) {
@@ -188,33 +197,26 @@ EXPORT_SYMBOL(ulist_add);
188/** 197/**
189 * ulist_next - iterate ulist 198 * ulist_next - iterate ulist
190 * @ulist: ulist to iterate 199 * @ulist: ulist to iterate
191 * @prev: previously returned element or %NULL to start iteration 200 * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator)
192 * 201 *
193 * Note: locking must be provided by the caller. In case of rwlocks only read 202 * Note: locking must be provided by the caller. In case of rwlocks only read
194 * locking is needed 203 * locking is needed
195 * 204 *
196 * This function is used to iterate an ulist. The iteration is started with 205 * This function is used to iterate an ulist.
197 * @prev = %NULL. It returns the next element from the ulist or %NULL when the 206 * It returns the next element from the ulist or %NULL when the
198 * end is reached. No guarantee is made with respect to the order in which 207 * end is reached. No guarantee is made with respect to the order in which
199 * the elements are returned. They might neither be returned in order of 208 * the elements are returned. They might neither be returned in order of
200 * addition nor in ascending order. 209 * addition nor in ascending order.
201 * It is allowed to call ulist_add during an enumeration. Newly added items 210 * It is allowed to call ulist_add during an enumeration. Newly added items
202 * are guaranteed to show up in the running enumeration. 211 * are guaranteed to show up in the running enumeration.
203 */ 212 */
204struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) 213struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
205{ 214{
206 int next;
207
208 if (ulist->nnodes == 0) 215 if (ulist->nnodes == 0)
209 return NULL; 216 return NULL;
210 217 if (uiter->i < 0 || uiter->i >= ulist->nnodes)
211 if (!prev)
212 return &ulist->nodes[0];
213
214 next = (prev - ulist->nodes) + 1;
215 if (next < 0 || next >= ulist->nnodes)
216 return NULL; 218 return NULL;
217 219
218 return &ulist->nodes[next]; 220 return &ulist->nodes[uiter->i++];
219} 221}
220EXPORT_SYMBOL(ulist_next); 222EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 6568c3527732..21bdc8ec8130 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -24,6 +24,10 @@
24 */ 24 */
25#define ULIST_SIZE 16 25#define ULIST_SIZE 16
26 26
27struct ulist_iterator {
28 int i;
29};
30
27/* 31/*
28 * element of the list 32 * element of the list
29 */ 33 */
@@ -61,7 +65,13 @@ void ulist_fini(struct ulist *ulist);
61void ulist_reinit(struct ulist *ulist); 65void ulist_reinit(struct ulist *ulist);
62struct ulist *ulist_alloc(gfp_t gfp_mask); 66struct ulist *ulist_alloc(gfp_t gfp_mask);
63void ulist_free(struct ulist *ulist); 67void ulist_free(struct ulist *ulist);
64int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, gfp_t gfp_mask); 68int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
65struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); 69 gfp_t gfp_mask);
70int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
71 unsigned long *old_aux, gfp_t gfp_mask);
72struct ulist_node *ulist_next(struct ulist *ulist,
73 struct ulist_iterator *uiter);
74
75#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0)
66 76
67#endif 77#endif