author    Linus Torvalds <torvalds@linux-foundation.org>  2012-01-17 18:49:54 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-01-17 18:49:54 -0500
commit    f9156c7288e2d11501ded4d7fe6d9a3a41ee4057 (patch)
tree      7bd26fc9a111c6af1601ecd2d1b0ab60da32f3f0
parent    67175b855bfd6ed95ffeff95532173c07de6432d (diff)
parent    96bdc7dc61fb1b1e8e858dafb13abee8482ba064 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (62 commits)
  Btrfs: use larger system chunks
  Btrfs: add a delalloc mutex to inodes for delalloc reservations
  Btrfs: space leak tracepoints
  Btrfs: protect orphan block rsv with spin_lock
  Btrfs: add allocator tracepoints
  Btrfs: don't call btrfs_throttle in file write
  Btrfs: release space on error in page_mkwrite
  Btrfs: fix btrfsck error 400 when truncating a compressed
  Btrfs: do not use btrfs_end_transaction_throttle everywhere
  Btrfs: add balance progress reporting
  Btrfs: allow for resuming restriper after it was paused
  Btrfs: allow for canceling restriper
  Btrfs: allow for pausing restriper
  Btrfs: add skip_balance mount option
  Btrfs: recover balance on mount
  Btrfs: save balance parameters to disk
  Btrfs: soft profile changing mode (aka soft convert)
  Btrfs: implement online profile changing
  Btrfs: do not reduce profile in do_chunk_alloc()
  Btrfs: virtual address space subset filter
  ...

Fix up trivial conflict in fs/btrfs/ioctl.c due to the use of the new
mnt_drop_write_file() helper.
-rw-r--r--  fs/btrfs/Kconfig              |   19
-rw-r--r--  fs/btrfs/Makefile             |    3
-rw-r--r--  fs/btrfs/backref.c            | 1131
-rw-r--r--  fs/btrfs/backref.h            |    5
-rw-r--r--  fs/btrfs/btrfs_inode.h        |    3
-rw-r--r--  fs/btrfs/check-integrity.c    | 3068
-rw-r--r--  fs/btrfs/check-integrity.h    |   36
-rw-r--r--  fs/btrfs/ctree.c              |   42
-rw-r--r--  fs/btrfs/ctree.h              |  237
-rw-r--r--  fs/btrfs/delayed-inode.c      |   45
-rw-r--r--  fs/btrfs/delayed-ref.c        |  153
-rw-r--r--  fs/btrfs/delayed-ref.h        |  104
-rw-r--r--  fs/btrfs/disk-io.c            |   49
-rw-r--r--  fs/btrfs/extent-tree.c        |  465
-rw-r--r--  fs/btrfs/extent_io.c          |    6
-rw-r--r--  fs/btrfs/extent_io.h          |    2
-rw-r--r--  fs/btrfs/file.c               |   11
-rw-r--r--  fs/btrfs/free-space-cache.c   |  417
-rw-r--r--  fs/btrfs/inode-map.c          |    4
-rw-r--r--  fs/btrfs/inode.c              |   66
-rw-r--r--  fs/btrfs/ioctl.c              |  261
-rw-r--r--  fs/btrfs/ioctl.h              |   54
-rw-r--r--  fs/btrfs/locking.c            |   53
-rw-r--r--  fs/btrfs/relocation.c         |   20
-rw-r--r--  fs/btrfs/scrub.c              |   12
-rw-r--r--  fs/btrfs/super.c              |   47
-rw-r--r--  fs/btrfs/transaction.c        |   20
-rw-r--r--  fs/btrfs/tree-log.c           |    2
-rw-r--r--  fs/btrfs/ulist.c              |  220
-rw-r--r--  fs/btrfs/ulist.h              |   68
-rw-r--r--  fs/btrfs/volumes.c            |  988
-rw-r--r--  fs/btrfs/volumes.h            |   54
-rw-r--r--  fs/btrfs/xattr.c              |    2
-rw-r--r--  include/trace/events/btrfs.h  |  203
34 files changed, 6949 insertions, 921 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ecb9fd3be143..d33f01c08b60 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL
 	  Linux website <http://acl.bestbits.at/>.
 
 	  If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+	bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+	depends on BTRFS_FS
+	help
+	  Adds code that examines all block write requests (including
+	  writes of the super block). The goal is to verify that the
+	  state of the filesystem on disk is always consistent, i.e.,
+	  after a power-loss or kernel panic event the filesystem is
+	  in a consistent state.
+
+	  If the integrity check tool is included and activated in
+	  the mount options, plenty of kernel memory is used, and
+	  plenty of additional CPU cycles are spent. Enabling this
+	  functionality is not intended for normal use.
+
+	  In most cases, unless you are a btrfs developer who needs
+	  to verify the integrity of (super)-block write requests
+	  during the run of a regression test, say N
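
(Once the option is compiled in, the checker is armed per mount. The new
check-integrity.c further down documents the supported mount options:

	mount /dev/sdb1 /mnt -o check_int
	mount /dev/sdb1 /mnt -o check_int_data
	mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
)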
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index c0ddfd29c5e5..0c4fa2befae7 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o
+	   reada.o backref.o ulist.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 22c64fff1bd5..b9a843226de8 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -19,18 +19,789 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
 
-struct __data_ref {
+/*
+ * this structure records all encountered refs on the way up to the root
+ */
+struct __prelim_ref {
 	struct list_head list;
-	u64 inum;
-	u64 root;
-	u64 extent_data_item_offset;
+	u64 root_id;
+	struct btrfs_key key;
+	int level;
+	int count;
+	u64 parent;
+	u64 wanted_disk_byte;
 };
 
-struct __shared_ref {
-	struct list_head list;
+static int __add_prelim_ref(struct list_head *head, u64 root_id,
+			    struct btrfs_key *key, int level, u64 parent,
+			    u64 wanted_disk_byte, int count)
+{
+	struct __prelim_ref *ref;
+
+	/* in case we're adding delayed refs, we're holding the refs spinlock */
+	ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
+	if (!ref)
+		return -ENOMEM;
+
+	ref->root_id = root_id;
+	if (key)
+		ref->key = *key;
+	else
+		memset(&ref->key, 0, sizeof(ref->key));
+
+	ref->level = level;
+	ref->count = count;
+	ref->parent = parent;
+	ref->wanted_disk_byte = wanted_disk_byte;
+	list_add_tail(&ref->list, head);
+
+	return 0;
+}
+
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+			   struct ulist *parents,
+			   struct extent_buffer *eb, int level,
+			   u64 wanted_objectid, u64 wanted_disk_byte)
+{
+	int ret;
+	int slot;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
 	u64 disk_byte;
-};
+
+add_parent:
+	ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+	if (ret < 0)
+		return ret;
+
+	if (level != 0)
+		return 0;
+
+	/*
+	 * if the current leaf is full with EXTENT_DATA items, we must
+	 * check the next one if that holds a reference as well.
+	 * ref->count cannot be used to skip this check.
+	 * repeat this until we don't find any additional EXTENT_DATA items.
+	 */
+	while (1) {
+		ret = btrfs_next_leaf(root, path);
+		if (ret < 0)
+			return ret;
+		if (ret)
+			return 0;
+
+		eb = path->nodes[0];
+		for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
+			btrfs_item_key_to_cpu(eb, &key, slot);
+			if (key.objectid != wanted_objectid ||
+			    key.type != BTRFS_EXTENT_DATA_KEY)
+				return 0;
+			fi = btrfs_item_ptr(eb, slot,
+					    struct btrfs_file_extent_item);
+			disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+			if (disk_byte == wanted_disk_byte)
+				goto add_parent;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ */
+static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+				  struct __prelim_ref *ref,
+				  struct ulist *parents)
+{
+	struct btrfs_path *path;
+	struct btrfs_root *root;
+	struct btrfs_key root_key;
+	struct btrfs_key key = {0};
+	struct extent_buffer *eb;
+	int ret = 0;
+	int root_level;
+	int level = ref->level;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	root_key.objectid = ref->root_id;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = (u64)-1;
+	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		goto out;
+	}
+
+	rcu_read_lock();
+	root_level = btrfs_header_level(root->node);
+	rcu_read_unlock();
+
+	if (root_level + 1 == level)
+		goto out;
+
+	path->lowest_level = level;
+	ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+	pr_debug("search slot in root %llu (level %d, ref count %d) returned "
+		 "%d for key (%llu %u %llu)\n",
+		 (unsigned long long)ref->root_id, level, ref->count, ret,
+		 (unsigned long long)ref->key.objectid, ref->key.type,
+		 (unsigned long long)ref->key.offset);
+	if (ret < 0)
+		goto out;
+
+	eb = path->nodes[level];
+	if (!eb) {
+		WARN_ON(1);
+		ret = 1;
+		goto out;
+	}
+
+	if (level == 0) {
+		if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret)
+				goto out;
+			eb = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+	}
+
+	/* the last two parameters will only be used for level == 0 */
+	ret = add_all_parents(root, path, parents, eb, level, key.objectid,
+			      ref->wanted_disk_byte);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * resolve all indirect backrefs from the list
+ */
+static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+				   struct list_head *head)
+{
+	int err;
+	int ret = 0;
+	struct __prelim_ref *ref;
+	struct __prelim_ref *ref_safe;
+	struct __prelim_ref *new_ref;
+	struct ulist *parents;
+	struct ulist_node *node;
+
+	parents = ulist_alloc(GFP_NOFS);
+	if (!parents)
+		return -ENOMEM;
+
+	/*
+	 * _safe allows us to insert directly after the current item without
+	 * iterating over the newly inserted items.
+	 * we're also allowed to re-assign ref during iteration.
+	 */
+	list_for_each_entry_safe(ref, ref_safe, head, list) {
+		if (ref->parent)	/* already direct */
+			continue;
+		if (ref->count == 0)
+			continue;
+		err = __resolve_indirect_ref(fs_info, ref, parents);
+		if (err) {
+			if (ret == 0)
+				ret = err;
+			continue;
+		}
+
+		/* we put the first parent into the ref at hand */
+		node = ulist_next(parents, NULL);
+		ref->parent = node ? node->val : 0;
+
+		/* additional parents require new refs being added here */
+		while ((node = ulist_next(parents, node))) {
+			new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
+			if (!new_ref) {
+				ret = -ENOMEM;
+				break;
+			}
+			memcpy(new_ref, ref, sizeof(*ref));
+			new_ref->parent = node->val;
+			list_add(&new_ref->list, &ref->list);
+		}
+		ulist_reinit(parents);
+	}
+
+	ulist_free(parents);
+	return ret;
+}
+
+/*
+ * merge two lists of backrefs and adjust counts accordingly
+ *
+ * mode = 1: merge identical keys, if key is set
+ * mode = 2: merge identical parents
+ */
+static int __merge_refs(struct list_head *head, int mode)
+{
+	struct list_head *pos1;
+
+	list_for_each(pos1, head) {
+		struct list_head *n2;
+		struct list_head *pos2;
+		struct __prelim_ref *ref1;
+
+		ref1 = list_entry(pos1, struct __prelim_ref, list);
+
+		if (mode == 1 && ref1->key.type == 0)
+			continue;
+		for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
+		     pos2 = n2, n2 = pos2->next) {
+			struct __prelim_ref *ref2;
+
+			ref2 = list_entry(pos2, struct __prelim_ref, list);
+
+			if (mode == 1) {
+				if (memcmp(&ref1->key, &ref2->key,
+					   sizeof(ref1->key)) ||
+				    ref1->level != ref2->level ||
+				    ref1->root_id != ref2->root_id)
+					continue;
+				ref1->count += ref2->count;
+			} else {
+				if (ref1->parent != ref2->parent)
+					continue;
+				ref1->count += ref2->count;
+			}
+			list_del(&ref2->list);
+			kfree(ref2);
+		}
+
+	}
+	return 0;
+}
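
(A worked illustration of the two passes, with invented values: suppose the
list holds two indirect refs that agree on (root_id, key, level), count 1
each, plus one direct ref with parent P. Pass 1 (mode == 1) merges the two
indirect refs into a single entry with count 2; once __resolve_indirect_refs()
resolves that entry to the same parent P, pass 2 (mode == 2) folds it into
the direct ref, again summing the counts.)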
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller than or equal to that seq to the list
+ */
+static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+			      struct btrfs_key *info_key,
+			      struct list_head *prefs)
+{
+	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+	struct rb_node *n = &head->node.rb_node;
+	int sgn;
+	int ret;
+
+	if (extent_op && extent_op->update_key)
+		btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+
+	while ((n = rb_prev(n))) {
+		struct btrfs_delayed_ref_node *node;
+		node = rb_entry(n, struct btrfs_delayed_ref_node,
+				rb_node);
+		if (node->bytenr != head->node.bytenr)
+			break;
+		WARN_ON(node->is_head);
+
+		if (node->seq > seq)
+			continue;
+
+		switch (node->action) {
+		case BTRFS_ADD_DELAYED_EXTENT:
+		case BTRFS_UPDATE_DELAYED_HEAD:
+			WARN_ON(1);
+			continue;
+		case BTRFS_ADD_DELAYED_REF:
+			sgn = 1;
+			break;
+		case BTRFS_DROP_DELAYED_REF:
+			sgn = -1;
+			break;
+		default:
+			BUG_ON(1);
+		}
+		switch (node->type) {
+		case BTRFS_TREE_BLOCK_REF_KEY: {
+			struct btrfs_delayed_tree_ref *ref;
+
+			ref = btrfs_delayed_node_to_tree_ref(node);
+			ret = __add_prelim_ref(prefs, ref->root, info_key,
+					       ref->level + 1, 0, node->bytenr,
+					       node->ref_mod * sgn);
+			break;
+		}
+		case BTRFS_SHARED_BLOCK_REF_KEY: {
+			struct btrfs_delayed_tree_ref *ref;
+
+			ref = btrfs_delayed_node_to_tree_ref(node);
+			ret = __add_prelim_ref(prefs, ref->root, info_key,
+					       ref->level + 1, ref->parent,
+					       node->bytenr,
+					       node->ref_mod * sgn);
+			break;
+		}
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_delayed_data_ref *ref;
+			struct btrfs_key key;
+
+			ref = btrfs_delayed_node_to_data_ref(node);
+
+			key.objectid = ref->objectid;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = ref->offset;
+			ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
+					       node->bytenr,
+					       node->ref_mod * sgn);
+			break;
+		}
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_delayed_data_ref *ref;
+			struct btrfs_key key;
+
+			ref = btrfs_delayed_node_to_data_ref(node);
+
+			key.objectid = ref->objectid;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = ref->offset;
+			ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+					       ref->parent, node->bytenr,
+					       node->ref_mod * sgn);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		BUG_ON(ret);
+	}
+
+	return 0;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ */
+static int __add_inline_refs(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path, u64 bytenr,
+			     struct btrfs_key *info_key, int *info_level,
+			     struct list_head *prefs)
+{
+	int ret;
+	int slot;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	unsigned long ptr;
+	unsigned long end;
+	struct btrfs_extent_item *ei;
+	u64 flags;
+	u64 item_size;
+
+	/*
+	 * enumerate all inline refs
+	 */
+	leaf = path->nodes[0];
+	slot = path->slots[0] - 1;
+
+	item_size = btrfs_item_size_nr(leaf, slot);
+	BUG_ON(item_size < sizeof(*ei));
+
+	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+	flags = btrfs_extent_flags(leaf, ei);
+
+	ptr = (unsigned long)(ei + 1);
+	end = (unsigned long)ei + item_size;
+
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		struct btrfs_tree_block_info *info;
+		struct btrfs_disk_key disk_key;
+
+		info = (struct btrfs_tree_block_info *)ptr;
+		*info_level = btrfs_tree_block_level(leaf, info);
+		btrfs_tree_block_key(leaf, info, &disk_key);
+		btrfs_disk_key_to_cpu(info_key, &disk_key);
+		ptr += sizeof(struct btrfs_tree_block_info);
+		BUG_ON(ptr > end);
+	} else {
+		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+	}
+
+	while (ptr < end) {
+		struct btrfs_extent_inline_ref *iref;
+		u64 offset;
+		int type;
+
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(leaf, iref);
+		offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+		switch (type) {
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, 0, info_key,
+					       *info_level + 1, offset,
+					       bytenr, 1);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_shared_data_ref *sdref;
+			int count;
+
+			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+			count = btrfs_shared_data_ref_count(leaf, sdref);
+			ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
+					       bytenr, count);
+			break;
+		}
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, offset, info_key,
+					       *info_level + 1, 0, bytenr, 1);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_extent_data_ref *dref;
+			int count;
+			u64 root;
+
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			count = btrfs_extent_data_ref_count(leaf, dref);
+			key.objectid = btrfs_extent_data_ref_objectid(leaf,
+								      dref);
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+			root = btrfs_extent_data_ref_root(leaf, dref);
+			ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
+					       count);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		BUG_ON(ret);
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+
+	return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ */
+static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
+			    struct btrfs_path *path, u64 bytenr,
+			    struct btrfs_key *info_key, int info_level,
+			    struct list_head *prefs)
+{
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	int ret;
+	int slot;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	while (1) {
+		ret = btrfs_next_item(extent_root, path);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = 0;
+			break;
+		}
+
+		slot = path->slots[0];
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.objectid != bytenr)
+			break;
+		if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+			continue;
+		if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+			break;
+
+		switch (key.type) {
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, 0, info_key,
+					       info_level + 1, key.offset,
+					       bytenr, 1);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_shared_data_ref *sdref;
+			int count;
+
+			sdref = btrfs_item_ptr(leaf, slot,
+					       struct btrfs_shared_data_ref);
+			count = btrfs_shared_data_ref_count(leaf, sdref);
+			ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+					       bytenr, count);
+			break;
+		}
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, key.offset, info_key,
+					       info_level + 1, 0, bytenr, 1);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_extent_data_ref *dref;
+			int count;
+			u64 root;
+
+			dref = btrfs_item_ptr(leaf, slot,
+					      struct btrfs_extent_data_ref);
+			count = btrfs_extent_data_ref_count(leaf, dref);
+			key.objectid = btrfs_extent_data_ref_objectid(leaf,
+								      dref);
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+			root = btrfs_extent_data_ref_root(leaf, dref);
+			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+					       bytenr, count);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		BUG_ON(ret);
+	}
+
+	return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, backrefs and delayed
+ * refs) for the given bytenr to the refs list, merges duplicates and resolves
+ * indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info, u64 bytenr,
+			     u64 seq, struct ulist *refs, struct ulist *roots)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct btrfs_key info_key = { 0 };
+	struct btrfs_delayed_ref_root *delayed_refs = NULL;
+	struct btrfs_delayed_ref_head *head = NULL;
+	int info_level = 0;
+	int ret;
+	struct list_head prefs_delayed;
+	struct list_head prefs;
+	struct __prelim_ref *ref;
+
+	INIT_LIST_HEAD(&prefs);
+	INIT_LIST_HEAD(&prefs_delayed);
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * grab both a lock on the path and a lock on the delayed ref head.
+	 * We need both to get a consistent picture of how the refs look
+	 * at a specified point in time
+	 */
+again:
+	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret == 0);
+
+	/*
+	 * look if there are updates for this ref queued and lock the head
+	 */
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (head) {
+		if (!mutex_trylock(&head->mutex)) {
+			atomic_inc(&head->node.refs);
+			spin_unlock(&delayed_refs->lock);
+
+			btrfs_release_path(path);
+
+			/*
+			 * Mutex was contended, block until it's
+			 * released and try again
+			 */
+			mutex_lock(&head->mutex);
+			mutex_unlock(&head->mutex);
+			btrfs_put_delayed_ref(&head->node);
+			goto again;
+		}
+		ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed);
+		if (ret)
+			goto out;
+	}
+	spin_unlock(&delayed_refs->lock);
+
+	if (path->slots[0]) {
+		struct extent_buffer *leaf;
+		int slot;
+
+		leaf = path->nodes[0];
+		slot = path->slots[0] - 1;
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid == bytenr &&
+		    key.type == BTRFS_EXTENT_ITEM_KEY) {
+			ret = __add_inline_refs(fs_info, path, bytenr,
+						&info_key, &info_level, &prefs);
+			if (ret)
+				goto out;
+			ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
+					       info_level, &prefs);
+			if (ret)
+				goto out;
+		}
+	}
+	btrfs_release_path(path);
+
+	/*
+	 * when adding the delayed refs above, the info_key might not have
+	 * been known yet. Go over the list and replace the missing keys
+	 */
+	list_for_each_entry(ref, &prefs_delayed, list) {
+		if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
+			memcpy(&ref->key, &info_key, sizeof(ref->key));
+	}
+	list_splice_init(&prefs_delayed, &prefs);
+
+	ret = __merge_refs(&prefs, 1);
+	if (ret)
+		goto out;
+
+	ret = __resolve_indirect_refs(fs_info, &prefs);
+	if (ret)
+		goto out;
+
+	ret = __merge_refs(&prefs, 2);
+	if (ret)
+		goto out;
+
+	while (!list_empty(&prefs)) {
+		ref = list_first_entry(&prefs, struct __prelim_ref, list);
+		list_del(&ref->list);
+		if (ref->count < 0)
+			WARN_ON(1);
+		if (ref->count && ref->root_id && ref->parent == 0) {
+			/* no parent == root of tree */
+			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+			BUG_ON(ret < 0);
+		}
+		if (ref->count && ref->parent) {
+			ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+			BUG_ON(ret < 0);
+		}
+		kfree(ref);
+	}
+
+out:
+	if (head)
+		mutex_unlock(&head->mutex);
+	btrfs_free_path(path);
+	while (!list_empty(&prefs)) {
+		ref = list_first_entry(&prefs, struct __prelim_ref, list);
+		list_del(&ref->list);
+		kfree(ref);
+	}
+	while (!list_empty(&prefs_delayed)) {
+		ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+				       list);
+		list_del(&ref->list);
+		kfree(ref);
+	}
+
+	return ret;
+}
+
+/*
+ * Finds all leafs with a reference to the specified combination of bytenr and
+ * offset. key_list_head will point to a list of corresponding keys (caller must
+ * free each list element). The leafs will be stored in the leafs ulist, which
+ * must be freed with ulist_free.
+ *
+ * returns 0 on success, <0 on error
+ */
+static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+				struct btrfs_fs_info *fs_info, u64 bytenr,
+				u64 num_bytes, u64 seq, struct ulist **leafs)
+{
+	struct ulist *tmp;
+	int ret;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+	*leafs = ulist_alloc(GFP_NOFS);
+	if (!*leafs) {
+		ulist_free(tmp);
+		return -ENOMEM;
+	}
+
+	ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+	ulist_free(tmp);
+
+	if (ret < 0 && ret != -ENOENT) {
+		ulist_free(*leafs);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and in turn walk the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 bytenr,
+			 u64 num_bytes, u64 seq, struct ulist **roots)
+{
+	struct ulist *tmp;
+	struct ulist_node *node = NULL;
+	int ret;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+	*roots = ulist_alloc(GFP_NOFS);
+	if (!*roots) {
+		ulist_free(tmp);
+		return -ENOMEM;
+	}
+
+	while (1) {
+		ret = find_parent_nodes(trans, fs_info, bytenr, seq,
+					tmp, *roots);
+		if (ret < 0 && ret != -ENOENT) {
+			ulist_free(tmp);
+			ulist_free(*roots);
+			return ret;
+		}
+		node = ulist_next(tmp, node);
+		if (!node)
+			break;
+		bytenr = node->val;
+	}
+
+	ulist_free(tmp);
+	return 0;
+}
+
 
 static int __inode_info(u64 inum, u64 ioff, u8 key_type,
 			struct btrfs_root *fs_root, struct btrfs_path *path,
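
(A minimal sketch of a caller of the new interface; hypothetical code, not
part of the commit. It assumes trans, fs_info, bytenr, num_bytes and seq are
in scope, set up the way iterate_extent_inodes() does it further down:

	struct ulist *roots;
	struct ulist_node *node = NULL;
	int ret;

	ret = btrfs_find_all_roots(trans, fs_info, bytenr, num_bytes,
				   seq, &roots);
	if (ret)
		return ret;
	/* NULL-initialized cursor idiom, as used throughout this file */
	while ((node = ulist_next(roots, node)))
		pr_debug("extent %llu is referenced from root %llu\n",
			 (unsigned long long)bytenr,
			 (unsigned long long)node->val);
	ulist_free(roots);
)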
@@ -181,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
 	if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
 	    found_key->objectid > logical ||
-	    found_key->objectid + found_key->offset <= logical)
+	    found_key->objectid + found_key->offset <= logical) {
+		pr_debug("logical %llu is not within any extent\n",
+			 (unsigned long long)logical);
 		return -ENOENT;
+	}
 
 	eb = path->nodes[0];
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -191,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 	flags = btrfs_extent_flags(eb, ei);
 
+	pr_debug("logical %llu is at position %llu within the extent (%llu "
+		 "EXTENT_ITEM %llu) flags %#llx size %u\n",
+		 (unsigned long long)logical,
+		 (unsigned long long)(logical - found_key->objectid),
+		 (unsigned long long)found_key->objectid,
+		 (unsigned long long)found_key->offset,
+		 (unsigned long long)flags, item_size);
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		return BTRFS_EXTENT_FLAG_TREE_BLOCK;
 	if (flags & BTRFS_EXTENT_FLAG_DATA)
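
(Worked example with invented numbers: for an extent item with key
(16384 EXTENT_ITEM 8192), a lookup of logical address 20480 falls inside the
extent, and the message above reports position 20480 - 16384 = 4096.)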
@@ -287,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 	return 0;
 }
 
-static int __data_list_add(struct list_head *head, u64 inum,
-			   u64 extent_data_item_offset, u64 root)
-{
-	struct __data_ref *ref;
-
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
-	if (!ref)
-		return -ENOMEM;
-
-	ref->inum = inum;
-	ref->extent_data_item_offset = extent_data_item_offset;
-	ref->root = root;
-	list_add_tail(&ref->list, head);
-
-	return 0;
-}
-
-static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
-			      struct btrfs_extent_data_ref *dref)
-{
-	return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
-				btrfs_extent_data_ref_offset(eb, dref),
-				btrfs_extent_data_ref_root(eb, dref));
-}
-
-static int __shared_list_add(struct list_head *head, u64 disk_byte)
-{
-	struct __shared_ref *ref;
-
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
-	if (!ref)
-		return -ENOMEM;
-
-	ref->disk_byte = disk_byte;
-	list_add_tail(&ref->list, head);
-
-	return 0;
-}
-
-static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
-					   u64 logical, u64 inum,
-					   u64 extent_data_item_offset,
-					   u64 extent_offset,
-					   struct btrfs_path *path,
-					   struct list_head *data_refs,
-					   iterate_extent_inodes_t *iterate,
-					   void *ctx)
-{
-	u64 ref_root;
-	u32 item_size;
-	struct btrfs_key key;
-	struct extent_buffer *eb;
-	struct btrfs_extent_item *ei;
-	struct btrfs_extent_inline_ref *eiref;
-	struct __data_ref *ref;
-	int ret;
-	int type;
-	int last;
-	unsigned long ptr = 0;
-
-	WARN_ON(!list_empty(data_refs));
-	ret = extent_from_logical(fs_info, logical, path, &key);
-	if (ret & BTRFS_EXTENT_FLAG_DATA)
-		ret = -EIO;
-	if (ret < 0)
-		goto out;
-
-	eb = path->nodes[0];
-	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-	item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-	ret = 0;
-	ref_root = 0;
-	/*
-	 * as done in iterate_extent_inodes, we first build a list of refs to
-	 * iterate, then free the path and then iterate them to avoid deadlocks.
-	 */
-	do {
-		last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-					       &eiref, &type);
-		if (last < 0) {
-			ret = last;
-			goto out;
-		}
-		if (type == BTRFS_TREE_BLOCK_REF_KEY ||
-		    type == BTRFS_SHARED_BLOCK_REF_KEY) {
-			ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
-			ret = __data_list_add(data_refs, inum,
-					      extent_data_item_offset,
-					      ref_root);
-		}
-	} while (!ret && !last);
-
-	btrfs_release_path(path);
-
-	if (ref_root == 0) {
-		printk(KERN_ERR "btrfs: failed to find tree block ref "
-			"for shared data backref %llu\n", logical);
-		WARN_ON(1);
-		ret = -EIO;
-	}
-
-out:
-	while (!list_empty(data_refs)) {
-		ref = list_first_entry(data_refs, struct __data_ref, list);
-		list_del(&ref->list);
-		if (!ret)
-			ret = iterate(ref->inum, extent_offset +
-				      ref->extent_data_item_offset,
-				      ref->root, ctx);
-		kfree(ref);
-	}
-
-	return ret;
-}
-
-static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
-				    u64 logical, u64 orig_extent_item_objectid,
-				    u64 extent_offset, struct btrfs_path *path,
-				    struct list_head *data_refs,
-				    iterate_extent_inodes_t *iterate,
-				    void *ctx)
+static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path, u64 logical,
+			     u64 orig_extent_item_objectid,
+			     u64 extent_item_pos, u64 root,
+			     iterate_extent_inodes_t *iterate, void *ctx)
 {
 	u64 disk_byte;
 	struct btrfs_key key;
@@ -416,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
 	struct extent_buffer *eb;
 	int slot;
 	int nritems;
-	int ret;
-	int found = 0;
+	int ret = 0;
+	int extent_type;
+	u64 data_offset;
+	u64 data_len;
 
 	eb = read_tree_block(fs_info->tree_root, logical,
 			     fs_info->tree_root->leafsize, 0);
@@ -435,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
 		if (key.type != BTRFS_EXTENT_DATA_KEY)
 			continue;
 		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
-		if (!fi) {
-			free_extent_buffer(eb);
-			return -EIO;
-		}
+		extent_type = btrfs_file_extent_type(eb, fi);
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
 		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
-		if (disk_byte != orig_extent_item_objectid) {
-			if (found)
-				break;
-			else
-				continue;
-		}
-		++found;
-		ret = __iter_shared_inline_ref_inodes(fs_info, logical,
-						      key.objectid,
-						      key.offset,
-						      extent_offset, path,
-						      data_refs,
-						      iterate, ctx);
-		if (ret)
-			break;
-	}
+		if (disk_byte != orig_extent_item_objectid)
+			continue;
 
-	if (!found) {
-		printk(KERN_ERR "btrfs: failed to follow shared data backref "
-			"to parent %llu\n", logical);
-		WARN_ON(1);
-		ret = -EIO;
+		data_offset = btrfs_file_extent_offset(eb, fi);
+		data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+		if (extent_item_pos < data_offset ||
+		    extent_item_pos >= data_offset + data_len)
+			continue;
+
+		pr_debug("ref for %llu resolved, key (%llu EXTENT_DATA %llu), "
+			 "root %llu\n", orig_extent_item_objectid,
+			 key.objectid, key.offset, root);
+		ret = iterate(key.objectid,
+			      key.offset + (extent_item_pos - data_offset),
+			      root, ctx);
+		if (ret) {
+			pr_debug("stopping iteration because ret=%d\n", ret);
+			break;
+		}
 	}
 
 	free_extent_buffer(eb);
+
 	return ret;
 }
 
 /*
  * calls iterate() for every inode that references the extent identified by
- * the given parameters. will use the path given as a parameter and return it
- * released.
+ * the given parameters.
  * when the iterator function returns a non-zero value, iteration stops.
+ * path is guaranteed to be in released state when iterate() is called.
  */
 int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 			  struct btrfs_path *path,
-			  u64 extent_item_objectid,
-			  u64 extent_offset,
+			  u64 extent_item_objectid, u64 extent_item_pos,
 			  iterate_extent_inodes_t *iterate, void *ctx)
 {
-	unsigned long ptr = 0;
-	int last;
 	int ret;
-	int type;
-	u64 logical;
-	u32 item_size;
-	struct btrfs_extent_inline_ref *eiref;
-	struct btrfs_extent_data_ref *dref;
-	struct extent_buffer *eb;
-	struct btrfs_extent_item *ei;
-	struct btrfs_key key;
 	struct list_head data_refs = LIST_HEAD_INIT(data_refs);
 	struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
-	struct __data_ref *ref_d;
-	struct __shared_ref *ref_s;
-
-	eb = path->nodes[0];
-	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-	item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-	/* first we iterate the inline refs, ... */
-	do {
-		last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-					       &eiref, &type);
-		if (last == -ENOENT) {
-			ret = 0;
-			break;
-		}
-		if (last < 0) {
-			ret = last;
-			break;
-		}
+	struct btrfs_trans_handle *trans;
+	struct ulist *refs;
+	struct ulist *roots;
+	struct ulist_node *ref_node = NULL;
+	struct ulist_node *root_node = NULL;
+	struct seq_list seq_elem;
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	trans = btrfs_join_transaction(fs_info->extent_root);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	pr_debug("resolving all inodes for extent %llu\n",
+		 extent_item_objectid);
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	btrfs_get_delayed_seq(delayed_refs, &seq_elem);
+	spin_unlock(&delayed_refs->lock);
+
+	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+				   extent_item_pos, seq_elem.seq,
+				   &refs);
 
-		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
-			dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
-			ret = __data_list_add_eb(&data_refs, eb, dref);
-		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
-			logical = btrfs_extent_inline_ref_offset(eb, eiref);
-			ret = __shared_list_add(&shared_refs, logical);
-		}
-	} while (!ret && !last);
+	if (ret)
+		goto out;
 
-	/* ... then we proceed to in-tree references and ... */
-	while (!ret) {
-		++path->slots[0];
-		if (path->slots[0] > btrfs_header_nritems(eb)) {
-			ret = btrfs_next_leaf(fs_info->extent_root, path);
-			if (ret) {
-				if (ret == 1)
-					ret = 0; /* we're done */
-				break;
-			}
-			eb = path->nodes[0];
-		}
-		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
-		if (key.objectid != extent_item_objectid)
+	while (!ret && (ref_node = ulist_next(refs, ref_node))) {
+		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
+					   seq_elem.seq, &roots);
+		if (ret)
 			break;
-		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
-			dref = btrfs_item_ptr(eb, path->slots[0],
-					      struct btrfs_extent_data_ref);
-			ret = __data_list_add_eb(&data_refs, eb, dref);
-		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
-			ret = __shared_list_add(&shared_refs, key.offset);
+		while (!ret && (root_node = ulist_next(roots, root_node))) {
+			pr_debug("root %llu references leaf %llu\n",
+				 root_node->val, ref_node->val);
+			ret = iterate_leaf_refs(fs_info, path, ref_node->val,
+						extent_item_objectid,
+						extent_item_pos, root_node->val,
+						iterate, ctx);
 		}
 	}
 
-	btrfs_release_path(path);
-
-	/*
-	 * ... only at the very end we can process the refs we found. this is
-	 * because the iterator function we call is allowed to make tree lookups
-	 * and we have to avoid deadlocks. additionally, we need more tree
-	 * lookups ourselves for shared data refs.
-	 */
-	while (!list_empty(&data_refs)) {
-		ref_d = list_first_entry(&data_refs, struct __data_ref, list);
-		list_del(&ref_d->list);
-		if (!ret)
-			ret = iterate(ref_d->inum, extent_offset +
-				      ref_d->extent_data_item_offset,
-				      ref_d->root, ctx);
-		kfree(ref_d);
-	}
-
-	while (!list_empty(&shared_refs)) {
-		ref_s = list_first_entry(&shared_refs, struct __shared_ref,
-					 list);
-		list_del(&ref_s->list);
-		if (!ret)
-			ret = __iter_shared_inline_ref(fs_info,
-						       ref_s->disk_byte,
-						       extent_item_objectid,
-						       extent_offset, path,
-						       &data_refs,
-						       iterate, ctx);
-		kfree(ref_s);
-	}
-
+	ulist_free(refs);
+	ulist_free(roots);
+out:
+	btrfs_put_delayed_seq(delayed_refs, &seq_elem);
+	btrfs_end_transaction(trans, fs_info->extent_root);
 	return ret;
 }
 
@@ -586,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
 	int ret;
-	u64 offset;
+	u64 extent_item_pos;
 	struct btrfs_key found_key;
 
 	ret = extent_from_logical(fs_info, logical, path,
 				  &found_key);
+	btrfs_release_path(path);
 	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		ret = -EINVAL;
 	if (ret < 0)
 		return ret;
 
-	offset = logical - found_key.objectid;
+	extent_item_pos = logical - found_key.objectid;
 	ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
-				    offset, iterate, ctx);
+				    extent_item_pos, iterate, ctx);
 
 	return ret;
 }
@@ -643,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
 		name_len = btrfs_inode_ref_name_len(eb, iref);
 		/* path must be released before calling iterate()! */
+		pr_debug("following ref at offset %u for inode %llu in "
+			 "tree %llu\n", cur,
+			 (unsigned long long)found_key.objectid,
+			 (unsigned long long)fs_root->objectid);
 		ret = iterate(parent, iref, eb, ctx);
 		if (ret) {
 			free_extent_buffer(eb);
@@ -683,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 		return PTR_ERR(fspath);
 
 	if (fspath > fspath_min) {
+		pr_debug("path resolved: %s\n", fspath);
 		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
 		++ipath->fspath->elem_cnt;
 		ipath->fspath->bytes_left = fspath - fspath_min;
 	} else {
+		pr_debug("missed path, not enough space. missing bytes: %lu, "
+			 "constructed so far: %s\n",
+			 (unsigned long)(fspath_min - fspath), fspath_min);
 		++ipath->fspath->elem_missed;
 		ipath->fspath->bytes_missing += fspath_min - fspath;
 		ipath->fspath->bytes_left = 0;
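
(A hedged sketch of how the reworked iteration API is meant to be driven;
hypothetical code, not part of the commit. It assumes the
iterate_extent_inodes_t typedef from backref.h takes
(u64 inum, u64 offset, u64 root, void *ctx), and that fs_info and an
allocated path are in scope:

	static int print_inode_cb(u64 inum, u64 offset, u64 root, void *ctx)
	{
		pr_debug("inum %llu, offset %llu, root %llu\n",
			 (unsigned long long)inum, (unsigned long long)offset,
			 (unsigned long long)root);
		return 0;	/* returning non-zero stops the iteration */
	}

	/* resolve a logical disk address to the inodes referencing it */
	ret = iterate_inodes_from_logical(logical, fs_info, path,
					  print_inode_cb, NULL);
)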
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 92618837cb8f..d00dfa9ca934 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -20,6 +20,7 @@
 #define __BTRFS_BACKREF__
 
 #include "ioctl.h"
+#include "ulist.h"
 
 struct inode_fs_paths {
 	struct btrfs_path *btrfs_path;
@@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 bytenr,
+			 u64 num_bytes, u64 seq, struct ulist **roots);
+
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 				  struct btrfs_path *path);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 634608d2a6d0..9b9b15fd5204 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -51,6 +51,9 @@ struct btrfs_inode {
 	/* held while logging the inode in tree-log.c */
 	struct mutex log_mutex;
 
+	/* held while doing delalloc reservations */
+	struct mutex delalloc_mutex;
+
 	/* used to order data wrt metadata */
 	struct btrfs_ordered_inode_tree ordered_tree;
 
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644
index 000000000000..ad0b3ba735b7
--- /dev/null
+++ b/fs/btrfs/check-integrity.c
@@ -0,0 +1,3068 @@
+/*
+ * Copyright (C) STRATO AG 2011. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/*
+ * This module can be used to catch cases when the btrfs kernel
+ * code executes write requests to the disk that bring the file
+ * system in an inconsistent state. In such a state, a power-loss
+ * or kernel panic event would cause the data on disk to be
+ * lost or at least damaged.
+ *
+ * Code is added that examines all block write requests during
+ * runtime (including writes of the super block). Three rules
+ * are verified and an error is printed on violation of the
+ * rules:
+ * 1. It is not allowed to write a disk block which is
+ *    currently referenced by the super block (either directly
+ *    or indirectly).
+ * 2. When a super block is written, it is verified that all
+ *    referenced (directly or indirectly) blocks fulfill the
+ *    following requirements:
+ *    2a. All referenced blocks have either been present when
+ *        the file system was mounted, (i.e., they have been
+ *        referenced by the super block) or they have been
+ *        written since then and the write completion callback
+ *        was called and a FLUSH request to the device where
+ *        these blocks are located was received and completed.
+ *    2b. All referenced blocks need to have a generation
+ *        number which is equal to the parent's number.
+ *
+ * One issue that was found using this module was that the log
+ * tree on disk became temporarily corrupted because disk blocks
+ * that had been in use for the log tree had been freed and
+ * reused too early, while being referenced by the written super
+ * block.
+ *
+ * The search term in the kernel log that can be used to filter
+ * on the existence of detected integrity issues is
+ * "btrfs: attempt".
+ *
+ * The integrity check is enabled via mount options. These
+ * mount options are only supported if the integrity check
+ * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
+ *
+ * Example #1, apply integrity checks to all metadata:
+ * mount /dev/sdb1 /mnt -o check_int
+ *
+ * Example #2, apply integrity checks to all metadata and
+ * to data extents:
+ * mount /dev/sdb1 /mnt -o check_int_data
+ *
+ * Example #3, apply integrity checks to all metadata and dump
+ * the tree that the super block references to kernel messages
+ * each time after a super block was written:
+ * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
+ *
+ * If the integrity check tool is included and activated in
+ * the mount options, plenty of kernel memory is used, and
+ * plenty of additional CPU cycles are spent. Enabling this
+ * functionality is not intended for normal use. In most
+ * cases, unless you are a btrfs developer who needs to verify
+ * the integrity of (super)-block write requests, do not
+ * enable the config option BTRFS_FS_CHECK_INTEGRITY to
+ * include and compile the integrity check tool.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/crc32c.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "extent_io.h"
+#include "disk-io.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "check-integrity.h"
+
+#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
+#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
+#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
+#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
+#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
+#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)	/* in characters,
+							 * excluding " [...]" */
+#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
+
+#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
+
+/*
+ * The definition of the bitmask fields for the print_mask.
+ * They are specified with the mount option check_integrity_print_mask.
+ */
+#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE			0x00000001
+#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION		0x00000002
+#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE			0x00000004
+#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE			0x00000008
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH			0x00000010
+#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH			0x00000020
+#define BTRFSIC_PRINT_MASK_VERBOSE				0x00000040
+#define BTRFSIC_PRINT_MASK_VERY_VERBOSE				0x00000080
+#define BTRFSIC_PRINT_MASK_INITIAL_TREE				0x00000100
+#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES			0x00000200
+#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE			0x00000400
+#define BTRFSIC_PRINT_MASK_NUM_COPIES				0x00000800
+#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS		0x00001000
+
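(For reference: the check_int_print_mask=263 example in the header comment
above decomposes as 263 = 0x107 = 0x100 + 0x4 + 0x2 + 0x1, i.e.
BTRFSIC_PRINT_MASK_INITIAL_TREE | BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE |
BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION |
BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE.)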
+struct btrfsic_dev_state;
+struct btrfsic_state;
+
+struct btrfsic_block {
+	u32 magic_num;		/* only used for debug purposes */
+	unsigned int is_metadata:1;	/* if it is meta-data, not data-data */
+	unsigned int is_superblock:1;	/* if it is one of the superblocks */
+	unsigned int is_iodone:1;	/* if is done by lower subsystem */
+	unsigned int iodone_w_error:1;	/* error was indicated to endio */
+	unsigned int never_written:1;	/* block was added because it was
+					 * referenced, not because it was
+					 * written */
+	unsigned int mirror_num:2;	/* large enough to hold
+					 * BTRFS_SUPER_MIRROR_MAX */
+	struct btrfsic_dev_state *dev_state;
+	u64 dev_bytenr;		/* key, physical byte num on disk */
+	u64 logical_bytenr;	/* logical byte num on disk */
+	u64 generation;
+	struct btrfs_disk_key disk_key;	/* extra info to print in case of
+					 * issues, will not always be correct */
+	struct list_head collision_resolving_node;	/* list node */
+	struct list_head all_blocks_node;	/* list node */
+
+	/* the following two lists contain block_link items */
+	struct list_head ref_to_list;	/* list */
+	struct list_head ref_from_list;	/* list */
+	struct btrfsic_block *next_in_same_bio;
+	void *orig_bio_bh_private;
+	union {
+		bio_end_io_t *bio;
+		bh_end_io_t *bh;
+	} orig_bio_bh_end_io;
+	int submit_bio_bh_rw;
+	u64 flush_gen;		/* only valid if !never_written */
+};
+
+/*
+ * Elements of this type are allocated dynamically and required because
+ * each block object can refer to and can be referred from multiple blocks.
+ * The key to look them up in the hashtable is the dev_bytenr of
+ * the block referred to plus the one from the block referred from.
+ * The fact that they are searchable via a hashtable and that a
+ * ref_cnt is maintained is not required for the btrfs integrity
+ * check algorithm itself, it is only used to make the output more
+ * beautiful in case an error is detected (an error is defined
+ * as a write operation to a block while that block is still referenced).
+ */
+struct btrfsic_block_link {
+	u32 magic_num;		/* only used for debug purposes */
+	u32 ref_cnt;
+	struct list_head node_ref_to;	/* list node */
+	struct list_head node_ref_from;	/* list node */
+	struct list_head collision_resolving_node;	/* list node */
+	struct btrfsic_block *block_ref_to;
+	struct btrfsic_block *block_ref_from;
+	u64 parent_generation;
+};
+
+struct btrfsic_dev_state {
+	u32 magic_num;		/* only used for debug purposes */
+	struct block_device *bdev;
+	struct btrfsic_state *state;
+	struct list_head collision_resolving_node;	/* list node */
+	struct btrfsic_block dummy_block_for_bio_bh_flush;
+	u64 last_flush_gen;
+	char name[BDEVNAME_SIZE];
+};
+
+struct btrfsic_block_hashtable {
+	struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_link_hashtable {
+	struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_dev_state_hashtable {
+	struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_data_ctx {
+	u64 start;		/* virtual bytenr */
+	u64 dev_bytenr;		/* physical bytenr on device */
+	u32 len;
+	struct btrfsic_dev_state *dev;
+	char *data;
+	struct buffer_head *bh;	/* do not use if set to NULL */
+};
+
+/* This structure is used to implement recursion without occupying
+ * any stack space, refer to btrfsic_process_metablock() */
+struct btrfsic_stack_frame {
+	u32 magic;
+	u32 nr;
+	int error;
+	int i;
+	int limit_nesting;
+	int num_copies;
+	int mirror_num;
+	struct btrfsic_block *block;
+	struct btrfsic_block_data_ctx *block_ctx;
+	struct btrfsic_block *next_block;
+	struct btrfsic_block_data_ctx next_block_ctx;
+	struct btrfs_header *hdr;
+	struct btrfsic_stack_frame *prev;
+};
+
+/* Some state per mounted filesystem */
+struct btrfsic_state {
+	u32 print_mask;
+	int include_extent_data;
+	int csum_size;
+	struct list_head all_blocks_list;
+	struct btrfsic_block_hashtable block_hashtable;
+	struct btrfsic_block_link_hashtable block_link_hashtable;
+	struct btrfs_root *root;
+	u64 max_superblock_generation;
+	struct btrfsic_block *latest_superblock;
+};
+
+static void btrfsic_block_init(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_alloc(void);
+static void btrfsic_block_free(struct btrfsic_block *b);
+static void btrfsic_block_link_init(struct btrfsic_block_link *n);
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
+static void btrfsic_block_link_free(struct btrfsic_block_link *n);
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+					struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+		struct block_device *bdev,
+		u64 dev_bytenr,
+		struct btrfsic_block_hashtable *h);
+static void btrfsic_block_link_hashtable_init(
+		struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_add(
+		struct btrfsic_block_link *l,
+		struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+		struct block_device *bdev_ref_to,
+		u64 dev_bytenr_ref_to,
+		struct block_device *bdev_ref_from,
+		u64 dev_bytenr_ref_from,
+		struct btrfsic_block_link_hashtable *h);
+static void btrfsic_dev_state_hashtable_init(
+		struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_add(
+		struct btrfsic_dev_state *ds,
+		struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+		struct block_device *bdev,
+		struct btrfsic_dev_state_hashtable *h);
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+				      struct btrfs_fs_devices *fs_devices);
+static int btrfsic_process_metablock(struct btrfsic_state *state,
+				     struct btrfsic_block *block,
+				     struct btrfsic_block_data_ctx *block_ctx,
+				     struct btrfs_header *hdr,
+				     int limit_nesting, int force_iodone_flag);
+static int btrfsic_create_link_to_next_block(
+		struct btrfsic_state *state,
+		struct btrfsic_block *block,
+		struct btrfsic_block_data_ctx
+		*block_ctx, u64 next_bytenr,
+		int limit_nesting,
+		struct btrfsic_block_data_ctx *next_block_ctx,
+		struct btrfsic_block **next_blockp,
+		int force_iodone_flag,
+		int *num_copiesp, int *mirror_nump,
+		struct btrfs_disk_key *disk_key,
+		u64 parent_generation);
+static int btrfsic_handle_extent_data(struct btrfsic_state *state,
+				      struct btrfsic_block *block,
+				      struct btrfsic_block_data_ctx *block_ctx,
+				      u32 item_offset, int force_iodone_flag);
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+			     struct btrfsic_block_data_ctx *block_ctx_out,
+			     int mirror_num);
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+				  u32 len, struct block_device *bdev,
+				  struct btrfsic_block_data_ctx *block_ctx_out);
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
+static int btrfsic_read_block(struct btrfsic_state *state,
+			      struct btrfsic_block_data_ctx *block_ctx);
+static void btrfsic_dump_database(struct btrfsic_state *state);
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+				     const u8 *data, unsigned int size);
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+					  u64 dev_bytenr, u8 *mapped_data,
+					  unsigned int len, struct bio *bio,
+					  int *bio_is_patched,
+					  struct buffer_head *bh,
+					  int submit_bio_bh_rw);
+static int btrfsic_process_written_superblock(
+		struct btrfsic_state *state,
+		struct btrfsic_block *const block,
+		struct btrfs_super_block *const super_hdr);
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
+static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
+					      const struct btrfsic_block *block,
+					      int recursion_level);
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+					struct btrfsic_block *const block,
+					int recursion_level);
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+				   const struct btrfsic_block_link *l);
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+				   const struct btrfsic_block_link *l);
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+				   const struct btrfsic_block *block);
+static void btrfsic_dump_tree(const struct btrfsic_state *state);
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+				  const struct btrfsic_block *block,
+				  int indent_level);
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+		struct btrfsic_state *state,
+		struct btrfsic_block_data_ctx *next_block_ctx,
+		struct btrfsic_block *next_block,
+		struct btrfsic_block *from_block,
+		u64 parent_generation);
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+		struct btrfsic_state *state,
+		struct btrfsic_block_data_ctx *block_ctx,
+		const char *additional_string,
+		int is_metadata,
+		int is_iodone,
+		int never_written,
+		int mirror_num,
+		int *was_created);
+static int btrfsic_process_superblock_dev_mirror(
+		struct btrfsic_state *state,
+		struct btrfsic_dev_state *dev_state,
+		struct btrfs_device *device,
+		int superblock_mirror_num,
+		struct btrfsic_dev_state **selected_dev_state,
+		struct btrfs_super_block *selected_super);
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+		struct block_device *bdev);
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+					   u64 bytenr,
+					   struct btrfsic_dev_state *dev_state,
379 u64 dev_bytenr, char *data);
380
381static struct mutex btrfsic_mutex;
382static int btrfsic_is_initialized;
383static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
384
385
386static void btrfsic_block_init(struct btrfsic_block *b)
387{
388 b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
389 b->dev_state = NULL;
390 b->dev_bytenr = 0;
391 b->logical_bytenr = 0;
392 b->generation = BTRFSIC_GENERATION_UNKNOWN;
393 b->disk_key.objectid = 0;
394 b->disk_key.type = 0;
395 b->disk_key.offset = 0;
396 b->is_metadata = 0;
397 b->is_superblock = 0;
398 b->is_iodone = 0;
399 b->iodone_w_error = 0;
400 b->never_written = 0;
401 b->mirror_num = 0;
402 b->next_in_same_bio = NULL;
403 b->orig_bio_bh_private = NULL;
404 b->orig_bio_bh_end_io.bio = NULL;
405 INIT_LIST_HEAD(&b->collision_resolving_node);
406 INIT_LIST_HEAD(&b->all_blocks_node);
407 INIT_LIST_HEAD(&b->ref_to_list);
408 INIT_LIST_HEAD(&b->ref_from_list);
409 b->submit_bio_bh_rw = 0;
410 b->flush_gen = 0;
411}
412
413static struct btrfsic_block *btrfsic_block_alloc(void)
414{
415 struct btrfsic_block *b;
416
417 b = kzalloc(sizeof(*b), GFP_NOFS);
418 if (NULL != b)
419 btrfsic_block_init(b);
420
421 return b;
422}
423
424static void btrfsic_block_free(struct btrfsic_block *b)
425{
426 BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
427 kfree(b);
428}
429
430static void btrfsic_block_link_init(struct btrfsic_block_link *l)
431{
432 l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
433 l->ref_cnt = 1;
434 INIT_LIST_HEAD(&l->node_ref_to);
435 INIT_LIST_HEAD(&l->node_ref_from);
436 INIT_LIST_HEAD(&l->collision_resolving_node);
437 l->block_ref_to = NULL;
438 l->block_ref_from = NULL;
439}
440
441static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
442{
443 struct btrfsic_block_link *l;
444
445 l = kzalloc(sizeof(*l), GFP_NOFS);
446 if (NULL != l)
447 btrfsic_block_link_init(l);
448
449 return l;
450}
451
452static void btrfsic_block_link_free(struct btrfsic_block_link *l)
453{
454 BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
455 kfree(l);
456}
457
458static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
459{
460 ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
461 ds->bdev = NULL;
462 ds->state = NULL;
463 ds->name[0] = '\0';
464 INIT_LIST_HEAD(&ds->collision_resolving_node);
465 ds->last_flush_gen = 0;
466 btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
467 ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
468 ds->dummy_block_for_bio_bh_flush.dev_state = ds;
469}
470
471static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
472{
473 struct btrfsic_dev_state *ds;
474
475 ds = kzalloc(sizeof(*ds), GFP_NOFS);
476 if (NULL != ds)
477 btrfsic_dev_state_init(ds);
478
479 return ds;
480}
481
482static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
483{
484 BUG_ON(!(NULL == ds ||
485 BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
486 kfree(ds);
487}
488
489static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
490{
491 int i;
492
493 for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
494 INIT_LIST_HEAD(h->table + i);
495}
496
497static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
498 struct btrfsic_block_hashtable *h)
499{
500 const unsigned int hashval =
501 (((unsigned int)(b->dev_bytenr >> 16)) ^
502 ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
503 (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
504
505 list_add(&b->collision_resolving_node, h->table + hashval);
506}
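
All three hash tables in this file derive the bucket index the same way: the low bits of the (always aligned) device byte offset carry no entropy, so they are shifted away, XORed with the block_device pointer value, and masked by the power-of-two table size. As a standalone sketch (bucket_for() is an invented name):

#include <stdint.h>

/* table_size must be a power of two so that the mask acts as a modulo,
 * which the BTRFSIC_*_HASHTABLE_SIZE constants are taken to be */
static unsigned int bucket_for(uint64_t dev_bytenr, const void *bdev,
			       unsigned int table_size)
{
	return (((unsigned int)(dev_bytenr >> 16)) ^
		((unsigned int)(uintptr_t)bdev)) & (table_size - 1);
}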
507
508static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
509{
510 list_del(&b->collision_resolving_node);
511}
512
513static struct btrfsic_block *btrfsic_block_hashtable_lookup(
514 struct block_device *bdev,
515 u64 dev_bytenr,
516 struct btrfsic_block_hashtable *h)
517{
518 const unsigned int hashval =
519 (((unsigned int)(dev_bytenr >> 16)) ^
520 ((unsigned int)((uintptr_t)bdev))) &
521 (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
522 struct list_head *elem;
523
524 list_for_each(elem, h->table + hashval) {
525 struct btrfsic_block *const b =
526 list_entry(elem, struct btrfsic_block,
527 collision_resolving_node);
528
529 if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
530 return b;
531 }
532
533 return NULL;
534}
535
536static void btrfsic_block_link_hashtable_init(
537 struct btrfsic_block_link_hashtable *h)
538{
539 int i;
540
541 for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
542 INIT_LIST_HEAD(h->table + i);
543}
544
545static void btrfsic_block_link_hashtable_add(
546 struct btrfsic_block_link *l,
547 struct btrfsic_block_link_hashtable *h)
548{
549 const unsigned int hashval =
550 (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
551 ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
552 ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
553 ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
554 & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
555
556 BUG_ON(NULL == l->block_ref_to);
557 BUG_ON(NULL == l->block_ref_from);
558 list_add(&l->collision_resolving_node, h->table + hashval);
559}
560
561static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
562{
563 list_del(&l->collision_resolving_node);
564}
565
566static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
567 struct block_device *bdev_ref_to,
568 u64 dev_bytenr_ref_to,
569 struct block_device *bdev_ref_from,
570 u64 dev_bytenr_ref_from,
571 struct btrfsic_block_link_hashtable *h)
572{
573 const unsigned int hashval =
574 (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
575 ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
576 ((unsigned int)((uintptr_t)bdev_ref_to)) ^
577 ((unsigned int)((uintptr_t)bdev_ref_from))) &
578 (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
579 struct list_head *elem;
580
581 list_for_each(elem, h->table + hashval) {
582 struct btrfsic_block_link *const l =
583 list_entry(elem, struct btrfsic_block_link,
584 collision_resolving_node);
585
586 BUG_ON(NULL == l->block_ref_to);
587 BUG_ON(NULL == l->block_ref_from);
588 if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
589 l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
590 l->block_ref_from->dev_state->bdev == bdev_ref_from &&
591 l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
592 return l;
593 }
594
595 return NULL;
596}
597
598static void btrfsic_dev_state_hashtable_init(
599 struct btrfsic_dev_state_hashtable *h)
600{
601 int i;
602
603 for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
604 INIT_LIST_HEAD(h->table + i);
605}
606
607static void btrfsic_dev_state_hashtable_add(
608 struct btrfsic_dev_state *ds,
609 struct btrfsic_dev_state_hashtable *h)
610{
611 const unsigned int hashval =
612 (((unsigned int)((uintptr_t)ds->bdev)) &
613 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
614
615 list_add(&ds->collision_resolving_node, h->table + hashval);
616}
617
618static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
619{
620 list_del(&ds->collision_resolving_node);
621}
622
623static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
624 struct block_device *bdev,
625 struct btrfsic_dev_state_hashtable *h)
626{
627 const unsigned int hashval =
628 (((unsigned int)((uintptr_t)bdev)) &
629 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
630 struct list_head *elem;
631
632 list_for_each(elem, h->table + hashval) {
633 struct btrfsic_dev_state *const ds =
634 list_entry(elem, struct btrfsic_dev_state,
635 collision_resolving_node);
636
637 if (ds->bdev == bdev)
638 return ds;
639 }
640
641 return NULL;
642}
643
644static int btrfsic_process_superblock(struct btrfsic_state *state,
645 struct btrfs_fs_devices *fs_devices)
646{
647 int ret;
648 struct btrfs_super_block *selected_super;
649 struct list_head *dev_head = &fs_devices->devices;
650 struct btrfs_device *device;
651 struct btrfsic_dev_state *selected_dev_state = NULL;
652 int pass;
653
654 BUG_ON(NULL == state);
655 selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
656 if (NULL == selected_super) {
657 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
658 return -1;
659 }
660
661 list_for_each_entry(device, dev_head, dev_list) {
662 int i;
663 struct btrfsic_dev_state *dev_state;
664
665 if (!device->bdev || !device->name)
666 continue;
667
668 dev_state = btrfsic_dev_state_lookup(device->bdev);
669 BUG_ON(NULL == dev_state);
670 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
671 ret = btrfsic_process_superblock_dev_mirror(
672 state, dev_state, device, i,
673 &selected_dev_state, selected_super);
674 if (0 != ret && 0 == i) {
675 kfree(selected_super);
676 return ret;
677 }
678 }
679 }
680
681 if (NULL == state->latest_superblock) {
682 printk(KERN_INFO "btrfsic: no superblock found!\n");
683 kfree(selected_super);
684 return -1;
685 }
686
687 state->csum_size = btrfs_super_csum_size(selected_super);
688
689 for (pass = 0; pass < 3; pass++) {
690 int num_copies;
691 int mirror_num;
692 u64 next_bytenr;
693
694 switch (pass) {
695 case 0:
696 next_bytenr = btrfs_super_root(selected_super);
697 if (state->print_mask &
698 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
699 printk(KERN_INFO "root@%llu\n",
700 (unsigned long long)next_bytenr);
701 break;
702 case 1:
703 next_bytenr = btrfs_super_chunk_root(selected_super);
704 if (state->print_mask &
705 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
706 printk(KERN_INFO "chunk@%llu\n",
707 (unsigned long long)next_bytenr);
708 break;
709 case 2:
710 next_bytenr = btrfs_super_log_root(selected_super);
711 if (0 == next_bytenr)
712 continue;
713 if (state->print_mask &
714 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
715 printk(KERN_INFO "log@%llu\n",
716 (unsigned long long)next_bytenr);
717 break;
718 }
719
720 num_copies =
721 btrfs_num_copies(&state->root->fs_info->mapping_tree,
722 next_bytenr, PAGE_SIZE);
723 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
724 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
725 (unsigned long long)next_bytenr, num_copies);
726
727 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
728 struct btrfsic_block *next_block;
729 struct btrfsic_block_data_ctx tmp_next_block_ctx;
730 struct btrfsic_block_link *l;
731 struct btrfs_header *hdr;
732
733 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
734 &tmp_next_block_ctx,
735 mirror_num);
736 if (ret) {
737 printk(KERN_INFO "btrfsic:"
738 " btrfsic_map_block(root @%llu,"
739 " mirror %d) failed!\n",
740 (unsigned long long)next_bytenr,
741 mirror_num);
742 kfree(selected_super);
743 return -1;
744 }
745
746 next_block = btrfsic_block_hashtable_lookup(
747 tmp_next_block_ctx.dev->bdev,
748 tmp_next_block_ctx.dev_bytenr,
749 &state->block_hashtable);
750 BUG_ON(NULL == next_block);
751
752 l = btrfsic_block_link_hashtable_lookup(
753 tmp_next_block_ctx.dev->bdev,
754 tmp_next_block_ctx.dev_bytenr,
755 state->latest_superblock->dev_state->
756 bdev,
757 state->latest_superblock->dev_bytenr,
758 &state->block_link_hashtable);
759 BUG_ON(NULL == l);
760
761 ret = btrfsic_read_block(state, &tmp_next_block_ctx);
762 if (ret < (int)BTRFSIC_BLOCK_SIZE) {
763 printk(KERN_INFO
764 "btrfsic: read @logical %llu failed!\n",
765 (unsigned long long)
766 tmp_next_block_ctx.start);
767 btrfsic_release_block_ctx(&tmp_next_block_ctx);
768 kfree(selected_super);
769 return -1;
770 }
771
772 hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
773 ret = btrfsic_process_metablock(state,
774 next_block,
775 &tmp_next_block_ctx,
776 hdr,
777 BTRFS_MAX_LEVEL + 3, 1);
778 btrfsic_release_block_ctx(&tmp_next_block_ctx);
779 }
780 }
781
782 kfree(selected_super);
783 return ret;
784}
785
786static int btrfsic_process_superblock_dev_mirror(
787 struct btrfsic_state *state,
788 struct btrfsic_dev_state *dev_state,
789 struct btrfs_device *device,
790 int superblock_mirror_num,
791 struct btrfsic_dev_state **selected_dev_state,
792 struct btrfs_super_block *selected_super)
793{
794 struct btrfs_super_block *super_tmp;
795 u64 dev_bytenr;
796 struct buffer_head *bh;
797 struct btrfsic_block *superblock_tmp;
798 int pass;
799 struct block_device *const superblock_bdev = device->bdev;
800
801 /* super block bytenr is always the unmapped device bytenr */
802 dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
803 bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
804 if (NULL == bh)
805 return -1;
806 super_tmp = (struct btrfs_super_block *)
807 (bh->b_data + (dev_bytenr & 4095));
808
809 if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
810 strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
811 sizeof(super_tmp->magic)) ||
812 memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
813 brelse(bh);
814 return 0;
815 }
816
817 superblock_tmp =
818 btrfsic_block_hashtable_lookup(superblock_bdev,
819 dev_bytenr,
820 &state->block_hashtable);
821 if (NULL == superblock_tmp) {
822 superblock_tmp = btrfsic_block_alloc();
823 if (NULL == superblock_tmp) {
824 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
825 brelse(bh);
826 return -1;
827 }
828 /* for superblock, only the dev_bytenr makes sense */
829 superblock_tmp->dev_bytenr = dev_bytenr;
830 superblock_tmp->dev_state = dev_state;
831 superblock_tmp->logical_bytenr = dev_bytenr;
832 superblock_tmp->generation = btrfs_super_generation(super_tmp);
833 superblock_tmp->is_metadata = 1;
834 superblock_tmp->is_superblock = 1;
835 superblock_tmp->is_iodone = 1;
836 superblock_tmp->never_written = 0;
837 superblock_tmp->mirror_num = 1 + superblock_mirror_num;
838 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
839 printk(KERN_INFO "New initial S-block (bdev %p, %s)"
840 " @%llu (%s/%llu/%d)\n",
841 superblock_bdev, device->name,
842 (unsigned long long)dev_bytenr,
843 dev_state->name,
844 (unsigned long long)dev_bytenr,
845 superblock_mirror_num);
846 list_add(&superblock_tmp->all_blocks_node,
847 &state->all_blocks_list);
848 btrfsic_block_hashtable_add(superblock_tmp,
849 &state->block_hashtable);
850 }
851
852 /* select the one with the highest generation field */
853 if (btrfs_super_generation(super_tmp) >
854 state->max_superblock_generation ||
855 0 == state->max_superblock_generation) {
856 memcpy(selected_super, super_tmp, sizeof(*selected_super));
857 *selected_dev_state = dev_state;
858 state->max_superblock_generation =
859 btrfs_super_generation(super_tmp);
860 state->latest_superblock = superblock_tmp;
861 }
862
863 for (pass = 0; pass < 3; pass++) {
864 u64 next_bytenr;
865 int num_copies;
866 int mirror_num;
867 const char *additional_string = NULL;
868 struct btrfs_disk_key tmp_disk_key;
869
870 tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
871 tmp_disk_key.offset = 0;
872 switch (pass) {
873 case 0:
874 tmp_disk_key.objectid =
875 cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
876 additional_string = "initial root ";
877 next_bytenr = btrfs_super_root(super_tmp);
878 break;
879 case 1:
880 tmp_disk_key.objectid =
881 cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
882 additional_string = "initial chunk ";
883 next_bytenr = btrfs_super_chunk_root(super_tmp);
884 break;
885 case 2:
886 tmp_disk_key.objectid =
887 cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
888 additional_string = "initial log ";
889 next_bytenr = btrfs_super_log_root(super_tmp);
890 if (0 == next_bytenr)
891 continue;
892 break;
893 }
894
895 num_copies =
896 btrfs_num_copies(&state->root->fs_info->mapping_tree,
897 next_bytenr, PAGE_SIZE);
898 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
899 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
900 (unsigned long long)next_bytenr, num_copies);
901 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
902 struct btrfsic_block *next_block;
903 struct btrfsic_block_data_ctx tmp_next_block_ctx;
904 struct btrfsic_block_link *l;
905
906 if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
907 &tmp_next_block_ctx,
908 mirror_num)) {
909 printk(KERN_INFO "btrfsic: btrfsic_map_block("
910 "bytenr @%llu, mirror %d) failed!\n",
911 (unsigned long long)next_bytenr,
912 mirror_num);
913 brelse(bh);
914 return -1;
915 }
916
917 next_block = btrfsic_block_lookup_or_add(
918 state, &tmp_next_block_ctx,
919 additional_string, 1, 1, 0,
920 mirror_num, NULL);
921 if (NULL == next_block) {
922 btrfsic_release_block_ctx(&tmp_next_block_ctx);
923 brelse(bh);
924 return -1;
925 }
926
927 next_block->disk_key = tmp_disk_key;
928 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
929 l = btrfsic_block_link_lookup_or_add(
930 state, &tmp_next_block_ctx,
931 next_block, superblock_tmp,
932 BTRFSIC_GENERATION_UNKNOWN);
933 btrfsic_release_block_ctx(&tmp_next_block_ctx);
934 if (NULL == l) {
935 brelse(bh);
936 return -1;
937 }
938 }
939 }
940 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
941 btrfsic_dump_tree_sub(state, superblock_tmp, 0);
942
943 brelse(bh);
944 return 0;
945}
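
For orientation, btrfs_sb_offset() used above maps a superblock mirror number to a fixed, unmapped device offset. Going by the btrfs sources of this era, the mapping looks roughly like the following sketch (sb_offset() is a stand-in): copy 0 sits at 64KiB, and each further mirror shifts a 16KiB base left by another 12 bits, i.e. 64MiB and 256GiB.

#include <stdint.h>

static uint64_t sb_offset(int mirror)
{
	uint64_t start = 16 * 1024;

	if (mirror)
		return start << (12 * mirror);	/* 64MiB, 256GiB, ... */
	return 64 * 1024;			/* primary copy */
}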
946
947static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
948{
949 struct btrfsic_stack_frame *sf;
950
951 sf = kzalloc(sizeof(*sf), GFP_NOFS);
952 if (NULL == sf)
953 printk(KERN_INFO "btrfsic: alloc memory failed!\n");
954 else
955 sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
956 return sf;
957}
958
959static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
960{
961 BUG_ON(!(NULL == sf ||
962 BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
963 kfree(sf);
964}
965
966static int btrfsic_process_metablock(
967 struct btrfsic_state *state,
968 struct btrfsic_block *const first_block,
969 struct btrfsic_block_data_ctx *const first_block_ctx,
970 struct btrfs_header *const first_hdr,
971 int first_limit_nesting, int force_iodone_flag)
972{
973 struct btrfsic_stack_frame initial_stack_frame = { 0 };
974 struct btrfsic_stack_frame *sf;
975 struct btrfsic_stack_frame *next_stack;
976
977 sf = &initial_stack_frame;
978 sf->error = 0;
979 sf->i = -1;
980 sf->limit_nesting = first_limit_nesting;
981 sf->block = first_block;
982 sf->block_ctx = first_block_ctx;
983 sf->next_block = NULL;
984 sf->hdr = first_hdr;
985 sf->prev = NULL;
986
987continue_with_new_stack_frame:
988 sf->block->generation = le64_to_cpu(sf->hdr->generation);
989 if (0 == sf->hdr->level) {
990 struct btrfs_leaf *const leafhdr =
991 (struct btrfs_leaf *)sf->hdr;
992
993 if (-1 == sf->i) {
994 sf->nr = le32_to_cpu(leafhdr->header.nritems);
995
996 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
997 printk(KERN_INFO
998 "leaf %llu items %d generation %llu"
999 " owner %llu\n",
1000 (unsigned long long)
1001 sf->block_ctx->start,
1002 sf->nr,
1003 (unsigned long long)
1004 le64_to_cpu(leafhdr->header.generation),
1005 (unsigned long long)
1006 le64_to_cpu(leafhdr->header.owner));
1007 }
1008
1009continue_with_current_leaf_stack_frame:
1010 if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
1011 sf->i++;
1012 sf->num_copies = 0;
1013 }
1014
1015 if (sf->i < sf->nr) {
1016 struct btrfs_item *disk_item = leafhdr->items + sf->i;
1017 struct btrfs_disk_key *disk_key = &disk_item->key;
1018 u8 type;
1019 const u32 item_offset = le32_to_cpu(disk_item->offset);
1020
1021 type = disk_key->type;
1022
1023 if (BTRFS_ROOT_ITEM_KEY == type) {
1024 const struct btrfs_root_item *const root_item =
1025 (struct btrfs_root_item *)
1026 (sf->block_ctx->data +
1027 offsetof(struct btrfs_leaf, items) +
1028 item_offset);
1029 const u64 next_bytenr =
1030 le64_to_cpu(root_item->bytenr);
1031
1032 sf->error =
1033 btrfsic_create_link_to_next_block(
1034 state,
1035 sf->block,
1036 sf->block_ctx,
1037 next_bytenr,
1038 sf->limit_nesting,
1039 &sf->next_block_ctx,
1040 &sf->next_block,
1041 force_iodone_flag,
1042 &sf->num_copies,
1043 &sf->mirror_num,
1044 disk_key,
1045 le64_to_cpu(root_item->
1046 generation));
1047 if (sf->error)
1048 goto one_stack_frame_backwards;
1049
1050 if (NULL != sf->next_block) {
1051 struct btrfs_header *const next_hdr =
1052 (struct btrfs_header *)
1053 sf->next_block_ctx.data;
1054
1055 next_stack =
1056 btrfsic_stack_frame_alloc();
1057 if (NULL == next_stack) {
1058 btrfsic_release_block_ctx(
1059 &sf->
1060 next_block_ctx);
1061 goto one_stack_frame_backwards;
1062 }
1063
1064 next_stack->i = -1;
1065 next_stack->block = sf->next_block;
1066 next_stack->block_ctx =
1067 &sf->next_block_ctx;
1068 next_stack->next_block = NULL;
1069 next_stack->hdr = next_hdr;
1070 next_stack->limit_nesting =
1071 sf->limit_nesting - 1;
1072 next_stack->prev = sf;
1073 sf = next_stack;
1074 goto continue_with_new_stack_frame;
1075 }
1076 } else if (BTRFS_EXTENT_DATA_KEY == type &&
1077 state->include_extent_data) {
1078 sf->error = btrfsic_handle_extent_data(
1079 state,
1080 sf->block,
1081 sf->block_ctx,
1082 item_offset,
1083 force_iodone_flag);
1084 if (sf->error)
1085 goto one_stack_frame_backwards;
1086 }
1087
1088 goto continue_with_current_leaf_stack_frame;
1089 }
1090 } else {
1091 struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
1092
1093 if (-1 == sf->i) {
1094 sf->nr = le32_to_cpu(nodehdr->header.nritems);
1095
1096 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1097 printk(KERN_INFO "node %llu level %d items %d"
1098 " generation %llu owner %llu\n",
1099 (unsigned long long)
1100 sf->block_ctx->start,
1101 nodehdr->header.level, sf->nr,
1102 (unsigned long long)
1103 le64_to_cpu(nodehdr->header.generation),
1104 (unsigned long long)
1105 le64_to_cpu(nodehdr->header.owner));
1106 }
1107
1108continue_with_current_node_stack_frame:
1109 if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
1110 sf->i++;
1111 sf->num_copies = 0;
1112 }
1113
1114 if (sf->i < sf->nr) {
1115 struct btrfs_key_ptr *disk_key_ptr =
1116 nodehdr->ptrs + sf->i;
1117 const u64 next_bytenr =
1118 le64_to_cpu(disk_key_ptr->blockptr);
1119
1120 sf->error = btrfsic_create_link_to_next_block(
1121 state,
1122 sf->block,
1123 sf->block_ctx,
1124 next_bytenr,
1125 sf->limit_nesting,
1126 &sf->next_block_ctx,
1127 &sf->next_block,
1128 force_iodone_flag,
1129 &sf->num_copies,
1130 &sf->mirror_num,
1131 &disk_key_ptr->key,
1132 le64_to_cpu(disk_key_ptr->generation));
1133 if (sf->error)
1134 goto one_stack_frame_backwards;
1135
1136 if (NULL != sf->next_block) {
1137 struct btrfs_header *const next_hdr =
1138 (struct btrfs_header *)
1139 sf->next_block_ctx.data;
1140
1141 next_stack = btrfsic_stack_frame_alloc();
1142 if (NULL == next_stack)
1143 goto one_stack_frame_backwards;
1144
1145 next_stack->i = -1;
1146 next_stack->block = sf->next_block;
1147 next_stack->block_ctx = &sf->next_block_ctx;
1148 next_stack->next_block = NULL;
1149 next_stack->hdr = next_hdr;
1150 next_stack->limit_nesting =
1151 sf->limit_nesting - 1;
1152 next_stack->prev = sf;
1153 sf = next_stack;
1154 goto continue_with_new_stack_frame;
1155 }
1156
1157 goto continue_with_current_node_stack_frame;
1158 }
1159 }
1160
1161one_stack_frame_backwards:
1162 if (NULL != sf->prev) {
1163 struct btrfsic_stack_frame *const prev = sf->prev;
1164
1165 /* the one for the initial block is freed in the caller */
1166 btrfsic_release_block_ctx(sf->block_ctx);
1167
1168 if (sf->error) {
1169 prev->error = sf->error;
1170 btrfsic_stack_frame_free(sf);
1171 sf = prev;
1172 goto one_stack_frame_backwards;
1173 }
1174
1175 btrfsic_stack_frame_free(sf);
1176 sf = prev;
1177 goto continue_with_new_stack_frame;
1178 } else {
1179 BUG_ON(&initial_stack_frame != sf);
1180 }
1181
1182 return sf->error;
1183}
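
A note on the leaf handling above: item payloads are located the same way btrfs_item_ptr() locates them, with the stored item offset taken relative to the end of the fixed leaf header, i.e. offsetof(struct btrfs_leaf, items). A standalone sketch with stand-in types (the real structures live in fs/btrfs/ctree.h):

#include <stddef.h>
#include <stdint.h>

struct header_stub {
	uint8_t csum[32];
	uint8_t fsid[16];
	uint8_t rest[53];	/* bytenr, flags, generation, ... */
};

struct leaf_stub {
	struct header_stub header;
	uint8_t items[];	/* item headers grow up, payloads grow down */
};

static const void *item_payload(const uint8_t *block_data,
				uint32_t item_offset)
{
	/* stored item offsets are relative to the end of the leaf header */
	return block_data + offsetof(struct leaf_stub, items) + item_offset;
}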
1184
1185static int btrfsic_create_link_to_next_block(
1186 struct btrfsic_state *state,
1187 struct btrfsic_block *block,
1188 struct btrfsic_block_data_ctx *block_ctx,
1189 u64 next_bytenr,
1190 int limit_nesting,
1191 struct btrfsic_block_data_ctx *next_block_ctx,
1192 struct btrfsic_block **next_blockp,
1193 int force_iodone_flag,
1194 int *num_copiesp, int *mirror_nump,
1195 struct btrfs_disk_key *disk_key,
1196 u64 parent_generation)
1197{
1198 struct btrfsic_block *next_block = NULL;
1199 int ret;
1200 struct btrfsic_block_link *l;
1201 int did_alloc_block_link;
1202 int block_was_created;
1203
1204 *next_blockp = NULL;
1205 if (0 == *num_copiesp) {
1206 *num_copiesp =
1207 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1208 next_bytenr, PAGE_SIZE);
1209 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1210 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1211 (unsigned long long)next_bytenr, *num_copiesp);
1212 *mirror_nump = 1;
1213 }
1214
1215 if (*mirror_nump > *num_copiesp)
1216 return 0;
1217
1218 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1219 printk(KERN_INFO
1220 "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
1221 *mirror_nump);
1222 ret = btrfsic_map_block(state, next_bytenr,
1223 BTRFSIC_BLOCK_SIZE,
1224 next_block_ctx, *mirror_nump);
1225 if (ret) {
1226 printk(KERN_INFO
1227 "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
1228 (unsigned long long)next_bytenr, *mirror_nump);
1229 btrfsic_release_block_ctx(next_block_ctx);
1230 *next_blockp = NULL;
1231 return -1;
1232 }
1233
1234 next_block = btrfsic_block_lookup_or_add(state,
1235 next_block_ctx, "referenced ",
1236 1, force_iodone_flag,
1237 !force_iodone_flag,
1238 *mirror_nump,
1239 &block_was_created);
1240 if (NULL == next_block) {
1241 btrfsic_release_block_ctx(next_block_ctx);
1242 *next_blockp = NULL;
1243 return -1;
1244 }
1245 if (block_was_created) {
1246 l = NULL;
1247 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
1248 } else {
1249 if (next_block->logical_bytenr != next_bytenr &&
1250 !(!next_block->is_metadata &&
1251 0 == next_block->logical_bytenr)) {
1252 printk(KERN_INFO
1253 "Referenced block @%llu (%s/%llu/%d)"
1254 " found in hash table, %c,"
1255 " bytenr mismatch (!= stored %llu).\n",
1256 (unsigned long long)next_bytenr,
1257 next_block_ctx->dev->name,
1258 (unsigned long long)next_block_ctx->dev_bytenr,
1259 *mirror_nump,
1260 btrfsic_get_block_type(state, next_block),
1261 (unsigned long long)next_block->logical_bytenr);
1262 } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1263 printk(KERN_INFO
1264 "Referenced block @%llu (%s/%llu/%d)"
1265 " found in hash table, %c.\n",
1266 (unsigned long long)next_bytenr,
1267 next_block_ctx->dev->name,
1268 (unsigned long long)next_block_ctx->dev_bytenr,
1269 *mirror_nump,
1270 btrfsic_get_block_type(state, next_block));
1271 next_block->logical_bytenr = next_bytenr;
1272
1273 next_block->mirror_num = *mirror_nump;
1274 l = btrfsic_block_link_hashtable_lookup(
1275 next_block_ctx->dev->bdev,
1276 next_block_ctx->dev_bytenr,
1277 block_ctx->dev->bdev,
1278 block_ctx->dev_bytenr,
1279 &state->block_link_hashtable);
1280 }
1281
1282 next_block->disk_key = *disk_key;
1283 if (NULL == l) {
1284 l = btrfsic_block_link_alloc();
1285 if (NULL == l) {
1286 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1287 btrfsic_release_block_ctx(next_block_ctx);
1288 *next_blockp = NULL;
1289 return -1;
1290 }
1291
1292 did_alloc_block_link = 1;
1293 l->block_ref_to = next_block;
1294 l->block_ref_from = block;
1295 l->ref_cnt = 1;
1296 l->parent_generation = parent_generation;
1297
1298 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1299 btrfsic_print_add_link(state, l);
1300
1301 list_add(&l->node_ref_to, &block->ref_to_list);
1302 list_add(&l->node_ref_from, &next_block->ref_from_list);
1303
1304 btrfsic_block_link_hashtable_add(l,
1305 &state->block_link_hashtable);
1306 } else {
1307 did_alloc_block_link = 0;
1308 if (0 == limit_nesting) {
1309 l->ref_cnt++;
1310 l->parent_generation = parent_generation;
1311 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1312 btrfsic_print_add_link(state, l);
1313 }
1314 }
1315
1316 if (limit_nesting > 0 && did_alloc_block_link) {
1317 ret = btrfsic_read_block(state, next_block_ctx);
1318 if (ret < (int)BTRFSIC_BLOCK_SIZE) {
1319 printk(KERN_INFO
1320 "btrfsic: read block @logical %llu failed!\n",
1321 (unsigned long long)next_bytenr);
1322 btrfsic_release_block_ctx(next_block_ctx);
1323 *next_blockp = NULL;
1324 return -1;
1325 }
1326
1327 *next_blockp = next_block;
1328 } else {
1329 *next_blockp = NULL;
1330 }
1331 (*mirror_nump)++;
1332
1333 return 0;
1334}
1335
1336static int btrfsic_handle_extent_data(
1337 struct btrfsic_state *state,
1338 struct btrfsic_block *block,
1339 struct btrfsic_block_data_ctx *block_ctx,
1340 u32 item_offset, int force_iodone_flag)
1341{
1342 int ret;
1343 struct btrfs_file_extent_item *file_extent_item =
1344 (struct btrfs_file_extent_item *)(block_ctx->data +
1345 offsetof(struct btrfs_leaf,
1346 items) + item_offset);
1347 u64 next_bytenr =
1348 le64_to_cpu(file_extent_item->disk_bytenr) +
1349 le64_to_cpu(file_extent_item->offset);
1350 u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
1351 u64 generation = le64_to_cpu(file_extent_item->generation);
1352 struct btrfsic_block_link *l;
1353
1354 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1355 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
1356 " offset = %llu, num_bytes = %llu\n",
1357 file_extent_item->type,
1358 (unsigned long long)
1359 le64_to_cpu(file_extent_item->disk_bytenr),
1360 (unsigned long long)
1361 le64_to_cpu(file_extent_item->offset),
1362 (unsigned long long)
1363 le64_to_cpu(file_extent_item->num_bytes));
1364 if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
1365 ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
1366 return 0;
1367 while (num_bytes > 0) {
1368 u32 chunk_len;
1369 int num_copies;
1370 int mirror_num;
1371
1372 if (num_bytes > BTRFSIC_BLOCK_SIZE)
1373 chunk_len = BTRFSIC_BLOCK_SIZE;
1374 else
1375 chunk_len = num_bytes;
1376
1377 num_copies =
1378 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1379 next_bytenr, PAGE_SIZE);
1380 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1381 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1382 (unsigned long long)next_bytenr, num_copies);
1383 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
1384 struct btrfsic_block_data_ctx next_block_ctx;
1385 struct btrfsic_block *next_block;
1386 int block_was_created;
1387
1388 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1389 printk(KERN_INFO "btrfsic_handle_extent_data("
1390 "mirror_num=%d)\n", mirror_num);
1391 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1392 printk(KERN_INFO
1393				       "\tdisk_bytenr = %llu, chunk_len %u\n",
1394 (unsigned long long)next_bytenr,
1395 chunk_len);
1396 ret = btrfsic_map_block(state, next_bytenr,
1397 chunk_len, &next_block_ctx,
1398 mirror_num);
1399 if (ret) {
1400 printk(KERN_INFO
1401 "btrfsic: btrfsic_map_block(@%llu,"
1402 " mirror=%d) failed!\n",
1403 (unsigned long long)next_bytenr,
1404 mirror_num);
1405 return -1;
1406 }
1407
1408 next_block = btrfsic_block_lookup_or_add(
1409 state,
1410 &next_block_ctx,
1411 "referenced ",
1412 0,
1413 force_iodone_flag,
1414 !force_iodone_flag,
1415 mirror_num,
1416 &block_was_created);
1417 if (NULL == next_block) {
1418 printk(KERN_INFO
1419 "btrfsic: error, kmalloc failed!\n");
1420 btrfsic_release_block_ctx(&next_block_ctx);
1421 return -1;
1422 }
1423 if (!block_was_created) {
1424 if (next_block->logical_bytenr != next_bytenr &&
1425 !(!next_block->is_metadata &&
1426 0 == next_block->logical_bytenr)) {
1427 printk(KERN_INFO
1428 "Referenced block"
1429 " @%llu (%s/%llu/%d)"
1430 " found in hash table, D,"
1431 " bytenr mismatch"
1432 " (!= stored %llu).\n",
1433 (unsigned long long)next_bytenr,
1434 next_block_ctx.dev->name,
1435 (unsigned long long)
1436 next_block_ctx.dev_bytenr,
1437 mirror_num,
1438 (unsigned long long)
1439 next_block->logical_bytenr);
1440 }
1441 next_block->logical_bytenr = next_bytenr;
1442 next_block->mirror_num = mirror_num;
1443 }
1444
1445 l = btrfsic_block_link_lookup_or_add(state,
1446 &next_block_ctx,
1447 next_block, block,
1448 generation);
1449 btrfsic_release_block_ctx(&next_block_ctx);
1450 if (NULL == l)
1451 return -1;
1452 }
1453
1454 next_bytenr += chunk_len;
1455 num_bytes -= chunk_len;
1456 }
1457
1458 return 0;
1459}
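
The chunking loop above is easiest to follow with concrete numbers; a standalone sketch with invented byte values, assuming a 4096-byte BTRFSIC_BLOCK_SIZE:

#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE 4096u	/* stand-in for BTRFSIC_BLOCK_SIZE */

int main(void)
{
	uint64_t next_bytenr = 123456;	/* invented extent start */
	uint64_t num_bytes = 10000;	/* invented extent length */

	while (num_bytes > 0) {
		uint32_t chunk_len = num_bytes > BLOCK_SIZE ?
				     BLOCK_SIZE : (uint32_t)num_bytes;

		/* prints 4096, 4096 and finally 1808 bytes */
		printf("map %u bytes at %llu\n", (unsigned)chunk_len,
		       (unsigned long long)next_bytenr);
		next_bytenr += chunk_len;
		num_bytes -= chunk_len;
	}
	return 0;
}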
1460
1461static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1462 struct btrfsic_block_data_ctx *block_ctx_out,
1463 int mirror_num)
1464{
1465 int ret;
1466 u64 length;
1467 struct btrfs_bio *multi = NULL;
1468 struct btrfs_device *device;
1469
1470 length = len;
1471 ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
1472 bytenr, &length, &multi, mirror_num);
1473	if (ret) {
	printk(KERN_INFO
	       "btrfsic: btrfs_map_block() failed!\n");
		return ret;
	}

1474	device = multi->stripes[0].dev;
1475	block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
1476	block_ctx_out->dev_bytenr = multi->stripes[0].physical;
1477	block_ctx_out->start = bytenr;
1478	block_ctx_out->len = len;
1479	block_ctx_out->data = NULL;
1480	block_ctx_out->bh = NULL;
1481
1482	kfree(multi);
1484 if (NULL == block_ctx_out->dev) {
1485 ret = -ENXIO;
1486 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
1487 }
1488
1489 return ret;
1490}
1491
1492static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1493 u32 len, struct block_device *bdev,
1494 struct btrfsic_block_data_ctx *block_ctx_out)
1495{
1496 block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
1497 block_ctx_out->dev_bytenr = bytenr;
1498 block_ctx_out->start = bytenr;
1499 block_ctx_out->len = len;
1500 block_ctx_out->data = NULL;
1501 block_ctx_out->bh = NULL;
1502 if (NULL != block_ctx_out->dev) {
1503 return 0;
1504 } else {
1505 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
1506 return -ENXIO;
1507 }
1508}
1509
1510static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
1511{
1512 if (NULL != block_ctx->bh) {
1513 brelse(block_ctx->bh);
1514 block_ctx->bh = NULL;
1515 }
1516}
1517
1518static int btrfsic_read_block(struct btrfsic_state *state,
1519 struct btrfsic_block_data_ctx *block_ctx)
1520{
1521 block_ctx->bh = NULL;
1522 if (block_ctx->dev_bytenr & 4095) {
1523 printk(KERN_INFO
1524 "btrfsic: read_block() with unaligned bytenr %llu\n",
1525 (unsigned long long)block_ctx->dev_bytenr);
1526 return -1;
1527 }
1528 if (block_ctx->len > 4096) {
1529 printk(KERN_INFO
1530		       "btrfsic: read_block() with too large a size %u\n",
1531		       block_ctx->len);
1532 return -1;
1533 }
1534
1535 block_ctx->bh = __bread(block_ctx->dev->bdev,
1536 block_ctx->dev_bytenr >> 12, 4096);
1537 if (NULL == block_ctx->bh)
1538 return -1;
1539 block_ctx->data = block_ctx->bh->b_data;
1540
1541 return block_ctx->len;
1542}
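
The right shift by 12 handed to __bread() is just the 4KiB block index of dev_bytenr, the same arithmetic as the dev_bytenr / 4096 used for the superblock read earlier; the alignment check above makes the division exact. A trivial standalone check with an invented offset:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t dev_bytenr = 0x2113000;	/* invented, 4KiB aligned */

	assert((dev_bytenr & 4095) == 0);
	assert((dev_bytenr >> 12) == dev_bytenr / 4096);
	return 0;
}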
1543
1544static void btrfsic_dump_database(struct btrfsic_state *state)
1545{
1546 struct list_head *elem_all;
1547
1548 BUG_ON(NULL == state);
1549
1550 printk(KERN_INFO "all_blocks_list:\n");
1551 list_for_each(elem_all, &state->all_blocks_list) {
1552 const struct btrfsic_block *const b_all =
1553 list_entry(elem_all, struct btrfsic_block,
1554 all_blocks_node);
1555 struct list_head *elem_ref_to;
1556 struct list_head *elem_ref_from;
1557
1558 printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
1559 btrfsic_get_block_type(state, b_all),
1560 (unsigned long long)b_all->logical_bytenr,
1561 b_all->dev_state->name,
1562 (unsigned long long)b_all->dev_bytenr,
1563 b_all->mirror_num);
1564
1565 list_for_each(elem_ref_to, &b_all->ref_to_list) {
1566 const struct btrfsic_block_link *const l =
1567 list_entry(elem_ref_to,
1568 struct btrfsic_block_link,
1569 node_ref_to);
1570
1571 printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
1572 " refers %u* to"
1573 " %c @%llu (%s/%llu/%d)\n",
1574 btrfsic_get_block_type(state, b_all),
1575 (unsigned long long)b_all->logical_bytenr,
1576 b_all->dev_state->name,
1577 (unsigned long long)b_all->dev_bytenr,
1578 b_all->mirror_num,
1579 l->ref_cnt,
1580 btrfsic_get_block_type(state, l->block_ref_to),
1581 (unsigned long long)
1582 l->block_ref_to->logical_bytenr,
1583 l->block_ref_to->dev_state->name,
1584 (unsigned long long)l->block_ref_to->dev_bytenr,
1585 l->block_ref_to->mirror_num);
1586 }
1587
1588 list_for_each(elem_ref_from, &b_all->ref_from_list) {
1589 const struct btrfsic_block_link *const l =
1590 list_entry(elem_ref_from,
1591 struct btrfsic_block_link,
1592 node_ref_from);
1593
1594 printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
1595 " is ref %u* from"
1596 " %c @%llu (%s/%llu/%d)\n",
1597 btrfsic_get_block_type(state, b_all),
1598 (unsigned long long)b_all->logical_bytenr,
1599 b_all->dev_state->name,
1600 (unsigned long long)b_all->dev_bytenr,
1601 b_all->mirror_num,
1602 l->ref_cnt,
1603 btrfsic_get_block_type(state, l->block_ref_from),
1604 (unsigned long long)
1605 l->block_ref_from->logical_bytenr,
1606 l->block_ref_from->dev_state->name,
1607 (unsigned long long)
1608 l->block_ref_from->dev_bytenr,
1609 l->block_ref_from->mirror_num);
1610 }
1611
1612 printk(KERN_INFO "\n");
1613 }
1614}
1615
1616/*
1617 * Test whether the disk block contains a tree block (leaf or node)
1618 * (note that this test fails for the super block)
1619 */
1620static int btrfsic_test_for_metadata(struct btrfsic_state *state,
1621 const u8 *data, unsigned int size)
1622{
1623 struct btrfs_header *h;
1624 u8 csum[BTRFS_CSUM_SIZE];
1625 u32 crc = ~(u32)0;
1626 int fail = 0;
1627 int crc_fail = 0;
1628
1629 h = (struct btrfs_header *)data;
1630
1631 if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
1632 fail++;
1633
1634 crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
1635 btrfs_csum_final(crc, csum);
1636 if (memcmp(csum, h->csum, state->csum_size))
1637 crc_fail++;
1638
1639 return fail || crc_fail;
1640}
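
Spelled out, the test combines two signals: the fsid stored directly behind the header's checksum field must match the mounted filesystem, and the crc32c over everything behind the checksum field must equal the stored checksum. A userspace sketch of the same logic, assuming crc32c checksums (4 bytes, stored little-endian, with the final inversion done in the kernel by btrfs_csum_final()) and an externally provided crc32c() helper:

#include <stdint.h>
#include <string.h>

#define CSUM_SIZE 32	/* BTRFS_CSUM_SIZE: room reserved in the header */

/* assumed to be provided by the environment */
extern uint32_t crc32c(uint32_t crc, const void *data, size_t len);

static int looks_like_metadata(const uint8_t *block, size_t block_size,
			       const uint8_t *expected_fsid)
{
	uint32_t crc = ~(uint32_t)0;
	uint8_t want[4];

	/* the fsid is stored directly behind the csum field */
	if (memcmp(block + CSUM_SIZE, expected_fsid, 16))
		return 0;

	/* the checksum covers everything behind the csum field itself */
	crc = crc32c(crc, block + CSUM_SIZE, block_size - CSUM_SIZE);
	crc = ~crc;	/* btrfs_csum_final() inverts the running value */

	/* stored little-endian in the first bytes of the header */
	want[0] = crc & 0xff;
	want[1] = (crc >> 8) & 0xff;
	want[2] = (crc >> 16) & 0xff;
	want[3] = (crc >> 24) & 0xff;
	return memcmp(block, want, sizeof(want)) == 0;
}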
1641
1642static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1643 u64 dev_bytenr,
1644 u8 *mapped_data, unsigned int len,
1645 struct bio *bio,
1646 int *bio_is_patched,
1647 struct buffer_head *bh,
1648 int submit_bio_bh_rw)
1649{
1650 int is_metadata;
1651 struct btrfsic_block *block;
1652 struct btrfsic_block_data_ctx block_ctx;
1653 int ret;
1654 struct btrfsic_state *state = dev_state->state;
1655 struct block_device *bdev = dev_state->bdev;
1656
1657 WARN_ON(len > PAGE_SIZE);
1658 is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
1659 if (NULL != bio_is_patched)
1660 *bio_is_patched = 0;
1661
1662 block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
1663 &state->block_hashtable);
1664 if (NULL != block) {
1665 u64 bytenr;
1666 struct list_head *elem_ref_to;
1667 struct list_head *tmp_ref_to;
1668
1669 if (block->is_superblock) {
1670 bytenr = le64_to_cpu(((struct btrfs_super_block *)
1671 mapped_data)->bytenr);
1672 is_metadata = 1;
1673 if (state->print_mask &
1674 BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
1675 printk(KERN_INFO
1676 "[before new superblock is written]:\n");
1677 btrfsic_dump_tree_sub(state, block, 0);
1678 }
1679 }
1680 if (is_metadata) {
1681 if (!block->is_superblock) {
1682 bytenr = le64_to_cpu(((struct btrfs_header *)
1683 mapped_data)->bytenr);
1684 btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
1685 dev_state,
1686 dev_bytenr,
1687 mapped_data);
1688 }
1689 if (block->logical_bytenr != bytenr) {
1690 printk(KERN_INFO
1691 "Written block @%llu (%s/%llu/%d)"
1692 " found in hash table, %c,"
1693 " bytenr mismatch"
1694 " (!= stored %llu).\n",
1695 (unsigned long long)bytenr,
1696 dev_state->name,
1697 (unsigned long long)dev_bytenr,
1698 block->mirror_num,
1699 btrfsic_get_block_type(state, block),
1700 (unsigned long long)
1701 block->logical_bytenr);
1702 block->logical_bytenr = bytenr;
1703 } else if (state->print_mask &
1704 BTRFSIC_PRINT_MASK_VERBOSE)
1705 printk(KERN_INFO
1706 "Written block @%llu (%s/%llu/%d)"
1707 " found in hash table, %c.\n",
1708 (unsigned long long)bytenr,
1709 dev_state->name,
1710 (unsigned long long)dev_bytenr,
1711 block->mirror_num,
1712 btrfsic_get_block_type(state, block));
1713 } else {
1714 bytenr = block->logical_bytenr;
1715 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1716 printk(KERN_INFO
1717 "Written block @%llu (%s/%llu/%d)"
1718 " found in hash table, %c.\n",
1719 (unsigned long long)bytenr,
1720 dev_state->name,
1721 (unsigned long long)dev_bytenr,
1722 block->mirror_num,
1723 btrfsic_get_block_type(state, block));
1724 }
1725
1726 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1727 printk(KERN_INFO
1728 "ref_to_list: %cE, ref_from_list: %cE\n",
1729 list_empty(&block->ref_to_list) ? ' ' : '!',
1730 list_empty(&block->ref_from_list) ? ' ' : '!');
1731 if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
1732 printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
1733 " @%llu (%s/%llu/%d), old(gen=%llu,"
1734 " objectid=%llu, type=%d, offset=%llu),"
1735 " new(gen=%llu),"
1736 " which is referenced by most recent superblock"
1737 " (superblockgen=%llu)!\n",
1738 btrfsic_get_block_type(state, block),
1739 (unsigned long long)bytenr,
1740 dev_state->name,
1741 (unsigned long long)dev_bytenr,
1742 block->mirror_num,
1743 (unsigned long long)block->generation,
1744 (unsigned long long)
1745 le64_to_cpu(block->disk_key.objectid),
1746 block->disk_key.type,
1747 (unsigned long long)
1748 le64_to_cpu(block->disk_key.offset),
1749 (unsigned long long)
1750 le64_to_cpu(((struct btrfs_header *)
1751 mapped_data)->generation),
1752 (unsigned long long)
1753 state->max_superblock_generation);
1754 btrfsic_dump_tree(state);
1755 }
1756
1757 if (!block->is_iodone && !block->never_written) {
1758 printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
1759 " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
1760 " which is not yet iodone!\n",
1761 btrfsic_get_block_type(state, block),
1762 (unsigned long long)bytenr,
1763 dev_state->name,
1764 (unsigned long long)dev_bytenr,
1765 block->mirror_num,
1766 (unsigned long long)block->generation,
1767 (unsigned long long)
1768 le64_to_cpu(((struct btrfs_header *)
1769 mapped_data)->generation));
1770 /* it would not be safe to go on */
1771 btrfsic_dump_tree(state);
1772 return;
1773 }
1774
1775 /*
1776 * Clear all references of this block. Do not free
1777		 * the block itself even if it is not referenced anymore
1778		 * because it still carries valuable information
1779 * like whether it was ever written and IO completed.
1780 */
1781 list_for_each_safe(elem_ref_to, tmp_ref_to,
1782 &block->ref_to_list) {
1783 struct btrfsic_block_link *const l =
1784 list_entry(elem_ref_to,
1785 struct btrfsic_block_link,
1786 node_ref_to);
1787
1788 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1789 btrfsic_print_rem_link(state, l);
1790 l->ref_cnt--;
1791 if (0 == l->ref_cnt) {
1792 list_del(&l->node_ref_to);
1793 list_del(&l->node_ref_from);
1794 btrfsic_block_link_hashtable_remove(l);
1795 btrfsic_block_link_free(l);
1796 }
1797 }
1798
1799 if (block->is_superblock)
1800 ret = btrfsic_map_superblock(state, bytenr, len,
1801 bdev, &block_ctx);
1802 else
1803 ret = btrfsic_map_block(state, bytenr, len,
1804 &block_ctx, 0);
1805 if (ret) {
1806 printk(KERN_INFO
1807 "btrfsic: btrfsic_map_block(root @%llu)"
1808 " failed!\n", (unsigned long long)bytenr);
1809 return;
1810 }
1811 block_ctx.data = mapped_data;
1812		/* the following is required in case of writes to mirrors:
1813		 * use the same device state that was used for the lookup */
1814 block_ctx.dev = dev_state;
1815 block_ctx.dev_bytenr = dev_bytenr;
1816
1817 if (is_metadata || state->include_extent_data) {
1818 block->never_written = 0;
1819 block->iodone_w_error = 0;
1820 if (NULL != bio) {
1821 block->is_iodone = 0;
1822 BUG_ON(NULL == bio_is_patched);
1823 if (!*bio_is_patched) {
1824 block->orig_bio_bh_private =
1825 bio->bi_private;
1826 block->orig_bio_bh_end_io.bio =
1827 bio->bi_end_io;
1828 block->next_in_same_bio = NULL;
1829 bio->bi_private = block;
1830 bio->bi_end_io = btrfsic_bio_end_io;
1831 *bio_is_patched = 1;
1832 } else {
1833 struct btrfsic_block *chained_block =
1834 (struct btrfsic_block *)
1835 bio->bi_private;
1836
1837 BUG_ON(NULL == chained_block);
1838 block->orig_bio_bh_private =
1839 chained_block->orig_bio_bh_private;
1840 block->orig_bio_bh_end_io.bio =
1841 chained_block->orig_bio_bh_end_io.
1842 bio;
1843 block->next_in_same_bio = chained_block;
1844 bio->bi_private = block;
1845 }
1846 } else if (NULL != bh) {
1847 block->is_iodone = 0;
1848 block->orig_bio_bh_private = bh->b_private;
1849 block->orig_bio_bh_end_io.bh = bh->b_end_io;
1850 block->next_in_same_bio = NULL;
1851 bh->b_private = block;
1852 bh->b_end_io = btrfsic_bh_end_io;
1853 } else {
1854 block->is_iodone = 1;
1855 block->orig_bio_bh_private = NULL;
1856 block->orig_bio_bh_end_io.bio = NULL;
1857 block->next_in_same_bio = NULL;
1858 }
1859 }
1860
1861 block->flush_gen = dev_state->last_flush_gen + 1;
1862 block->submit_bio_bh_rw = submit_bio_bh_rw;
1863 if (is_metadata) {
1864 block->logical_bytenr = bytenr;
1865 block->is_metadata = 1;
1866 if (block->is_superblock) {
1867 ret = btrfsic_process_written_superblock(
1868 state,
1869 block,
1870 (struct btrfs_super_block *)
1871 mapped_data);
1872 if (state->print_mask &
1873 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
1874 printk(KERN_INFO
1875 "[after new superblock is written]:\n");
1876 btrfsic_dump_tree_sub(state, block, 0);
1877 }
1878 } else {
1879 block->mirror_num = 0; /* unknown */
1880 ret = btrfsic_process_metablock(
1881 state,
1882 block,
1883 &block_ctx,
1884 (struct btrfs_header *)
1885 block_ctx.data,
1886 0, 0);
1887 }
1888 if (ret)
1889 printk(KERN_INFO
1890 "btrfsic: btrfsic_process_metablock"
1891 "(root @%llu) failed!\n",
1892 (unsigned long long)dev_bytenr);
1893 } else {
1894 block->is_metadata = 0;
1895 block->mirror_num = 0; /* unknown */
1896 block->generation = BTRFSIC_GENERATION_UNKNOWN;
1897 if (!state->include_extent_data
1898 && list_empty(&block->ref_from_list)) {
1899 /*
1900 * disk block is overwritten with extent
1901 * data (not meta data) and we are configured
1902 * to not include extent data: take the
1903 * chance and free the block's memory
1904 */
1905 btrfsic_block_hashtable_remove(block);
1906 list_del(&block->all_blocks_node);
1907 btrfsic_block_free(block);
1908 }
1909 }
1910 btrfsic_release_block_ctx(&block_ctx);
1911 } else {
1912 /* block has not been found in hash table */
1913 u64 bytenr;
1914
1915 if (!is_metadata) {
1916 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1917 printk(KERN_INFO "Written block (%s/%llu/?)"
1918 " !found in hash table, D.\n",
1919 dev_state->name,
1920 (unsigned long long)dev_bytenr);
1921 if (!state->include_extent_data)
1922 return; /* ignore that written D block */
1923
1924 /* this is getting ugly for the
1925 * include_extent_data case... */
1926 bytenr = 0; /* unknown */
1927 block_ctx.start = bytenr;
1928 block_ctx.len = len;
1929 block_ctx.bh = NULL;
1930 } else {
1931 bytenr = le64_to_cpu(((struct btrfs_header *)
1932 mapped_data)->bytenr);
1933 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
1934 dev_bytenr,
1935 mapped_data);
1936 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1937 printk(KERN_INFO
1938 "Written block @%llu (%s/%llu/?)"
1939 " !found in hash table, M.\n",
1940 (unsigned long long)bytenr,
1941 dev_state->name,
1942 (unsigned long long)dev_bytenr);
1943
1944 ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
1945 0);
1946 if (ret) {
1947 printk(KERN_INFO
1948 "btrfsic: btrfsic_map_block(root @%llu)"
1949 " failed!\n",
1950 (unsigned long long)dev_bytenr);
1951 return;
1952 }
1953 }
1954 block_ctx.data = mapped_data;
1955		/* the following is required in case of writes to mirrors:
1956		 * use the same device state that was used for the lookup */
1957 block_ctx.dev = dev_state;
1958 block_ctx.dev_bytenr = dev_bytenr;
1959
1960 block = btrfsic_block_alloc();
1961 if (NULL == block) {
1962 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1963 btrfsic_release_block_ctx(&block_ctx);
1964 return;
1965 }
1966 block->dev_state = dev_state;
1967 block->dev_bytenr = dev_bytenr;
1968 block->logical_bytenr = bytenr;
1969 block->is_metadata = is_metadata;
1970 block->never_written = 0;
1971 block->iodone_w_error = 0;
1972 block->mirror_num = 0; /* unknown */
1973 block->flush_gen = dev_state->last_flush_gen + 1;
1974 block->submit_bio_bh_rw = submit_bio_bh_rw;
1975 if (NULL != bio) {
1976 block->is_iodone = 0;
1977 BUG_ON(NULL == bio_is_patched);
1978 if (!*bio_is_patched) {
1979 block->orig_bio_bh_private = bio->bi_private;
1980 block->orig_bio_bh_end_io.bio = bio->bi_end_io;
1981 block->next_in_same_bio = NULL;
1982 bio->bi_private = block;
1983 bio->bi_end_io = btrfsic_bio_end_io;
1984 *bio_is_patched = 1;
1985 } else {
1986 struct btrfsic_block *chained_block =
1987 (struct btrfsic_block *)
1988 bio->bi_private;
1989
1990 BUG_ON(NULL == chained_block);
1991 block->orig_bio_bh_private =
1992 chained_block->orig_bio_bh_private;
1993 block->orig_bio_bh_end_io.bio =
1994 chained_block->orig_bio_bh_end_io.bio;
1995 block->next_in_same_bio = chained_block;
1996 bio->bi_private = block;
1997 }
1998 } else if (NULL != bh) {
1999 block->is_iodone = 0;
2000 block->orig_bio_bh_private = bh->b_private;
2001 block->orig_bio_bh_end_io.bh = bh->b_end_io;
2002 block->next_in_same_bio = NULL;
2003 bh->b_private = block;
2004 bh->b_end_io = btrfsic_bh_end_io;
2005 } else {
2006 block->is_iodone = 1;
2007 block->orig_bio_bh_private = NULL;
2008 block->orig_bio_bh_end_io.bio = NULL;
2009 block->next_in_same_bio = NULL;
2010 }
2011 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2012 printk(KERN_INFO
2013 "New written %c-block @%llu (%s/%llu/%d)\n",
2014 is_metadata ? 'M' : 'D',
2015 (unsigned long long)block->logical_bytenr,
2016 block->dev_state->name,
2017 (unsigned long long)block->dev_bytenr,
2018 block->mirror_num);
2019 list_add(&block->all_blocks_node, &state->all_blocks_list);
2020 btrfsic_block_hashtable_add(block, &state->block_hashtable);
2021
2022 if (is_metadata) {
2023 ret = btrfsic_process_metablock(state, block,
2024 &block_ctx,
2025 (struct btrfs_header *)
2026 block_ctx.data, 0, 0);
2027 if (ret)
2028 printk(KERN_INFO
2029 "btrfsic: process_metablock(root @%llu)"
2030 " failed!\n",
2031 (unsigned long long)dev_bytenr);
2032 }
2033 btrfsic_release_block_ctx(&block_ctx);
2034 }
2035}
2036
2037static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
2038{
2039 struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
2040 int iodone_w_error;
2041
2042	/* mutex is not held! This is not safe if IO has not yet completed
2043	 * on umount */
2044 iodone_w_error = 0;
2045 if (bio_error_status)
2046 iodone_w_error = 1;
2047
2048 BUG_ON(NULL == block);
2049 bp->bi_private = block->orig_bio_bh_private;
2050 bp->bi_end_io = block->orig_bio_bh_end_io.bio;
2051
2052 do {
2053 struct btrfsic_block *next_block;
2054 struct btrfsic_dev_state *const dev_state = block->dev_state;
2055
2056 if ((dev_state->state->print_mask &
2057 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2058 printk(KERN_INFO
2059 "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
2060 bio_error_status,
2061 btrfsic_get_block_type(dev_state->state, block),
2062 (unsigned long long)block->logical_bytenr,
2063 dev_state->name,
2064 (unsigned long long)block->dev_bytenr,
2065 block->mirror_num);
2066 next_block = block->next_in_same_bio;
2067 block->iodone_w_error = iodone_w_error;
2068 if (block->submit_bio_bh_rw & REQ_FLUSH) {
2069 dev_state->last_flush_gen++;
2070 if ((dev_state->state->print_mask &
2071 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2072 printk(KERN_INFO
2073 "bio_end_io() new %s flush_gen=%llu\n",
2074 dev_state->name,
2075 (unsigned long long)
2076 dev_state->last_flush_gen);
2077 }
2078 if (block->submit_bio_bh_rw & REQ_FUA)
2079 block->flush_gen = 0; /* FUA completed means block is
2080 * on disk */
2081 block->is_iodone = 1; /* for FLUSH, this releases the block */
2082 block = next_block;
2083 } while (NULL != block);
2084
2085 bp->bi_end_io(bp, bio_error_status);
2086}
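
The bi_private/bi_end_io juggling in this handler and in btrfsic_process_written_block() is a generic interposition pattern: save the original completion callback and its context, substitute your own, do the bookkeeping at completion time, then unpatch and chain to the original. Reduced to its core (every name below is invented for illustration):

struct io_request {
	void (*end_io)(struct io_request *rq, int error);
	void *private;
};

struct tracker {
	void (*orig_end_io)(struct io_request *rq, int error);
	void *orig_private;
};

static void tracked_end_io(struct io_request *rq, int error)
{
	struct tracker *t = rq->private;

	/* ... record completion state (iodone, flush generations, ...) ... */
	rq->private = t->orig_private;		/* unpatch */
	rq->end_io = t->orig_end_io;
	rq->end_io(rq, error);			/* chain to the original */
}

static void patch(struct io_request *rq, struct tracker *t)
{
	t->orig_end_io = rq->end_io;
	t->orig_private = rq->private;
	rq->private = t;
	rq->end_io = tracked_end_io;
}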
2087
2088static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
2089{
2090 struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
2091 int iodone_w_error = !uptodate;
2092 struct btrfsic_dev_state *dev_state;
2093
2094 BUG_ON(NULL == block);
2095 dev_state = block->dev_state;
2096 if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2097 printk(KERN_INFO
2098 "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
2099 iodone_w_error,
2100 btrfsic_get_block_type(dev_state->state, block),
2101 (unsigned long long)block->logical_bytenr,
2102 block->dev_state->name,
2103 (unsigned long long)block->dev_bytenr,
2104 block->mirror_num);
2105
2106 block->iodone_w_error = iodone_w_error;
2107 if (block->submit_bio_bh_rw & REQ_FLUSH) {
2108 dev_state->last_flush_gen++;
2109 if ((dev_state->state->print_mask &
2110 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2111 printk(KERN_INFO
2112 "bh_end_io() new %s flush_gen=%llu\n",
2113 dev_state->name,
2114 (unsigned long long)dev_state->last_flush_gen);
2115 }
2116 if (block->submit_bio_bh_rw & REQ_FUA)
2117 block->flush_gen = 0; /* FUA completed means block is on disk */
2118
2119 bh->b_private = block->orig_bio_bh_private;
2120 bh->b_end_io = block->orig_bio_bh_end_io.bh;
2121 block->is_iodone = 1; /* for FLUSH, this releases the block */
2122 bh->b_end_io(bh, uptodate);
2123}
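
/*
 * How the two completion handlers above tie into flushing: a block
 * submitted with REQ_FUA is durable once its IO completes, so its
 * flush_gen is cleared. Every other write is tagged with
 * last_flush_gen + 1 at submission time and only counts as being on
 * stable storage after a subsequent REQ_FLUSH completion advances the
 * device's last_flush_gen past that value.
 */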
2124
2125static int btrfsic_process_written_superblock(
2126 struct btrfsic_state *state,
2127 struct btrfsic_block *const superblock,
2128 struct btrfs_super_block *const super_hdr)
2129{
2130 int pass;
2131
2132 superblock->generation = btrfs_super_generation(super_hdr);
2133 if (!(superblock->generation > state->max_superblock_generation ||
2134 0 == state->max_superblock_generation)) {
2135 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
2136 printk(KERN_INFO
2137 "btrfsic: superblock @%llu (%s/%llu/%d)"
2138 " with old gen %llu <= %llu\n",
2139 (unsigned long long)superblock->logical_bytenr,
2140 superblock->dev_state->name,
2141 (unsigned long long)superblock->dev_bytenr,
2142 superblock->mirror_num,
2143 (unsigned long long)
2144 btrfs_super_generation(super_hdr),
2145 (unsigned long long)
2146 state->max_superblock_generation);
2147 } else {
2148 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
2149 printk(KERN_INFO
2150 "btrfsic: got new superblock @%llu (%s/%llu/%d)"
2151 " with new gen %llu > %llu\n",
2152 (unsigned long long)superblock->logical_bytenr,
2153 superblock->dev_state->name,
2154 (unsigned long long)superblock->dev_bytenr,
2155 superblock->mirror_num,
2156 (unsigned long long)
2157 btrfs_super_generation(super_hdr),
2158 (unsigned long long)
2159 state->max_superblock_generation);
2160
2161 state->max_superblock_generation =
2162 btrfs_super_generation(super_hdr);
2163 state->latest_superblock = superblock;
2164 }
2165
2166 for (pass = 0; pass < 3; pass++) {
2167 int ret;
2168 u64 next_bytenr;
2169 struct btrfsic_block *next_block;
2170 struct btrfsic_block_data_ctx tmp_next_block_ctx;
2171 struct btrfsic_block_link *l;
2172 int num_copies;
2173 int mirror_num;
2174 const char *additional_string = NULL;
2175 struct btrfs_disk_key tmp_disk_key;
2176
2177 tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
2178 tmp_disk_key.offset = 0;
2179
2180 switch (pass) {
2181 case 0:
2182 tmp_disk_key.objectid =
2183 cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
2184 additional_string = "root ";
2185 next_bytenr = btrfs_super_root(super_hdr);
2186 if (state->print_mask &
2187 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2188 printk(KERN_INFO "root@%llu\n",
2189 (unsigned long long)next_bytenr);
2190 break;
2191 case 1:
2192 tmp_disk_key.objectid =
2193 cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
2194 additional_string = "chunk ";
2195 next_bytenr = btrfs_super_chunk_root(super_hdr);
2196 if (state->print_mask &
2197 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2198 printk(KERN_INFO "chunk@%llu\n",
2199 (unsigned long long)next_bytenr);
2200 break;
2201 case 2:
2202 tmp_disk_key.objectid =
2203 cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
2204 additional_string = "log ";
2205 next_bytenr = btrfs_super_log_root(super_hdr);
2206 if (0 == next_bytenr)
2207 continue;
2208 if (state->print_mask &
2209 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2210 printk(KERN_INFO "log@%llu\n",
2211 (unsigned long long)next_bytenr);
2212 break;
2213 }
2214
2215 num_copies =
2216 btrfs_num_copies(&state->root->fs_info->mapping_tree,
2217 next_bytenr, PAGE_SIZE);
2218 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
2219 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
2220 (unsigned long long)next_bytenr, num_copies);
2221 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2222 int was_created;
2223
2224 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2225 printk(KERN_INFO
2226 "btrfsic_process_written_superblock("
2227 "mirror_num=%d)\n", mirror_num);
2228 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
2229 &tmp_next_block_ctx,
2230 mirror_num);
2231 if (ret) {
2232 printk(KERN_INFO
2233 "btrfsic: btrfsic_map_block(@%llu,"
2234 " mirror=%d) failed!\n",
2235 (unsigned long long)next_bytenr,
2236 mirror_num);
2237 return -1;
2238 }
2239
2240 next_block = btrfsic_block_lookup_or_add(
2241 state,
2242 &tmp_next_block_ctx,
2243 additional_string,
2244 1, 0, 1,
2245 mirror_num,
2246 &was_created);
2247 if (NULL == next_block) {
2248 printk(KERN_INFO
2249 "btrfsic: error, kmalloc failed!\n");
2250 btrfsic_release_block_ctx(&tmp_next_block_ctx);
2251 return -1;
2252 }
2253
2254 next_block->disk_key = tmp_disk_key;
2255 if (was_created)
2256 next_block->generation =
2257 BTRFSIC_GENERATION_UNKNOWN;
2258 l = btrfsic_block_link_lookup_or_add(
2259 state,
2260 &tmp_next_block_ctx,
2261 next_block,
2262 superblock,
2263 BTRFSIC_GENERATION_UNKNOWN);
2264 btrfsic_release_block_ctx(&tmp_next_block_ctx);
2265 if (NULL == l)
2266 return -1;
2267 }
2268 }
2269
2270 if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
2271 WARN_ON(1);
2272 btrfsic_dump_tree(state);
2273 }
2274
2275 return 0;
2276}
2277
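btrfsic_process_written_superblock() below first records the newest superblock generation it has seen, then makes three passes over the tree pointers every superblock carries: pass 0 follows btrfs_super_root(), pass 1 btrfs_super_chunk_root(), and pass 2 btrfs_super_log_root() (skipped when the log root is zero); each pointer is then looked up once per mirror reported by btrfs_num_copies(). The pass table, reduced to a sketch with hypothetical names:

/* sketch only: the three pointers checked per pass in the switch below */
struct sb_pass {
	const char *tag;		/* additional_string */
	unsigned long long bytenr;	/* from btrfs_super_*(super_hdr) */
	int may_be_zero;		/* only the log tree may be absent */
};

static void fill_sb_passes(struct sb_pass p[3],
			   unsigned long long root_bytenr,
			   unsigned long long chunk_bytenr,
			   unsigned long long log_bytenr)
{
	p[0] = (struct sb_pass){ "root ",  root_bytenr,  0 };
	p[1] = (struct sb_pass){ "chunk ", chunk_bytenr, 0 };
	p[2] = (struct sb_pass){ "log ",   log_bytenr,   1 };
}
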
2278static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
2279 struct btrfsic_block *const block,
2280 int recursion_level)
2281{
2282 struct list_head *elem_ref_to;
2283 int ret = 0;
2284
2285 if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
2286 /*
2287 * Note that this situation can happen and does not
2288 * indicate an error in regular cases. It happens
2289 * when disk blocks are freed and later reused.
2290 * The check-integrity module is not aware of any
2291 * block free operations, it just recognizes block
2292 * write operations. Therefore it keeps the linkage
2293 * information for a block until a block is
2294 * rewritten. This can temporarily cause incorrect
2295		 * and even circular linkage information. This
2296 * causes no harm unless such blocks are referenced
2297 * by the most recent super block.
2298 */
2299 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2300 printk(KERN_INFO
2301 "btrfsic: abort cyclic linkage (case 1).\n");
2302
2303 return ret;
2304 }
2305
2306 /*
2307	 * Recursion is used here because the amount of stack space
2308	 * used per level is very small and the max recursion depth is limited.
2309 */
2310 list_for_each(elem_ref_to, &block->ref_to_list) {
2311 const struct btrfsic_block_link *const l =
2312 list_entry(elem_ref_to, struct btrfsic_block_link,
2313 node_ref_to);
2314
2315 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2316 printk(KERN_INFO
2317 "rl=%d, %c @%llu (%s/%llu/%d)"
2318 " %u* refers to %c @%llu (%s/%llu/%d)\n",
2319 recursion_level,
2320 btrfsic_get_block_type(state, block),
2321 (unsigned long long)block->logical_bytenr,
2322 block->dev_state->name,
2323 (unsigned long long)block->dev_bytenr,
2324 block->mirror_num,
2325 l->ref_cnt,
2326 btrfsic_get_block_type(state, l->block_ref_to),
2327 (unsigned long long)
2328 l->block_ref_to->logical_bytenr,
2329 l->block_ref_to->dev_state->name,
2330 (unsigned long long)l->block_ref_to->dev_bytenr,
2331 l->block_ref_to->mirror_num);
2332 if (l->block_ref_to->never_written) {
2333 printk(KERN_INFO "btrfs: attempt to write superblock"
2334 " which references block %c @%llu (%s/%llu/%d)"
2335 " which is never written!\n",
2336 btrfsic_get_block_type(state, l->block_ref_to),
2337 (unsigned long long)
2338 l->block_ref_to->logical_bytenr,
2339 l->block_ref_to->dev_state->name,
2340 (unsigned long long)l->block_ref_to->dev_bytenr,
2341 l->block_ref_to->mirror_num);
2342 ret = -1;
2343 } else if (!l->block_ref_to->is_iodone) {
2344 printk(KERN_INFO "btrfs: attempt to write superblock"
2345 " which references block %c @%llu (%s/%llu/%d)"
2346 " which is not yet iodone!\n",
2347 btrfsic_get_block_type(state, l->block_ref_to),
2348 (unsigned long long)
2349 l->block_ref_to->logical_bytenr,
2350 l->block_ref_to->dev_state->name,
2351 (unsigned long long)l->block_ref_to->dev_bytenr,
2352 l->block_ref_to->mirror_num);
2353 ret = -1;
2354 } else if (l->parent_generation !=
2355 l->block_ref_to->generation &&
2356 BTRFSIC_GENERATION_UNKNOWN !=
2357 l->parent_generation &&
2358 BTRFSIC_GENERATION_UNKNOWN !=
2359 l->block_ref_to->generation) {
2360 printk(KERN_INFO "btrfs: attempt to write superblock"
2361 " which references block %c @%llu (%s/%llu/%d)"
2362 " with generation %llu !="
2363 " parent generation %llu!\n",
2364 btrfsic_get_block_type(state, l->block_ref_to),
2365 (unsigned long long)
2366 l->block_ref_to->logical_bytenr,
2367 l->block_ref_to->dev_state->name,
2368 (unsigned long long)l->block_ref_to->dev_bytenr,
2369 l->block_ref_to->mirror_num,
2370 (unsigned long long)l->block_ref_to->generation,
2371 (unsigned long long)l->parent_generation);
2372 ret = -1;
2373 } else if (l->block_ref_to->flush_gen >
2374 l->block_ref_to->dev_state->last_flush_gen) {
2375 printk(KERN_INFO "btrfs: attempt to write superblock"
2376 " which references block %c @%llu (%s/%llu/%d)"
2377 " which is not flushed out of disk's write cache"
2378 " (block flush_gen=%llu,"
2379 " dev->flush_gen=%llu)!\n",
2380 btrfsic_get_block_type(state, l->block_ref_to),
2381 (unsigned long long)
2382 l->block_ref_to->logical_bytenr,
2383 l->block_ref_to->dev_state->name,
2384 (unsigned long long)l->block_ref_to->dev_bytenr,
2385 l->block_ref_to->mirror_num,
2386 (unsigned long long)block->flush_gen,
2387 (unsigned long long)
2388 l->block_ref_to->dev_state->last_flush_gen);
2389 ret = -1;
2390 } else if (-1 == btrfsic_check_all_ref_blocks(state,
2391 l->block_ref_to,
2392 recursion_level +
2393 1)) {
2394 ret = -1;
2395 }
2396 }
2397
2398 return ret;
2399}
2400
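The walk above is deliberately recursive but refuses to descend past 3 + BTRFS_MAX_LEVEL levels; since the module never sees block-free operations, stale and even circular ref_to links can exist, and the depth cap is what turns a cycle into a harmless early return instead of a stack overflow. The guard pattern in isolation (a sketch, toy names):

#include <stdbool.h>

#define TOY_MAX_LEVEL	8	/* stand-in for BTRFS_MAX_LEVEL */

struct toy_node { const struct toy_node *ref; /* one edge for brevity */ };

static bool toy_check_refs(const struct toy_node *n, int depth)
{
	if (depth >= 3 + TOY_MAX_LEVEL)
		return true;	/* abort cyclic linkage, as in case 1 above */
	if (!n->ref)
		return true;
	return toy_check_refs(n->ref, depth + 1);
}
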
2401static int btrfsic_is_block_ref_by_superblock(
2402 const struct btrfsic_state *state,
2403 const struct btrfsic_block *block,
2404 int recursion_level)
2405{
2406 struct list_head *elem_ref_from;
2407
2408 if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
2409 /* refer to comment at "abort cyclic linkage (case 1)" */
2410 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2411 printk(KERN_INFO
2412 "btrfsic: abort cyclic linkage (case 2).\n");
2413
2414 return 0;
2415 }
2416
2417 /*
2418	 * Recursion is used here because the amount of stack space used
2419	 * per level is very small and the max recursion depth is limited.
2420 */
2421 list_for_each(elem_ref_from, &block->ref_from_list) {
2422 const struct btrfsic_block_link *const l =
2423 list_entry(elem_ref_from, struct btrfsic_block_link,
2424 node_ref_from);
2425
2426 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2427 printk(KERN_INFO
2428 "rl=%d, %c @%llu (%s/%llu/%d)"
2429 " is ref %u* from %c @%llu (%s/%llu/%d)\n",
2430 recursion_level,
2431 btrfsic_get_block_type(state, block),
2432 (unsigned long long)block->logical_bytenr,
2433 block->dev_state->name,
2434 (unsigned long long)block->dev_bytenr,
2435 block->mirror_num,
2436 l->ref_cnt,
2437 btrfsic_get_block_type(state, l->block_ref_from),
2438 (unsigned long long)
2439 l->block_ref_from->logical_bytenr,
2440 l->block_ref_from->dev_state->name,
2441 (unsigned long long)
2442 l->block_ref_from->dev_bytenr,
2443 l->block_ref_from->mirror_num);
2444 if (l->block_ref_from->is_superblock &&
2445 state->latest_superblock->dev_bytenr ==
2446 l->block_ref_from->dev_bytenr &&
2447 state->latest_superblock->dev_state->bdev ==
2448 l->block_ref_from->dev_state->bdev)
2449 return 1;
2450 else if (btrfsic_is_block_ref_by_superblock(state,
2451 l->block_ref_from,
2452 recursion_level +
2453 1))
2454 return 1;
2455 }
2456
2457 return 0;
2458}
2459
2460static void btrfsic_print_add_link(const struct btrfsic_state *state,
2461 const struct btrfsic_block_link *l)
2462{
2463 printk(KERN_INFO
2464 "Add %u* link from %c @%llu (%s/%llu/%d)"
2465 " to %c @%llu (%s/%llu/%d).\n",
2466 l->ref_cnt,
2467 btrfsic_get_block_type(state, l->block_ref_from),
2468 (unsigned long long)l->block_ref_from->logical_bytenr,
2469 l->block_ref_from->dev_state->name,
2470 (unsigned long long)l->block_ref_from->dev_bytenr,
2471 l->block_ref_from->mirror_num,
2472 btrfsic_get_block_type(state, l->block_ref_to),
2473 (unsigned long long)l->block_ref_to->logical_bytenr,
2474 l->block_ref_to->dev_state->name,
2475 (unsigned long long)l->block_ref_to->dev_bytenr,
2476 l->block_ref_to->mirror_num);
2477}
2478
2479static void btrfsic_print_rem_link(const struct btrfsic_state *state,
2480 const struct btrfsic_block_link *l)
2481{
2482 printk(KERN_INFO
2483 "Rem %u* link from %c @%llu (%s/%llu/%d)"
2484 " to %c @%llu (%s/%llu/%d).\n",
2485 l->ref_cnt,
2486 btrfsic_get_block_type(state, l->block_ref_from),
2487 (unsigned long long)l->block_ref_from->logical_bytenr,
2488 l->block_ref_from->dev_state->name,
2489 (unsigned long long)l->block_ref_from->dev_bytenr,
2490 l->block_ref_from->mirror_num,
2491 btrfsic_get_block_type(state, l->block_ref_to),
2492 (unsigned long long)l->block_ref_to->logical_bytenr,
2493 l->block_ref_to->dev_state->name,
2494 (unsigned long long)l->block_ref_to->dev_bytenr,
2495 l->block_ref_to->mirror_num);
2496}
2497
2498static char btrfsic_get_block_type(const struct btrfsic_state *state,
2499 const struct btrfsic_block *block)
2500{
2501 if (block->is_superblock &&
2502 state->latest_superblock->dev_bytenr == block->dev_bytenr &&
2503 state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
2504 return 'S';
2505 else if (block->is_superblock)
2506 return 's';
2507 else if (block->is_metadata)
2508 return 'M';
2509 else
2510 return 'D';
2511}
2512
2513static void btrfsic_dump_tree(const struct btrfsic_state *state)
2514{
2515 btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
2516}
2517
2518static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
2519 const struct btrfsic_block *block,
2520 int indent_level)
2521{
2522 struct list_head *elem_ref_to;
2523 int indent_add;
2524 static char buf[80];
2525 int cursor_position;
2526
2527 /*
2528	 * It would be better to fill an on-stack buffer with a complete line and
2529 * dump it at once when it is time to print a newline character.
2530 */
2531
2532 /*
2533	 * Recursion is used here because the amount of stack space used
2534	 * per level is very small and the max recursion depth is limited.
2535 */
2536 indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
2537 btrfsic_get_block_type(state, block),
2538 (unsigned long long)block->logical_bytenr,
2539 block->dev_state->name,
2540 (unsigned long long)block->dev_bytenr,
2541 block->mirror_num);
2542 if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
2543 printk("[...]\n");
2544 return;
2545 }
2546 printk(buf);
2547 indent_level += indent_add;
2548 if (list_empty(&block->ref_to_list)) {
2549 printk("\n");
2550 return;
2551 }
2552 if (block->mirror_num > 1 &&
2553 !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
2554 printk(" [...]\n");
2555 return;
2556 }
2557
2558 cursor_position = indent_level;
2559 list_for_each(elem_ref_to, &block->ref_to_list) {
2560 const struct btrfsic_block_link *const l =
2561 list_entry(elem_ref_to, struct btrfsic_block_link,
2562 node_ref_to);
2563
2564 while (cursor_position < indent_level) {
2565 printk(" ");
2566 cursor_position++;
2567 }
2568 if (l->ref_cnt > 1)
2569 indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
2570 else
2571 indent_add = sprintf(buf, " --> ");
2572 if (indent_level + indent_add >
2573 BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
2574 printk("[...]\n");
2575 cursor_position = 0;
2576 continue;
2577 }
2578
2579 printk(buf);
2580
2581 btrfsic_dump_tree_sub(state, l->block_ref_to,
2582 indent_level + indent_add);
2583 cursor_position = 0;
2584 }
2585}
2586
2587static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
2588 struct btrfsic_state *state,
2589 struct btrfsic_block_data_ctx *next_block_ctx,
2590 struct btrfsic_block *next_block,
2591 struct btrfsic_block *from_block,
2592 u64 parent_generation)
2593{
2594 struct btrfsic_block_link *l;
2595
2596 l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
2597 next_block_ctx->dev_bytenr,
2598 from_block->dev_state->bdev,
2599 from_block->dev_bytenr,
2600 &state->block_link_hashtable);
2601 if (NULL == l) {
2602 l = btrfsic_block_link_alloc();
2603 if (NULL == l) {
2604 printk(KERN_INFO
2605			       "btrfsic: error, kmalloc failed!\n");
2606 return NULL;
2607 }
2608
2609 l->block_ref_to = next_block;
2610 l->block_ref_from = from_block;
2611 l->ref_cnt = 1;
2612 l->parent_generation = parent_generation;
2613
2614 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2615 btrfsic_print_add_link(state, l);
2616
2617 list_add(&l->node_ref_to, &from_block->ref_to_list);
2618 list_add(&l->node_ref_from, &next_block->ref_from_list);
2619
2620 btrfsic_block_link_hashtable_add(l,
2621 &state->block_link_hashtable);
2622 } else {
2623 l->ref_cnt++;
2624 l->parent_generation = parent_generation;
2625 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2626 btrfsic_print_add_link(state, l);
2627 }
2628
2629 return l;
2630}
2631
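btrfsic_block_link_lookup_or_add() above is a plain lookup-or-add: a hashtable hit just increments ref_cnt and refreshes parent_generation, while a miss allocates a link, chains it into both blocks' lists and adds it to the hashtable. The shape of the pattern in miniature (user-space sketch, hypothetical names):

#include <stdlib.h>

struct toy_link { int key; int ref_cnt; struct toy_link *next; };

static struct toy_link *toy_lookup_or_add(struct toy_link **head, int key)
{
	struct toy_link *l;

	for (l = *head; l; l = l->next)	/* lookup */
		if (l->key == key) {
			l->ref_cnt++;	/* hit: one more reference */
			return l;
		}

	l = calloc(1, sizeof(*l));	/* miss: allocate and insert */
	if (!l)
		return NULL;		/* caller reports the failure */
	l->key = key;
	l->ref_cnt = 1;
	l->next = *head;
	*head = l;
	return l;
}
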
2632static struct btrfsic_block *btrfsic_block_lookup_or_add(
2633 struct btrfsic_state *state,
2634 struct btrfsic_block_data_ctx *block_ctx,
2635 const char *additional_string,
2636 int is_metadata,
2637 int is_iodone,
2638 int never_written,
2639 int mirror_num,
2640 int *was_created)
2641{
2642 struct btrfsic_block *block;
2643
2644 block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
2645 block_ctx->dev_bytenr,
2646 &state->block_hashtable);
2647 if (NULL == block) {
2648 struct btrfsic_dev_state *dev_state;
2649
2650 block = btrfsic_block_alloc();
2651 if (NULL == block) {
2652 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
2653 return NULL;
2654 }
2655 dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
2656 if (NULL == dev_state) {
2657 printk(KERN_INFO
2658 "btrfsic: error, lookup dev_state failed!\n");
2659 btrfsic_block_free(block);
2660 return NULL;
2661 }
2662 block->dev_state = dev_state;
2663 block->dev_bytenr = block_ctx->dev_bytenr;
2664 block->logical_bytenr = block_ctx->start;
2665 block->is_metadata = is_metadata;
2666 block->is_iodone = is_iodone;
2667 block->never_written = never_written;
2668 block->mirror_num = mirror_num;
2669 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2670 printk(KERN_INFO
2671 "New %s%c-block @%llu (%s/%llu/%d)\n",
2672 additional_string,
2673 btrfsic_get_block_type(state, block),
2674 (unsigned long long)block->logical_bytenr,
2675 dev_state->name,
2676 (unsigned long long)block->dev_bytenr,
2677 mirror_num);
2678 list_add(&block->all_blocks_node, &state->all_blocks_list);
2679 btrfsic_block_hashtable_add(block, &state->block_hashtable);
2680 if (NULL != was_created)
2681 *was_created = 1;
2682 } else {
2683 if (NULL != was_created)
2684 *was_created = 0;
2685 }
2686
2687 return block;
2688}
2689
2690static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2691 u64 bytenr,
2692 struct btrfsic_dev_state *dev_state,
2693 u64 dev_bytenr, char *data)
2694{
2695 int num_copies;
2696 int mirror_num;
2697 int ret;
2698 struct btrfsic_block_data_ctx block_ctx;
2699 int match = 0;
2700
2701 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
2702 bytenr, PAGE_SIZE);
2703
2704 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2705 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
2706 &block_ctx, mirror_num);
2707 if (ret) {
2708 printk(KERN_INFO "btrfsic:"
2709 " btrfsic_map_block(logical @%llu,"
2710 " mirror %d) failed!\n",
2711 (unsigned long long)bytenr, mirror_num);
2712 continue;
2713 }
2714
2715 if (dev_state->bdev == block_ctx.dev->bdev &&
2716 dev_bytenr == block_ctx.dev_bytenr) {
2717 match++;
2718 btrfsic_release_block_ctx(&block_ctx);
2719 break;
2720 }
2721 btrfsic_release_block_ctx(&block_ctx);
2722 }
2723
2724 if (!match) {
2725 printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
2726 " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
2727 " phys_bytenr=%llu)!\n",
2728 (unsigned long long)bytenr, dev_state->name,
2729 (unsigned long long)dev_bytenr);
2730 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2731 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
2732 &block_ctx, mirror_num);
2733 if (ret)
2734 continue;
2735
2736 printk(KERN_INFO "Read logical bytenr @%llu maps to"
2737 " (%s/%llu/%d)\n",
2738 (unsigned long long)bytenr,
2739 block_ctx.dev->name,
2740 (unsigned long long)block_ctx.dev_bytenr,
2741 mirror_num);
2742 }
2743 WARN_ON(1);
2744 }
2745}
2746
2747static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
2748 struct block_device *bdev)
2749{
2750 struct btrfsic_dev_state *ds;
2751
2752 ds = btrfsic_dev_state_hashtable_lookup(bdev,
2753 &btrfsic_dev_state_hashtable);
2754 return ds;
2755}
2756
2757int btrfsic_submit_bh(int rw, struct buffer_head *bh)
2758{
2759 struct btrfsic_dev_state *dev_state;
2760
2761 if (!btrfsic_is_initialized)
2762 return submit_bh(rw, bh);
2763
2764 mutex_lock(&btrfsic_mutex);
2765 /* since btrfsic_submit_bh() might also be called before
2766 * btrfsic_mount(), this might return NULL */
2767 dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
2768
2769 /* Only called to write the superblock (incl. FLUSH/FUA) */
2770 if (NULL != dev_state &&
2771 (rw & WRITE) && bh->b_size > 0) {
2772 u64 dev_bytenr;
2773
2774 dev_bytenr = 4096 * bh->b_blocknr;
2775 if (dev_state->state->print_mask &
2776 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2777 printk(KERN_INFO
2778 "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
2779 " size=%lu, data=%p, bdev=%p)\n",
2780 rw, bh->b_blocknr,
2781 (unsigned long long)dev_bytenr, bh->b_size,
2782 bh->b_data, bh->b_bdev);
2783 btrfsic_process_written_block(dev_state, dev_bytenr,
2784 bh->b_data, bh->b_size, NULL,
2785 NULL, bh, rw);
2786 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2787 if (dev_state->state->print_mask &
2788 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2789 printk(KERN_INFO
2790			       "submit_bh(rw=0x%x) FLUSH, bdev=%p\n",
2791 rw, bh->b_bdev);
2792 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2793 if ((dev_state->state->print_mask &
2794 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2795 BTRFSIC_PRINT_MASK_VERBOSE)))
2796 printk(KERN_INFO
2797 "btrfsic_submit_bh(%s) with FLUSH"
2798 " but dummy block already in use"
2799 " (ignored)!\n",
2800 dev_state->name);
2801 } else {
2802 struct btrfsic_block *const block =
2803 &dev_state->dummy_block_for_bio_bh_flush;
2804
2805 block->is_iodone = 0;
2806 block->never_written = 0;
2807 block->iodone_w_error = 0;
2808 block->flush_gen = dev_state->last_flush_gen + 1;
2809 block->submit_bio_bh_rw = rw;
2810 block->orig_bio_bh_private = bh->b_private;
2811 block->orig_bio_bh_end_io.bh = bh->b_end_io;
2812 block->next_in_same_bio = NULL;
2813 bh->b_private = block;
2814 bh->b_end_io = btrfsic_bh_end_io;
2815 }
2816 }
2817 mutex_unlock(&btrfsic_mutex);
2818 return submit_bh(rw, bh);
2819}
2820
2821void btrfsic_submit_bio(int rw, struct bio *bio)
2822{
2823 struct btrfsic_dev_state *dev_state;
2824
2825 if (!btrfsic_is_initialized) {
2826 submit_bio(rw, bio);
2827 return;
2828 }
2829
2830 mutex_lock(&btrfsic_mutex);
2831 /* since btrfsic_submit_bio() is also called before
2832 * btrfsic_mount(), this might return NULL */
2833 dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
2834 if (NULL != dev_state &&
2835 (rw & WRITE) && NULL != bio->bi_io_vec) {
2836 unsigned int i;
2837 u64 dev_bytenr;
2838 int bio_is_patched;
2839
2840 dev_bytenr = 512 * bio->bi_sector;
2841 bio_is_patched = 0;
2842 if (dev_state->state->print_mask &
2843 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2844 printk(KERN_INFO
2845 "submit_bio(rw=0x%x, bi_vcnt=%u,"
2846 " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
2847 rw, bio->bi_vcnt, bio->bi_sector,
2848 (unsigned long long)dev_bytenr,
2849 bio->bi_bdev);
2850
2851 for (i = 0; i < bio->bi_vcnt; i++) {
2852 u8 *mapped_data;
2853
2854 mapped_data = kmap(bio->bi_io_vec[i].bv_page);
2855 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2856 BTRFSIC_PRINT_MASK_VERBOSE) ==
2857 (dev_state->state->print_mask &
2858 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2859 BTRFSIC_PRINT_MASK_VERBOSE)))
2860 printk(KERN_INFO
2861 "#%u: page=%p, mapped=%p, len=%u,"
2862 " offset=%u\n",
2863 i, bio->bi_io_vec[i].bv_page,
2864 mapped_data,
2865 bio->bi_io_vec[i].bv_len,
2866 bio->bi_io_vec[i].bv_offset);
2867 btrfsic_process_written_block(dev_state, dev_bytenr,
2868 mapped_data,
2869 bio->bi_io_vec[i].bv_len,
2870 bio, &bio_is_patched,
2871 NULL, rw);
2872 kunmap(bio->bi_io_vec[i].bv_page);
2873 dev_bytenr += bio->bi_io_vec[i].bv_len;
2874 }
2875 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2876 if (dev_state->state->print_mask &
2877 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2878 printk(KERN_INFO
2879			       "submit_bio(rw=0x%x) FLUSH, bdev=%p\n",
2880 rw, bio->bi_bdev);
2881 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2882 if ((dev_state->state->print_mask &
2883 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2884 BTRFSIC_PRINT_MASK_VERBOSE)))
2885 printk(KERN_INFO
2886 "btrfsic_submit_bio(%s) with FLUSH"
2887 " but dummy block already in use"
2888 " (ignored)!\n",
2889 dev_state->name);
2890 } else {
2891 struct btrfsic_block *const block =
2892 &dev_state->dummy_block_for_bio_bh_flush;
2893
2894 block->is_iodone = 0;
2895 block->never_written = 0;
2896 block->iodone_w_error = 0;
2897 block->flush_gen = dev_state->last_flush_gen + 1;
2898 block->submit_bio_bh_rw = rw;
2899 block->orig_bio_bh_private = bio->bi_private;
2900 block->orig_bio_bh_end_io.bio = bio->bi_end_io;
2901 block->next_in_same_bio = NULL;
2902 bio->bi_private = block;
2903 bio->bi_end_io = btrfsic_bio_end_io;
2904 }
2905 }
2906 mutex_unlock(&btrfsic_mutex);
2907
2908 submit_bio(rw, bio);
2909}
2910
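Note the two different unit conversions in the wrappers above: btrfsic_submit_bh() derives the device byte offset as 4096 * bh->b_blocknr (the buffer heads intercepted here are the 4096-byte superblock buffers), whereas btrfsic_submit_bio() uses 512 * bio->bi_sector, because bi_sector is always counted in 512-byte sectors regardless of the block size. In isolation:

/* the two byte-offset conversions used above, as plain helpers */
static unsigned long long bh_bytenr(unsigned long blocknr)
{
	return 4096ULL * blocknr;	/* assumes 4096-byte buffer heads */
}

static unsigned long long bio_bytenr(unsigned long long sector)
{
	return 512ULL * sector;		/* bi_sector is in 512-byte units */
}
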
2911int btrfsic_mount(struct btrfs_root *root,
2912 struct btrfs_fs_devices *fs_devices,
2913 int including_extent_data, u32 print_mask)
2914{
2915 int ret;
2916 struct btrfsic_state *state;
2917 struct list_head *dev_head = &fs_devices->devices;
2918 struct btrfs_device *device;
2919
2920 state = kzalloc(sizeof(*state), GFP_NOFS);
2921 if (NULL == state) {
2922 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
2923 return -1;
2924 }
2925
2926 if (!btrfsic_is_initialized) {
2927 mutex_init(&btrfsic_mutex);
2928 btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
2929 btrfsic_is_initialized = 1;
2930 }
2931 mutex_lock(&btrfsic_mutex);
2932 state->root = root;
2933 state->print_mask = print_mask;
2934 state->include_extent_data = including_extent_data;
2935 state->csum_size = 0;
2936 INIT_LIST_HEAD(&state->all_blocks_list);
2937 btrfsic_block_hashtable_init(&state->block_hashtable);
2938 btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
2939 state->max_superblock_generation = 0;
2940 state->latest_superblock = NULL;
2941
2942 list_for_each_entry(device, dev_head, dev_list) {
2943 struct btrfsic_dev_state *ds;
2944 char *p;
2945
2946 if (!device->bdev || !device->name)
2947 continue;
2948
2949 ds = btrfsic_dev_state_alloc();
2950 if (NULL == ds) {
2951 printk(KERN_INFO
2952 "btrfs check-integrity: kmalloc() failed!\n");
2953 mutex_unlock(&btrfsic_mutex);
2954 return -1;
2955 }
2956 ds->bdev = device->bdev;
2957 ds->state = state;
2958 bdevname(ds->bdev, ds->name);
2959 ds->name[BDEVNAME_SIZE - 1] = '\0';
2960 for (p = ds->name; *p != '\0'; p++);
2961 while (p > ds->name && *p != '/')
2962 p--;
2963 if (*p == '/')
2964 p++;
2965 strlcpy(ds->name, p, sizeof(ds->name));
2966 btrfsic_dev_state_hashtable_add(ds,
2967 &btrfsic_dev_state_hashtable);
2968 }
2969
2970 ret = btrfsic_process_superblock(state, fs_devices);
2971 if (0 != ret) {
2972 mutex_unlock(&btrfsic_mutex);
2973 btrfsic_unmount(root, fs_devices);
2974 return ret;
2975 }
2976
2977 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
2978 btrfsic_dump_database(state);
2979 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
2980 btrfsic_dump_tree(state);
2981
2982 mutex_unlock(&btrfsic_mutex);
2983 return 0;
2984}
2985
2986void btrfsic_unmount(struct btrfs_root *root,
2987 struct btrfs_fs_devices *fs_devices)
2988{
2989 struct list_head *elem_all;
2990 struct list_head *tmp_all;
2991 struct btrfsic_state *state;
2992 struct list_head *dev_head = &fs_devices->devices;
2993 struct btrfs_device *device;
2994
2995 if (!btrfsic_is_initialized)
2996 return;
2997
2998 mutex_lock(&btrfsic_mutex);
2999
3000 state = NULL;
3001 list_for_each_entry(device, dev_head, dev_list) {
3002 struct btrfsic_dev_state *ds;
3003
3004 if (!device->bdev || !device->name)
3005 continue;
3006
3007 ds = btrfsic_dev_state_hashtable_lookup(
3008 device->bdev,
3009 &btrfsic_dev_state_hashtable);
3010 if (NULL != ds) {
3011 state = ds->state;
3012 btrfsic_dev_state_hashtable_remove(ds);
3013 btrfsic_dev_state_free(ds);
3014 }
3015 }
3016
3017 if (NULL == state) {
3018 printk(KERN_INFO
3019 "btrfsic: error, cannot find state information"
3020 " on umount!\n");
3021 mutex_unlock(&btrfsic_mutex);
3022 return;
3023 }
3024
3025 /*
3026 * Don't care about keeping the lists' state up to date,
3027 * just free all memory that was allocated dynamically.
3028 * Free the blocks and the block_links.
3029 */
3030 list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
3031 struct btrfsic_block *const b_all =
3032 list_entry(elem_all, struct btrfsic_block,
3033 all_blocks_node);
3034 struct list_head *elem_ref_to;
3035 struct list_head *tmp_ref_to;
3036
3037 list_for_each_safe(elem_ref_to, tmp_ref_to,
3038 &b_all->ref_to_list) {
3039 struct btrfsic_block_link *const l =
3040 list_entry(elem_ref_to,
3041 struct btrfsic_block_link,
3042 node_ref_to);
3043
3044 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
3045 btrfsic_print_rem_link(state, l);
3046
3047 l->ref_cnt--;
3048 if (0 == l->ref_cnt)
3049 btrfsic_block_link_free(l);
3050 }
3051
3052 if (b_all->is_iodone)
3053 btrfsic_block_free(b_all);
3054 else
3055 printk(KERN_INFO "btrfs: attempt to free %c-block"
3056 " @%llu (%s/%llu/%d) on umount which is"
3057 " not yet iodone!\n",
3058 btrfsic_get_block_type(state, b_all),
3059 (unsigned long long)b_all->logical_bytenr,
3060 b_all->dev_state->name,
3061 (unsigned long long)b_all->dev_bytenr,
3062 b_all->mirror_num);
3063 }
3064
3065 mutex_unlock(&btrfsic_mutex);
3066
3067 kfree(state);
3068}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644
index 000000000000..8b59175cc502
--- /dev/null
+++ b/fs/btrfs/check-integrity.h
@@ -0,0 +1,36 @@
1/*
2 * Copyright (C) STRATO AG 2011. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#if !defined(__BTRFS_CHECK_INTEGRITY__)
20#define __BTRFS_CHECK_INTEGRITY__
21
22#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
23int btrfsic_submit_bh(int rw, struct buffer_head *bh);
24void btrfsic_submit_bio(int rw, struct bio *bio);
25#else
26#define btrfsic_submit_bh submit_bh
27#define btrfsic_submit_bio submit_bio
28#endif
29
30int btrfsic_mount(struct btrfs_root *root,
31 struct btrfs_fs_devices *fs_devices,
32 int including_extent_data, u32 print_mask);
33void btrfsic_unmount(struct btrfs_root *root,
34 struct btrfs_fs_devices *fs_devices);
35
36#endif
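The header above keeps call sites unconditional: with CONFIG_BTRFS_FS_CHECK_INTEGRITY disabled, btrfsic_submit_bh and btrfsic_submit_bio are plain macro aliases for submit_bh and submit_bio, so callers are written identically either way. An illustrative (not in-tree) call site:

/* illustrative only: with the config option off, this compiles
 * straight to submit_bio(rw, bio) through the macros above */
static void example_submit(int rw, struct bio *bio)
{
	btrfsic_submit_bio(rw, bio);
}
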
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dede441bdeee..0639a555e16e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
240 240
241 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 241 cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
242 new_root_objectid, &disk_key, level, 242 new_root_objectid, &disk_key, level,
243 buf->start, 0); 243 buf->start, 0, 1);
244 if (IS_ERR(cow)) 244 if (IS_ERR(cow))
245 return PTR_ERR(cow); 245 return PTR_ERR(cow);
246 246
@@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
261 261
262 WARN_ON(btrfs_header_generation(buf) > trans->transid); 262 WARN_ON(btrfs_header_generation(buf) > trans->transid);
263 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 263 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
264 ret = btrfs_inc_ref(trans, root, cow, 1); 264 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
265 else 265 else
266 ret = btrfs_inc_ref(trans, root, cow, 0); 266 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
267 267
268 if (ret) 268 if (ret)
269 return ret; 269 return ret;
@@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
350 if ((owner == root->root_key.objectid || 350 if ((owner == root->root_key.objectid ||
351 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && 351 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
352 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 352 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
353 ret = btrfs_inc_ref(trans, root, buf, 1); 353 ret = btrfs_inc_ref(trans, root, buf, 1, 1);
354 BUG_ON(ret); 354 BUG_ON(ret);
355 355
356 if (root->root_key.objectid == 356 if (root->root_key.objectid ==
357 BTRFS_TREE_RELOC_OBJECTID) { 357 BTRFS_TREE_RELOC_OBJECTID) {
358 ret = btrfs_dec_ref(trans, root, buf, 0); 358 ret = btrfs_dec_ref(trans, root, buf, 0, 1);
359 BUG_ON(ret); 359 BUG_ON(ret);
360 ret = btrfs_inc_ref(trans, root, cow, 1); 360 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
361 BUG_ON(ret); 361 BUG_ON(ret);
362 } 362 }
363 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 363 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
365 365
366 if (root->root_key.objectid == 366 if (root->root_key.objectid ==
367 BTRFS_TREE_RELOC_OBJECTID) 367 BTRFS_TREE_RELOC_OBJECTID)
368 ret = btrfs_inc_ref(trans, root, cow, 1); 368 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
369 else 369 else
370 ret = btrfs_inc_ref(trans, root, cow, 0); 370 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
371 BUG_ON(ret); 371 BUG_ON(ret);
372 } 372 }
373 if (new_flags != 0) { 373 if (new_flags != 0) {
@@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
381 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 381 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
382 if (root->root_key.objectid == 382 if (root->root_key.objectid ==
383 BTRFS_TREE_RELOC_OBJECTID) 383 BTRFS_TREE_RELOC_OBJECTID)
384 ret = btrfs_inc_ref(trans, root, cow, 1); 384 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
385 else 385 else
386 ret = btrfs_inc_ref(trans, root, cow, 0); 386 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
387 BUG_ON(ret); 387 BUG_ON(ret);
388 ret = btrfs_dec_ref(trans, root, buf, 1); 388 ret = btrfs_dec_ref(trans, root, buf, 1, 1);
389 BUG_ON(ret); 389 BUG_ON(ret);
390 } 390 }
391 clean_tree_block(trans, root, buf); 391 clean_tree_block(trans, root, buf);
@@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
446 446
447 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 447 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
448 root->root_key.objectid, &disk_key, 448 root->root_key.objectid, &disk_key,
449 level, search_start, empty_size); 449 level, search_start, empty_size, 1);
450 if (IS_ERR(cow)) 450 if (IS_ERR(cow))
451 return PTR_ERR(cow); 451 return PTR_ERR(cow);
452 452
@@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
484 rcu_assign_pointer(root->node, cow); 484 rcu_assign_pointer(root->node, cow);
485 485
486 btrfs_free_tree_block(trans, root, buf, parent_start, 486 btrfs_free_tree_block(trans, root, buf, parent_start,
487 last_ref); 487 last_ref, 1);
488 free_extent_buffer(buf); 488 free_extent_buffer(buf);
489 add_root_to_dirty_list(root); 489 add_root_to_dirty_list(root);
490 } else { 490 } else {
@@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
500 trans->transid); 500 trans->transid);
501 btrfs_mark_buffer_dirty(parent); 501 btrfs_mark_buffer_dirty(parent);
502 btrfs_free_tree_block(trans, root, buf, parent_start, 502 btrfs_free_tree_block(trans, root, buf, parent_start,
503 last_ref); 503 last_ref, 1);
504 } 504 }
505 if (unlock_orig) 505 if (unlock_orig)
506 btrfs_tree_unlock(buf); 506 btrfs_tree_unlock(buf);
@@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
957 free_extent_buffer(mid); 957 free_extent_buffer(mid);
958 958
959 root_sub_used(root, mid->len); 959 root_sub_used(root, mid->len);
960 btrfs_free_tree_block(trans, root, mid, 0, 1); 960 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
961 /* once for the root ptr */ 961 /* once for the root ptr */
962 free_extent_buffer(mid); 962 free_extent_buffer(mid);
963 return 0; 963 return 0;
@@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1015 if (wret) 1015 if (wret)
1016 ret = wret; 1016 ret = wret;
1017 root_sub_used(root, right->len); 1017 root_sub_used(root, right->len);
1018 btrfs_free_tree_block(trans, root, right, 0, 1); 1018 btrfs_free_tree_block(trans, root, right, 0, 1, 0);
1019 free_extent_buffer(right); 1019 free_extent_buffer(right);
1020 right = NULL; 1020 right = NULL;
1021 } else { 1021 } else {
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1055 if (wret) 1055 if (wret)
1056 ret = wret; 1056 ret = wret;
1057 root_sub_used(root, mid->len); 1057 root_sub_used(root, mid->len);
1058 btrfs_free_tree_block(trans, root, mid, 0, 1); 1058 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
1059 free_extent_buffer(mid); 1059 free_extent_buffer(mid);
1060 mid = NULL; 1060 mid = NULL;
1061 } else { 1061 } else {
@@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2089 2089
2090 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2090 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2091 root->root_key.objectid, &lower_key, 2091 root->root_key.objectid, &lower_key,
2092 level, root->node->start, 0); 2092 level, root->node->start, 0, 0);
2093 if (IS_ERR(c)) 2093 if (IS_ERR(c))
2094 return PTR_ERR(c); 2094 return PTR_ERR(c);
2095 2095
@@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2216 2216
2217 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2217 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2218 root->root_key.objectid, 2218 root->root_key.objectid,
2219 &disk_key, level, c->start, 0); 2219 &disk_key, level, c->start, 0, 0);
2220 if (IS_ERR(split)) 2220 if (IS_ERR(split))
2221 return PTR_ERR(split); 2221 return PTR_ERR(split);
2222 2222
@@ -2970,7 +2970,7 @@ again:
2970 2970
2971 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 2971 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2972 root->root_key.objectid, 2972 root->root_key.objectid,
2973 &disk_key, 0, l->start, 0); 2973 &disk_key, 0, l->start, 0, 0);
2974 if (IS_ERR(right)) 2974 if (IS_ERR(right))
2975 return PTR_ERR(right); 2975 return PTR_ERR(right);
2976 2976
@@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3781 3781
3782 root_sub_used(root, leaf->len); 3782 root_sub_used(root, leaf->len);
3783 3783
3784 btrfs_free_tree_block(trans, root, leaf, 0, 1); 3784 btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
3785 return 0; 3785 return 0;
3786} 3786}
3787/* 3787/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323d..3c2cbf7b6663 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
86/* holds checksums of all the data extents */ 86/* holds checksums of all the data extents */
87#define BTRFS_CSUM_TREE_OBJECTID 7ULL 87#define BTRFS_CSUM_TREE_OBJECTID 7ULL
88 88
89/* for storing balance parameters in the root tree */
90#define BTRFS_BALANCE_OBJECTID -4ULL
91
 89/* orphan objectid for tracking unlinked/truncated files */ 92/* orphan objectid for tracking unlinked/truncated files */
90#define BTRFS_ORPHAN_OBJECTID -5ULL 93#define BTRFS_ORPHAN_OBJECTID -5ULL
91 94
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
692 __le16 name_len; 695 __le16 name_len;
693} __attribute__ ((__packed__)); 696} __attribute__ ((__packed__));
694 697
698struct btrfs_disk_balance_args {
699 /*
700 * profiles to operate on, single is denoted by
701 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
702 */
703 __le64 profiles;
704
705 /* usage filter */
706 __le64 usage;
707
708 /* devid filter */
709 __le64 devid;
710
711 /* devid subset filter [pstart..pend) */
712 __le64 pstart;
713 __le64 pend;
714
715 /* btrfs virtual address space subset filter [vstart..vend) */
716 __le64 vstart;
717 __le64 vend;
718
719 /*
720 * profile to convert to, single is denoted by
721 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
722 */
723 __le64 target;
724
725 /* BTRFS_BALANCE_ARGS_* */
726 __le64 flags;
727
728 __le64 unused[8];
729} __attribute__ ((__packed__));
730
731/*
732 * store balance parameters to disk so that balance can be properly
733 * resumed after crash or unmount
734 */
735struct btrfs_balance_item {
736 /* BTRFS_BALANCE_* */
737 __le64 flags;
738
739 struct btrfs_disk_balance_args data;
740 struct btrfs_disk_balance_args meta;
741 struct btrfs_disk_balance_args sys;
742
743 __le64 unused[4];
744} __attribute__ ((__packed__));
745
695#define BTRFS_FILE_EXTENT_INLINE 0 746#define BTRFS_FILE_EXTENT_INLINE 0
696#define BTRFS_FILE_EXTENT_REG 1 747#define BTRFS_FILE_EXTENT_REG 1
697#define BTRFS_FILE_EXTENT_PREALLOC 2 748#define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +802,32 @@ struct btrfs_csum_item {
751} __attribute__ ((__packed__)); 802} __attribute__ ((__packed__));
752 803
753/* different types of block groups (and chunks) */ 804/* different types of block groups (and chunks) */
754#define BTRFS_BLOCK_GROUP_DATA (1 << 0) 805#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
755#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) 806#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
756#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) 807#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
757#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) 808#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
758#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 809#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
759#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 810#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
760#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 811#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
761#define BTRFS_NR_RAID_TYPES 5 812#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
813#define BTRFS_NR_RAID_TYPES 5
814
815#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
816 BTRFS_BLOCK_GROUP_SYSTEM | \
817 BTRFS_BLOCK_GROUP_METADATA)
818
819#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
820 BTRFS_BLOCK_GROUP_RAID1 | \
821 BTRFS_BLOCK_GROUP_DUP | \
822 BTRFS_BLOCK_GROUP_RAID10)
823/*
824 * We need a bit for restriper to be able to tell when chunks of type
825 * SINGLE are available. This "extended" profile format is used in
826 * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
827 * (on-disk). The corresponding on-disk bit in chunk.type is reserved
828 * to avoid remappings between two formats in future.
829 */
830#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
762 831
763struct btrfs_block_group_item { 832struct btrfs_block_group_item {
764 __le64 used; 833 __le64 used;
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache {
916struct reloc_control; 985struct reloc_control;
917struct btrfs_device; 986struct btrfs_device;
918struct btrfs_fs_devices; 987struct btrfs_fs_devices;
988struct btrfs_balance_control;
919struct btrfs_delayed_root; 989struct btrfs_delayed_root;
920struct btrfs_fs_info { 990struct btrfs_fs_info {
921 u8 fsid[BTRFS_FSID_SIZE]; 991 u8 fsid[BTRFS_FSID_SIZE];
@@ -971,7 +1041,7 @@ struct btrfs_fs_info {
971 * is required instead of the faster short fsync log commits 1041 * is required instead of the faster short fsync log commits
972 */ 1042 */
973 u64 last_trans_log_full_commit; 1043 u64 last_trans_log_full_commit;
974 unsigned long mount_opt:20; 1044 unsigned long mount_opt:21;
975 unsigned long compress_type:4; 1045 unsigned long compress_type:4;
976 u64 max_inline; 1046 u64 max_inline;
977 u64 alloc_start; 1047 u64 alloc_start;
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info {
1132 spinlock_t ref_cache_lock; 1202 spinlock_t ref_cache_lock;
1133 u64 total_ref_cache_size; 1203 u64 total_ref_cache_size;
1134 1204
1205 /*
1206 * these three are in extended format (availability of single
1207 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
1208 * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
1209 */
1135 u64 avail_data_alloc_bits; 1210 u64 avail_data_alloc_bits;
1136 u64 avail_metadata_alloc_bits; 1211 u64 avail_metadata_alloc_bits;
1137 u64 avail_system_alloc_bits; 1212 u64 avail_system_alloc_bits;
1138 u64 data_alloc_profile; 1213
1139 u64 metadata_alloc_profile; 1214 /* restriper state */
1140 u64 system_alloc_profile; 1215 spinlock_t balance_lock;
1216 struct mutex balance_mutex;
1217 atomic_t balance_running;
1218 atomic_t balance_pause_req;
1219 atomic_t balance_cancel_req;
1220 struct btrfs_balance_control *balance_ctl;
1221 wait_queue_head_t balance_wait_q;
1141 1222
1142 unsigned data_chunk_allocations; 1223 unsigned data_chunk_allocations;
1143 unsigned metadata_ratio; 1224 unsigned metadata_ratio;
@@ -1155,6 +1236,10 @@ struct btrfs_fs_info {
1155 int scrub_workers_refcnt; 1236 int scrub_workers_refcnt;
1156 struct btrfs_workers scrub_workers; 1237 struct btrfs_workers scrub_workers;
1157 1238
1239#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1240 u32 check_integrity_print_mask;
1241#endif
1242
1158 /* filesystem state */ 1243 /* filesystem state */
1159 u64 fs_state; 1244 u64 fs_state;
1160 1245
@@ -1383,6 +1468,8 @@ struct btrfs_ioctl_defrag_range_args {
1383#define BTRFS_DEV_ITEM_KEY 216 1468#define BTRFS_DEV_ITEM_KEY 216
1384#define BTRFS_CHUNK_ITEM_KEY 228 1469#define BTRFS_CHUNK_ITEM_KEY 228
1385 1470
1471#define BTRFS_BALANCE_ITEM_KEY 248
1472
1386/* 1473/*
1387 * string items are for debugging. They just store a short string of 1474 * string items are for debugging. They just store a short string of
1388 * data in the FS 1475 * data in the FS
@@ -1413,6 +1500,9 @@ struct btrfs_ioctl_defrag_range_args {
1413#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1500#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1414#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 1501#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1415#define BTRFS_MOUNT_RECOVERY (1 << 18) 1502#define BTRFS_MOUNT_RECOVERY (1 << 18)
1503#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
1504#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
1505#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
1416 1506
1417#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1507#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1418#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1508#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2077,8 +2167,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
2077BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, 2167BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
2078 num_devices, 64); 2168 num_devices, 64);
2079 2169
2080/* struct btrfs_super_block */ 2170/* struct btrfs_balance_item */
2171BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
2081 2172
2173static inline void btrfs_balance_data(struct extent_buffer *eb,
2174 struct btrfs_balance_item *bi,
2175 struct btrfs_disk_balance_args *ba)
2176{
2177 read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
2178}
2179
2180static inline void btrfs_set_balance_data(struct extent_buffer *eb,
2181 struct btrfs_balance_item *bi,
2182 struct btrfs_disk_balance_args *ba)
2183{
2184 write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
2185}
2186
2187static inline void btrfs_balance_meta(struct extent_buffer *eb,
2188 struct btrfs_balance_item *bi,
2189 struct btrfs_disk_balance_args *ba)
2190{
2191 read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
2192}
2193
2194static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
2195 struct btrfs_balance_item *bi,
2196 struct btrfs_disk_balance_args *ba)
2197{
2198 write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
2199}
2200
2201static inline void btrfs_balance_sys(struct extent_buffer *eb,
2202 struct btrfs_balance_item *bi,
2203 struct btrfs_disk_balance_args *ba)
2204{
2205 read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
2206}
2207
2208static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
2209 struct btrfs_balance_item *bi,
2210 struct btrfs_disk_balance_args *ba)
2211{
2212 write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
2213}
2214
2215static inline void
2216btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
2217 struct btrfs_disk_balance_args *disk)
2218{
2219 memset(cpu, 0, sizeof(*cpu));
2220
2221 cpu->profiles = le64_to_cpu(disk->profiles);
2222 cpu->usage = le64_to_cpu(disk->usage);
2223 cpu->devid = le64_to_cpu(disk->devid);
2224 cpu->pstart = le64_to_cpu(disk->pstart);
2225 cpu->pend = le64_to_cpu(disk->pend);
2226 cpu->vstart = le64_to_cpu(disk->vstart);
2227 cpu->vend = le64_to_cpu(disk->vend);
2228 cpu->target = le64_to_cpu(disk->target);
2229 cpu->flags = le64_to_cpu(disk->flags);
2230}
2231
2232static inline void
2233btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
2234 struct btrfs_balance_args *cpu)
2235{
2236 memset(disk, 0, sizeof(*disk));
2237
2238 disk->profiles = cpu_to_le64(cpu->profiles);
2239 disk->usage = cpu_to_le64(cpu->usage);
2240 disk->devid = cpu_to_le64(cpu->devid);
2241 disk->pstart = cpu_to_le64(cpu->pstart);
2242 disk->pend = cpu_to_le64(cpu->pend);
2243 disk->vstart = cpu_to_le64(cpu->vstart);
2244 disk->vend = cpu_to_le64(cpu->vend);
2245 disk->target = cpu_to_le64(cpu->target);
2246 disk->flags = cpu_to_le64(cpu->flags);
2247}
2248
2249/* struct btrfs_super_block */
2082BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 2250BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
2083BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); 2251BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
2084BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, 2252BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
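The btrfs_disk_balance_args_to_cpu()/btrfs_cpu_balance_args_to_disk() pair above is a field-by-field little-endian conversion, with a memset() first so the unused[] padding reaches disk as zeroes rather than stale stack bytes. The same pattern on a two-field toy struct (user-space sketch; le64toh/htole64 stand in for the kernel's le64_to_cpu/cpu_to_le64):

#include <endian.h>
#include <string.h>

struct toy_disk { unsigned long long a_le, b_le; };	/* on-disk, LE */
struct toy_cpu  { unsigned long long a, b; };		/* in-memory */

static void toy_to_cpu(struct toy_cpu *cpu, const struct toy_disk *disk)
{
	memset(cpu, 0, sizeof(*cpu));
	cpu->a = le64toh(disk->a_le);
	cpu->b = le64toh(disk->b_le);
}

static void toy_to_disk(struct toy_disk *disk, const struct toy_cpu *cpu)
{
	memset(disk, 0, sizeof(*disk));	/* zero any padding before write */
	disk->a_le = htole64(cpu->a);
	disk->b_le = htole64(cpu->b);
}
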
@@ -2277,11 +2445,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2277 struct btrfs_root *root, u32 blocksize, 2445 struct btrfs_root *root, u32 blocksize,
2278 u64 parent, u64 root_objectid, 2446 u64 parent, u64 root_objectid,
2279 struct btrfs_disk_key *key, int level, 2447 struct btrfs_disk_key *key, int level,
2280 u64 hint, u64 empty_size); 2448 u64 hint, u64 empty_size, int for_cow);
2281void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2449void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
2282 struct btrfs_root *root, 2450 struct btrfs_root *root,
2283 struct extent_buffer *buf, 2451 struct extent_buffer *buf,
2284 u64 parent, int last_ref); 2452 u64 parent, int last_ref, int for_cow);
2285struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2453struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2286 struct btrfs_root *root, 2454 struct btrfs_root *root,
2287 u64 bytenr, u32 blocksize, 2455 u64 bytenr, u32 blocksize,
@@ -2301,17 +2469,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2301 u64 search_end, struct btrfs_key *ins, 2469 u64 search_end, struct btrfs_key *ins,
2302 u64 data); 2470 u64 data);
2303int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2471int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2304 struct extent_buffer *buf, int full_backref); 2472 struct extent_buffer *buf, int full_backref, int for_cow);
2305int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2473int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2306 struct extent_buffer *buf, int full_backref); 2474 struct extent_buffer *buf, int full_backref, int for_cow);
2307int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2475int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2308 struct btrfs_root *root, 2476 struct btrfs_root *root,
2309 u64 bytenr, u64 num_bytes, u64 flags, 2477 u64 bytenr, u64 num_bytes, u64 flags,
2310 int is_data); 2478 int is_data);
2311int btrfs_free_extent(struct btrfs_trans_handle *trans, 2479int btrfs_free_extent(struct btrfs_trans_handle *trans,
2312 struct btrfs_root *root, 2480 struct btrfs_root *root,
2313 u64 bytenr, u64 num_bytes, u64 parent, 2481 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
2314 u64 root_objectid, u64 owner, u64 offset); 2482 u64 owner, u64 offset, int for_cow);
2315 2483
2316int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2484int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2317int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 2485int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -2323,7 +2491,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2323int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2491int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2324 struct btrfs_root *root, 2492 struct btrfs_root *root,
2325 u64 bytenr, u64 num_bytes, u64 parent, 2493 u64 bytenr, u64 num_bytes, u64 parent,
2326 u64 root_objectid, u64 owner, u64 offset); 2494 u64 root_objectid, u64 owner, u64 offset, int for_cow);
2327 2495
2328int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2496int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2329 struct btrfs_root *root); 2497 struct btrfs_root *root);
@@ -2482,10 +2650,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2482} 2650}
2483 2651
2484int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2652int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2653static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
2654{
2655 ++p->slots[0];
2656 if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
2657 return btrfs_next_leaf(root, p);
2658 return 0;
2659}
2485int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2660int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2486int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2661int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2487void btrfs_drop_snapshot(struct btrfs_root *root, 2662void btrfs_drop_snapshot(struct btrfs_root *root,
2488 struct btrfs_block_rsv *block_rsv, int update_ref); 2663 struct btrfs_block_rsv *block_rsv, int update_ref,
2664 int for_reloc);
2489int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2665int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2490 struct btrfs_root *root, 2666 struct btrfs_root *root,
2491 struct extent_buffer *node, 2667 struct extent_buffer *node,
@@ -2500,6 +2676,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2500} 2676}
2501static inline void free_fs_info(struct btrfs_fs_info *fs_info) 2677static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2502{ 2678{
2679 kfree(fs_info->balance_ctl);
2503 kfree(fs_info->delayed_root); 2680 kfree(fs_info->delayed_root);
2504 kfree(fs_info->extent_root); 2681 kfree(fs_info->extent_root);
2505 kfree(fs_info->tree_root); 2682 kfree(fs_info->tree_root);
@@ -2510,6 +2687,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2510 kfree(fs_info->super_for_commit); 2687 kfree(fs_info->super_for_commit);
2511 kfree(fs_info); 2688 kfree(fs_info);
2512} 2689}
2690/**
2691 * profile_is_valid - tests whether a given profile is valid and reduced
2692 * @flags: profile to validate
2693 * @extended: if true @flags is treated as an extended profile
2694 */
2695static inline int profile_is_valid(u64 flags, int extended)
2696{
2697 u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
2698
2699 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
2700 if (extended)
2701 mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2702
2703 if (flags & mask)
2704 return 0;
2705 /* true if zero or exactly one bit set */
2706 return (flags & (~flags + 1)) == flags;
2707}
2513 2708
2514/* root-item.c */ 2709/* root-item.c */
2515int btrfs_find_root_ref(struct btrfs_root *tree_root, 2710int btrfs_find_root_ref(struct btrfs_root *tree_root,
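profile_is_valid() above closes with the two's-complement trick that flags & (~flags + 1) (equivalently flags & -flags) isolates the lowest set bit; comparing the result to flags itself therefore accepts exactly the values with zero or one bit set, i.e. a reduced profile. For example:

/* flags & (~flags + 1) keeps only the lowest set bit:
 *   0x00 -> 0x00  equal      -> valid (no profile bit)
 *   0x08 -> 0x08  equal      -> valid (single profile)
 *   0x18 -> 0x08  not equal  -> invalid (two profile bits)
 */
static int at_most_one_bit_set(unsigned long long flags)
{
	return (flags & (~flags + 1)) == flags;
}
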
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 9c1eccc2c503..fe4cd0f1cef1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
598 if (!ret) 598 if (!ret) {
599 trace_btrfs_space_reservation(root->fs_info, "delayed_item",
600 item->key.objectid,
601 num_bytes, 1);
599 item->bytes_reserved = num_bytes; 602 item->bytes_reserved = num_bytes;
603 }
600 604
601 return ret; 605 return ret;
602} 606}
@@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
610 return; 614 return;
611 615
612 rsv = &root->fs_info->delayed_block_rsv; 616 rsv = &root->fs_info->delayed_block_rsv;
617 trace_btrfs_space_reservation(root->fs_info, "delayed_item",
618 item->key.objectid, item->bytes_reserved,
619 0);
613 btrfs_block_rsv_release(root, rsv, 620 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 621 item->bytes_reserved);
615} 622}
@@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata(
624 struct btrfs_block_rsv *dst_rsv; 631 struct btrfs_block_rsv *dst_rsv;
625 u64 num_bytes; 632 u64 num_bytes;
626 int ret; 633 int ret;
627 int release = false; 634 bool release = false;
628 635
629 src_rsv = trans->block_rsv; 636 src_rsv = trans->block_rsv;
630 dst_rsv = &root->fs_info->delayed_block_rsv; 637 dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata(
651 */ 658 */
652 if (ret == -EAGAIN) 659 if (ret == -EAGAIN)
653 ret = -ENOSPC; 660 ret = -ENOSPC;
654 if (!ret) 661 if (!ret) {
655 node->bytes_reserved = num_bytes; 662 node->bytes_reserved = num_bytes;
663 trace_btrfs_space_reservation(root->fs_info,
664 "delayed_inode",
665 btrfs_ino(inode),
666 num_bytes, 1);
667 }
656 return ret; 668 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 669 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock); 670 spin_lock(&BTRFS_I(inode)->lock);
@@ -707,11 +719,17 @@ out:
707 * reservation here. I think it may be time for a documentation page on 719 * reservation here. I think it may be time for a documentation page on
708 * how block rsvs work. 720 * how block rsvs work.
709 */ 721 */
710 if (!ret) 722 if (!ret) {
723 trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
724 btrfs_ino(inode), num_bytes, 1);
711 node->bytes_reserved = num_bytes; 725 node->bytes_reserved = num_bytes;
726 }
712 727
713 if (release) 728 if (release) {
729 trace_btrfs_space_reservation(root->fs_info, "delalloc",
730 btrfs_ino(inode), num_bytes, 0);
714 btrfs_block_rsv_release(root, src_rsv, num_bytes); 731 btrfs_block_rsv_release(root, src_rsv, num_bytes);
732 }
715 733
716 return ret; 734 return ret;
717} 735}
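
Every trace_btrfs_space_reservation() call added in this series follows the same convention, inferred from the call sites rather than documented anywhere: fs_info, a string naming the reservation type, an identifying u64 (objectid, inode number, or a cast pointer), the byte count, and a final flag that is 1 on reserve and 0 on release. Matching the paired events is what makes the space leak tracepoints useful; an unbalanced pair is a leak. Sketch of the pairing:

/* illustrative only -- mirrors the call sites above */
trace_btrfs_space_reservation(root->fs_info, "delayed_item",
			      item->key.objectid, num_bytes, 1);
/* ... reservation is live ... */
trace_btrfs_space_reservation(root->fs_info, "delayed_item",
			      item->key.objectid, num_bytes, 0);
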
@@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
725 return; 743 return;
726 744
727 rsv = &root->fs_info->delayed_block_rsv; 745 rsv = &root->fs_info->delayed_block_rsv;
746 trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
747 node->inode_id, node->bytes_reserved, 0);
728 btrfs_block_rsv_release(root, rsv, 748 btrfs_block_rsv_release(root, rsv,
729 node->bytes_reserved); 749 node->bytes_reserved);
730 node->bytes_reserved = 0; 750 node->bytes_reserved = 0;
@@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1372 goto release_node; 1392 goto release_node;
1373 } 1393 }
1374 1394
1375 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1376 /*
1377 * we have reserved enough space when we start a new transaction,
1378 * so reserving metadata failure is impossible
1379 */
1380 BUG_ON(ret);
1381
1382 delayed_item->key.objectid = btrfs_ino(dir); 1395 delayed_item->key.objectid = btrfs_ino(dir);
1383 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); 1396 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
1384 delayed_item->key.offset = index; 1397 delayed_item->key.offset = index;
@@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1391 dir_item->type = type; 1404 dir_item->type = type;
1392 memcpy((char *)(dir_item + 1), name, name_len); 1405 memcpy((char *)(dir_item + 1), name, name_len);
1393 1406
1407 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1408 /*
1409 * we have reserved enough space when we start a new transaction,
1410 * so reserving metadata failure is impossible
1411 */
1412 BUG_ON(ret);
1413
1414
1394 mutex_lock(&delayed_node->mutex); 1415 mutex_lock(&delayed_node->mutex);
1395 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); 1416 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
1396 if (unlikely(ret)) { 1417 if (unlikely(ret)) {
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 125cf76fcd08..66e4f29505a3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
101 return -1; 101 return -1;
102 if (ref1->type > ref2->type) 102 if (ref1->type > ref2->type)
103 return 1; 103 return 1;
104 /* merging of sequenced refs is not allowed */
105 if (ref1->seq < ref2->seq)
106 return -1;
107 if (ref1->seq > ref2->seq)
108 return 1;
104 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || 109 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
105 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { 110 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
106 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), 111 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
150 155
151/* 156/*
152 * find a head entry based on bytenr. This returns the delayed ref 157 * find a head entry based on bytenr. This returns the delayed ref
153 * head if it was able to find one, or NULL if nothing was in that spot 158 * head if it was able to find one, or NULL if nothing was in that spot.
159 * If return_bigger is given, the next bigger entry is returned if no exact
160 * match is found.
154 */ 161 */
155static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, 162static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
156 u64 bytenr, 163 u64 bytenr,
157 struct btrfs_delayed_ref_node **last) 164 struct btrfs_delayed_ref_node **last,
165 int return_bigger)
158{ 166{
159 struct rb_node *n = root->rb_node; 167 struct rb_node *n;
160 struct btrfs_delayed_ref_node *entry; 168 struct btrfs_delayed_ref_node *entry;
161 int cmp; 169 int cmp = 0;
162 170
171again:
172 n = root->rb_node;
173 entry = NULL;
163 while (n) { 174 while (n) {
164 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 175 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
165 WARN_ON(!entry->in_tree); 176 WARN_ON(!entry->in_tree);
@@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
182 else 193 else
183 return entry; 194 return entry;
184 } 195 }
196 if (entry && return_bigger) {
197 if (cmp > 0) {
198 n = rb_next(&entry->rb_node);
199 if (!n)
200 n = rb_first(root);
201 entry = rb_entry(n, struct btrfs_delayed_ref_node,
202 rb_node);
203 bytenr = entry->bytenr;
204 return_bigger = 0;
205 goto again;
206 }
207 return entry;
208 }
185 return NULL; 209 return NULL;
186} 210}
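
With return_bigger set, a miss in find_ref_head() now returns the next head after bytenr instead of NULL, wrapping around to rb_first() once bytenr is past the last entry. Assuming heads at bytenrs 100, 200 and 300:

/* find_ref_head(root, 150, NULL, 0) -> NULL      (no exact match)
 * find_ref_head(root, 150, NULL, 1) -> head @200 (next bigger)
 * find_ref_head(root, 350, NULL, 1) -> head @100 (wrap-around)
 */

btrfs_find_ref_cluster() below relies on the wrap-around form: by searching from start + 1 it resumes after the last processed head and still cycles through the whole tree.
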
187 211
@@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
209 return 0; 233 return 0;
210} 234}
211 235
236int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
237 u64 seq)
238{
239 struct seq_list *elem;
240
241 assert_spin_locked(&delayed_refs->lock);
242 if (list_empty(&delayed_refs->seq_head))
243 return 0;
244
245 elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
246 if (seq >= elem->seq) {
247 pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
248 seq, elem->seq, delayed_refs);
249 return 1;
250 }
251 return 0;
252}
253
212int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 254int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
213 struct list_head *cluster, u64 start) 255 struct list_head *cluster, u64 start)
214{ 256{
@@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
223 node = rb_first(&delayed_refs->root); 265 node = rb_first(&delayed_refs->root);
224 } else { 266 } else {
225 ref = NULL; 267 ref = NULL;
226 find_ref_head(&delayed_refs->root, start, &ref); 268 find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
227 if (ref) { 269 if (ref) {
228 struct btrfs_delayed_ref_node *tmp;
229
230 node = rb_prev(&ref->rb_node);
231 while (node) {
232 tmp = rb_entry(node,
233 struct btrfs_delayed_ref_node,
234 rb_node);
235 if (tmp->bytenr < start)
236 break;
237 ref = tmp;
238 node = rb_prev(&ref->rb_node);
239 }
240 node = &ref->rb_node; 270 node = &ref->rb_node;
241 } else 271 } else
242 node = rb_first(&delayed_refs->root); 272 node = rb_first(&delayed_refs->root);
@@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
390 * this does all the dirty work in terms of maintaining the correct 420 * this does all the dirty work in terms of maintaining the correct
391 * overall modification count. 421 * overall modification count.
392 */ 422 */
393static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, 423static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
424 struct btrfs_trans_handle *trans,
394 struct btrfs_delayed_ref_node *ref, 425 struct btrfs_delayed_ref_node *ref,
395 u64 bytenr, u64 num_bytes, 426 u64 bytenr, u64 num_bytes,
396 int action, int is_data) 427 int action, int is_data)
@@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
437 ref->action = 0; 468 ref->action = 0;
438 ref->is_head = 1; 469 ref->is_head = 1;
439 ref->in_tree = 1; 470 ref->in_tree = 1;
471 ref->seq = 0;
440 472
441 head_ref = btrfs_delayed_node_to_head(ref); 473 head_ref = btrfs_delayed_node_to_head(ref);
442 head_ref->must_insert_reserved = must_insert_reserved; 474 head_ref->must_insert_reserved = must_insert_reserved;
@@ -468,14 +500,17 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
468/* 500/*
469 * helper to insert a delayed tree ref into the rbtree. 501 * helper to insert a delayed tree ref into the rbtree.
470 */ 502 */
471static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, 503static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
504 struct btrfs_trans_handle *trans,
472 struct btrfs_delayed_ref_node *ref, 505 struct btrfs_delayed_ref_node *ref,
473 u64 bytenr, u64 num_bytes, u64 parent, 506 u64 bytenr, u64 num_bytes, u64 parent,
474 u64 ref_root, int level, int action) 507 u64 ref_root, int level, int action,
508 int for_cow)
475{ 509{
476 struct btrfs_delayed_ref_node *existing; 510 struct btrfs_delayed_ref_node *existing;
477 struct btrfs_delayed_tree_ref *full_ref; 511 struct btrfs_delayed_tree_ref *full_ref;
478 struct btrfs_delayed_ref_root *delayed_refs; 512 struct btrfs_delayed_ref_root *delayed_refs;
513 u64 seq = 0;
479 514
480 if (action == BTRFS_ADD_DELAYED_EXTENT) 515 if (action == BTRFS_ADD_DELAYED_EXTENT)
481 action = BTRFS_ADD_DELAYED_REF; 516 action = BTRFS_ADD_DELAYED_REF;
@@ -491,14 +526,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
491 ref->is_head = 0; 526 ref->is_head = 0;
492 ref->in_tree = 1; 527 ref->in_tree = 1;
493 528
529 if (need_ref_seq(for_cow, ref_root))
530 seq = inc_delayed_seq(delayed_refs);
531 ref->seq = seq;
532
494 full_ref = btrfs_delayed_node_to_tree_ref(ref); 533 full_ref = btrfs_delayed_node_to_tree_ref(ref);
495 if (parent) { 534 full_ref->parent = parent;
496 full_ref->parent = parent; 535 full_ref->root = ref_root;
536 if (parent)
497 ref->type = BTRFS_SHARED_BLOCK_REF_KEY; 537 ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
498 } else { 538 else
499 full_ref->root = ref_root;
500 ref->type = BTRFS_TREE_BLOCK_REF_KEY; 539 ref->type = BTRFS_TREE_BLOCK_REF_KEY;
501 }
502 full_ref->level = level; 540 full_ref->level = level;
503 541
504 trace_btrfs_delayed_tree_ref(ref, full_ref, action); 542 trace_btrfs_delayed_tree_ref(ref, full_ref, action);
@@ -522,15 +560,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
522/* 560/*
523 * helper to insert a delayed data ref into the rbtree. 561 * helper to insert a delayed data ref into the rbtree.
524 */ 562 */
525static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, 563static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
564 struct btrfs_trans_handle *trans,
526 struct btrfs_delayed_ref_node *ref, 565 struct btrfs_delayed_ref_node *ref,
527 u64 bytenr, u64 num_bytes, u64 parent, 566 u64 bytenr, u64 num_bytes, u64 parent,
528 u64 ref_root, u64 owner, u64 offset, 567 u64 ref_root, u64 owner, u64 offset,
529 int action) 568 int action, int for_cow)
530{ 569{
531 struct btrfs_delayed_ref_node *existing; 570 struct btrfs_delayed_ref_node *existing;
532 struct btrfs_delayed_data_ref *full_ref; 571 struct btrfs_delayed_data_ref *full_ref;
533 struct btrfs_delayed_ref_root *delayed_refs; 572 struct btrfs_delayed_ref_root *delayed_refs;
573 u64 seq = 0;
534 574
535 if (action == BTRFS_ADD_DELAYED_EXTENT) 575 if (action == BTRFS_ADD_DELAYED_EXTENT)
536 action = BTRFS_ADD_DELAYED_REF; 576 action = BTRFS_ADD_DELAYED_REF;
@@ -546,14 +586,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
546 ref->is_head = 0; 586 ref->is_head = 0;
547 ref->in_tree = 1; 587 ref->in_tree = 1;
548 588
589 if (need_ref_seq(for_cow, ref_root))
590 seq = inc_delayed_seq(delayed_refs);
591 ref->seq = seq;
592
549 full_ref = btrfs_delayed_node_to_data_ref(ref); 593 full_ref = btrfs_delayed_node_to_data_ref(ref);
550 if (parent) { 594 full_ref->parent = parent;
551 full_ref->parent = parent; 595 full_ref->root = ref_root;
596 if (parent)
552 ref->type = BTRFS_SHARED_DATA_REF_KEY; 597 ref->type = BTRFS_SHARED_DATA_REF_KEY;
553 } else { 598 else
554 full_ref->root = ref_root;
555 ref->type = BTRFS_EXTENT_DATA_REF_KEY; 599 ref->type = BTRFS_EXTENT_DATA_REF_KEY;
556 } 600
557 full_ref->objectid = owner; 601 full_ref->objectid = owner;
558 full_ref->offset = offset; 602 full_ref->offset = offset;
559 603
@@ -580,10 +624,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
580 * to make sure the delayed ref is eventually processed before this 624 * to make sure the delayed ref is eventually processed before this
581 * transaction commits. 625 * transaction commits.
582 */ 626 */
583int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 627int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
628 struct btrfs_trans_handle *trans,
584 u64 bytenr, u64 num_bytes, u64 parent, 629 u64 bytenr, u64 num_bytes, u64 parent,
585 u64 ref_root, int level, int action, 630 u64 ref_root, int level, int action,
586 struct btrfs_delayed_extent_op *extent_op) 631 struct btrfs_delayed_extent_op *extent_op,
632 int for_cow)
587{ 633{
588 struct btrfs_delayed_tree_ref *ref; 634 struct btrfs_delayed_tree_ref *ref;
589 struct btrfs_delayed_ref_head *head_ref; 635 struct btrfs_delayed_ref_head *head_ref;
@@ -610,13 +656,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
610 * insert both the head node and the new ref without dropping 656 * insert both the head node and the new ref without dropping
611 * the spin lock 657 * the spin lock
612 */ 658 */
613 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, 659 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
614 action, 0); 660 num_bytes, action, 0);
615 BUG_ON(ret); 661 BUG_ON(ret);
616 662
617 ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, 663 ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
618 parent, ref_root, level, action); 664 num_bytes, parent, ref_root, level, action,
665 for_cow);
619 BUG_ON(ret); 666 BUG_ON(ret);
667 if (!need_ref_seq(for_cow, ref_root) &&
668 waitqueue_active(&delayed_refs->seq_wait))
669 wake_up(&delayed_refs->seq_wait);
620 spin_unlock(&delayed_refs->lock); 670 spin_unlock(&delayed_refs->lock);
621 return 0; 671 return 0;
622} 672}
@@ -624,11 +674,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
624/* 674/*
625 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. 675 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
626 */ 676 */
627int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 677int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
678 struct btrfs_trans_handle *trans,
628 u64 bytenr, u64 num_bytes, 679 u64 bytenr, u64 num_bytes,
629 u64 parent, u64 ref_root, 680 u64 parent, u64 ref_root,
630 u64 owner, u64 offset, int action, 681 u64 owner, u64 offset, int action,
631 struct btrfs_delayed_extent_op *extent_op) 682 struct btrfs_delayed_extent_op *extent_op,
683 int for_cow)
632{ 684{
633 struct btrfs_delayed_data_ref *ref; 685 struct btrfs_delayed_data_ref *ref;
634 struct btrfs_delayed_ref_head *head_ref; 686 struct btrfs_delayed_ref_head *head_ref;
@@ -655,18 +707,23 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
655 * insert both the head node and the new ref without dropping 707 * insert both the head node and the new ref without dropping
656 * the spin lock 708 * the spin lock
657 */ 709 */
658 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, 710 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
659 action, 1); 711 num_bytes, action, 1);
660 BUG_ON(ret); 712 BUG_ON(ret);
661 713
662 ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, 714 ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
663 parent, ref_root, owner, offset, action); 715 num_bytes, parent, ref_root, owner, offset,
716 action, for_cow);
664 BUG_ON(ret); 717 BUG_ON(ret);
718 if (!need_ref_seq(for_cow, ref_root) &&
719 waitqueue_active(&delayed_refs->seq_wait))
720 wake_up(&delayed_refs->seq_wait);
665 spin_unlock(&delayed_refs->lock); 721 spin_unlock(&delayed_refs->lock);
666 return 0; 722 return 0;
667} 723}
668 724
669int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 725int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
726 struct btrfs_trans_handle *trans,
670 u64 bytenr, u64 num_bytes, 727 u64 bytenr, u64 num_bytes,
671 struct btrfs_delayed_extent_op *extent_op) 728 struct btrfs_delayed_extent_op *extent_op)
672{ 729{
@@ -683,11 +740,13 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
683 delayed_refs = &trans->transaction->delayed_refs; 740 delayed_refs = &trans->transaction->delayed_refs;
684 spin_lock(&delayed_refs->lock); 741 spin_lock(&delayed_refs->lock);
685 742
686 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, 743 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
687 num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 744 num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
688 extent_op->is_data); 745 extent_op->is_data);
689 BUG_ON(ret); 746 BUG_ON(ret);
690 747
748 if (waitqueue_active(&delayed_refs->seq_wait))
749 wake_up(&delayed_refs->seq_wait);
691 spin_unlock(&delayed_refs->lock); 750 spin_unlock(&delayed_refs->lock);
692 return 0; 751 return 0;
693} 752}
@@ -704,7 +763,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
704 struct btrfs_delayed_ref_root *delayed_refs; 763 struct btrfs_delayed_ref_root *delayed_refs;
705 764
706 delayed_refs = &trans->transaction->delayed_refs; 765 delayed_refs = &trans->transaction->delayed_refs;
707 ref = find_ref_head(&delayed_refs->root, bytenr, NULL); 766 ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
708 if (ref) 767 if (ref)
709 return btrfs_delayed_node_to_head(ref); 768 return btrfs_delayed_node_to_head(ref);
710 return NULL; 769 return NULL;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e287e3b0eab0..d8f244d94925 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
33 /* the size of the extent */ 33 /* the size of the extent */
34 u64 num_bytes; 34 u64 num_bytes;
35 35
36 /* seq number to keep track of insertion order */
37 u64 seq;
38
36 /* ref count on this data structure */ 39 /* ref count on this data structure */
37 atomic_t refs; 40 atomic_t refs;
38 41
@@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head {
98 101
99struct btrfs_delayed_tree_ref { 102struct btrfs_delayed_tree_ref {
100 struct btrfs_delayed_ref_node node; 103 struct btrfs_delayed_ref_node node;
101 union { 104 u64 root;
102 u64 root; 105 u64 parent;
103 u64 parent;
104 };
105 int level; 106 int level;
106}; 107};
107 108
108struct btrfs_delayed_data_ref { 109struct btrfs_delayed_data_ref {
109 struct btrfs_delayed_ref_node node; 110 struct btrfs_delayed_ref_node node;
110 union { 111 u64 root;
111 u64 root; 112 u64 parent;
112 u64 parent;
113 };
114 u64 objectid; 113 u64 objectid;
115 u64 offset; 114 u64 offset;
116}; 115};
@@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root {
140 int flushing; 139 int flushing;
141 140
142 u64 run_delayed_start; 141 u64 run_delayed_start;
142
143 /*
144 * seq number of delayed refs. We need to know if a backref was being
145 * added before the currently processed ref or afterwards.
146 */
147 u64 seq;
148
149 /*
150 * seq_list holds a list of all seq numbers that are currently being
151 * added to the list. While walking backrefs (btrfs_find_all_roots,
152 * qgroups), which might take some time, no newer ref must be processed,
153 * as it might influence the outcome of the walk.
154 */
155 struct list_head seq_head;
156
157 /*
158 * when the only refs we have in the list must not be processed, we want
159 * to wait for more refs to show up or for the end of backref walking.
160 */
161 wait_queue_head_t seq_wait;
143}; 162};
144 163
145static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) 164static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
151 } 170 }
152} 171}
153 172
154int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 173int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
174 struct btrfs_trans_handle *trans,
155 u64 bytenr, u64 num_bytes, u64 parent, 175 u64 bytenr, u64 num_bytes, u64 parent,
156 u64 ref_root, int level, int action, 176 u64 ref_root, int level, int action,
157 struct btrfs_delayed_extent_op *extent_op); 177 struct btrfs_delayed_extent_op *extent_op,
158int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 178 int for_cow);
179int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
180 struct btrfs_trans_handle *trans,
159 u64 bytenr, u64 num_bytes, 181 u64 bytenr, u64 num_bytes,
160 u64 parent, u64 ref_root, 182 u64 parent, u64 ref_root,
161 u64 owner, u64 offset, int action, 183 u64 owner, u64 offset, int action,
162 struct btrfs_delayed_extent_op *extent_op); 184 struct btrfs_delayed_extent_op *extent_op,
163int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 185 int for_cow);
186int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
187 struct btrfs_trans_handle *trans,
164 u64 bytenr, u64 num_bytes, 188 u64 bytenr, u64 num_bytes,
165 struct btrfs_delayed_extent_op *extent_op); 189 struct btrfs_delayed_extent_op *extent_op);
166 190
@@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
170 struct btrfs_delayed_ref_head *head); 194 struct btrfs_delayed_ref_head *head);
171int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
172 struct list_head *cluster, u64 search_start); 196 struct list_head *cluster, u64 search_start);
197
198struct seq_list {
199 struct list_head list;
200 u64 seq;
201};
202
203static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
204{
205 assert_spin_locked(&delayed_refs->lock);
206 ++delayed_refs->seq;
207 return delayed_refs->seq;
208}
209
210static inline void
211btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
212 struct seq_list *elem)
213{
214 assert_spin_locked(&delayed_refs->lock);
215 elem->seq = delayed_refs->seq;
216 list_add_tail(&elem->list, &delayed_refs->seq_head);
217}
218
219static inline void
220btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
221 struct seq_list *elem)
222{
223 spin_lock(&delayed_refs->lock);
224 list_del(&elem->list);
225 wake_up(&delayed_refs->seq_wait);
226 spin_unlock(&delayed_refs->lock);
227}
228
229int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
230 u64 seq);
231
232/*
233 * delayed refs with a ref_seq > 0 must be held back during backref walking.
234 * this only applies to items in one of the fs-trees. for_cow items never need
235 * to be held back, so they won't get a ref_seq number.
236 */
237static inline int need_ref_seq(int for_cow, u64 rootid)
238{
239 if (for_cow)
240 return 0;
241
242 if (rootid == BTRFS_FS_TREE_OBJECTID)
243 return 1;
244
245 if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
246 return 1;
247
248 return 0;
249}
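
Taken together: writers stamp non-COW fs-tree refs via inc_delayed_seq(), a backref walker pins the sequence it started at with btrfs_get_delayed_seq() and drops it with btrfs_put_delayed_seq(), and btrfs_check_delayed_seq() holds back any ref at or above the oldest pinned value until the walker finishes. A minimal walker sketch (hypothetical function; the walk itself is elided):

static void walk_backrefs_pinned(struct btrfs_delayed_ref_root *delayed_refs)
{
	struct seq_list elem;

	spin_lock(&delayed_refs->lock);
	btrfs_get_delayed_seq(delayed_refs, &elem);	/* pin current seq */
	spin_unlock(&delayed_refs->lock);

	/* ... walk backrefs; refs with seq >= elem.seq are held back ... */

	/* takes delayed_refs->lock itself and wakes seq_wait */
	btrfs_put_delayed_seq(delayed_refs, &elem);
}
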
250
173/* 251/*
174 * a node might live in a head or a regular ref, this lets you 252 * a node might live in a head or a regular ref, this lets you
175 * test for the proper type to use. 253 * test for the proper type to use.
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d8525662ca7a..89e99eb384db 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -43,6 +43,7 @@
43#include "tree-log.h" 43#include "tree-log.h"
44#include "free-space-cache.h" 44#include "free-space-cache.h"
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h"
46 47
47static struct extent_io_ops btree_extent_io_ops; 48static struct extent_io_ops btree_extent_io_ops;
48static void end_workqueue_fn(struct btrfs_work *work); 49static void end_workqueue_fn(struct btrfs_work *work);
@@ -1244,7 +1245,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1244 root->ref_cows = 0; 1245 root->ref_cows = 0;
1245 1246
1246 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1247 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1247 BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); 1248 BTRFS_TREE_LOG_OBJECTID, NULL,
1249 0, 0, 0, 0);
1248 if (IS_ERR(leaf)) { 1250 if (IS_ERR(leaf)) {
1249 kfree(root); 1251 kfree(root);
1250 return ERR_CAST(leaf); 1252 return ERR_CAST(leaf);
@@ -1998,6 +2000,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1998 init_waitqueue_head(&fs_info->scrub_pause_wait); 2000 init_waitqueue_head(&fs_info->scrub_pause_wait);
1999 init_rwsem(&fs_info->scrub_super_lock); 2001 init_rwsem(&fs_info->scrub_super_lock);
2000 fs_info->scrub_workers_refcnt = 0; 2002 fs_info->scrub_workers_refcnt = 0;
2003#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2004 fs_info->check_integrity_print_mask = 0;
2005#endif
2006
2007 spin_lock_init(&fs_info->balance_lock);
2008 mutex_init(&fs_info->balance_mutex);
2009 atomic_set(&fs_info->balance_running, 0);
2010 atomic_set(&fs_info->balance_pause_req, 0);
2011 atomic_set(&fs_info->balance_cancel_req, 0);
2012 fs_info->balance_ctl = NULL;
2013 init_waitqueue_head(&fs_info->balance_wait_q);
2001 2014
2002 sb->s_blocksize = 4096; 2015 sb->s_blocksize = 4096;
2003 sb->s_blocksize_bits = blksize_bits(4096); 2016 sb->s_blocksize_bits = blksize_bits(4096);
@@ -2267,9 +2280,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
2267 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), 2280 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
2268 BTRFS_UUID_SIZE); 2281 BTRFS_UUID_SIZE);
2269 2282
2270 mutex_lock(&fs_info->chunk_mutex);
2271 ret = btrfs_read_chunk_tree(chunk_root); 2283 ret = btrfs_read_chunk_tree(chunk_root);
2272 mutex_unlock(&fs_info->chunk_mutex);
2273 if (ret) { 2284 if (ret) {
2274 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2285 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
2275 sb->s_id); 2286 sb->s_id);
@@ -2318,9 +2329,6 @@ retry_root_backup:
2318 2329
2319 fs_info->generation = generation; 2330 fs_info->generation = generation;
2320 fs_info->last_trans_committed = generation; 2331 fs_info->last_trans_committed = generation;
2321 fs_info->data_alloc_profile = (u64)-1;
2322 fs_info->metadata_alloc_profile = (u64)-1;
2323 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
2324 2332
2325 ret = btrfs_init_space_info(fs_info); 2333 ret = btrfs_init_space_info(fs_info);
2326 if (ret) { 2334 if (ret) {
@@ -2353,6 +2361,19 @@ retry_root_backup:
2353 btrfs_set_opt(fs_info->mount_opt, SSD); 2361 btrfs_set_opt(fs_info->mount_opt, SSD);
2354 } 2362 }
2355 2363
2364#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2365 if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
2366 ret = btrfsic_mount(tree_root, fs_devices,
2367 btrfs_test_opt(tree_root,
2368 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
2369 1 : 0,
2370 fs_info->check_integrity_print_mask);
2371 if (ret)
2372 printk(KERN_WARNING "btrfs: failed to initialize"
2373 " integrity check module %s\n", sb->s_id);
2374 }
2375#endif
2376
2356 /* do not make disk changes in broken FS */ 2377 /* do not make disk changes in broken FS */
2357 if (btrfs_super_log_root(disk_super) != 0 && 2378 if (btrfs_super_log_root(disk_super) != 0 &&
2358 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { 2379 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@ -2423,6 +2444,10 @@ retry_root_backup:
2423 if (!err) 2444 if (!err)
2424 err = btrfs_orphan_cleanup(fs_info->tree_root); 2445 err = btrfs_orphan_cleanup(fs_info->tree_root);
2425 up_read(&fs_info->cleanup_work_sem); 2446 up_read(&fs_info->cleanup_work_sem);
2447
2448 if (!err)
2449 err = btrfs_recover_balance(fs_info->tree_root);
2450
2426 if (err) { 2451 if (err) {
2427 close_ctree(tree_root); 2452 close_ctree(tree_root);
2428 return ERR_PTR(err); 2453 return ERR_PTR(err);
@@ -2631,7 +2656,7 @@ static int write_dev_supers(struct btrfs_device *device,
2631 * we fua the first super. The others we allow 2656 * we fua the first super. The others we allow
2632 * to go down lazy. 2657 * to go down lazy.
2633 */ 2658 */
2634 ret = submit_bh(WRITE_FUA, bh); 2659 ret = btrfsic_submit_bh(WRITE_FUA, bh);
2635 if (ret) 2660 if (ret)
2636 errors++; 2661 errors++;
2637 } 2662 }
@@ -2708,7 +2733,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2708 device->flush_bio = bio; 2733 device->flush_bio = bio;
2709 2734
2710 bio_get(bio); 2735 bio_get(bio);
2711 submit_bio(WRITE_FLUSH, bio); 2736 btrfsic_submit_bio(WRITE_FLUSH, bio);
2712 2737
2713 return 0; 2738 return 0;
2714} 2739}
@@ -2972,6 +2997,9 @@ int close_ctree(struct btrfs_root *root)
2972 fs_info->closing = 1; 2997 fs_info->closing = 1;
2973 smp_mb(); 2998 smp_mb();
2974 2999
3000 /* pause restriper - we want to resume on mount */
3001 btrfs_pause_balance(root->fs_info);
3002
2975 btrfs_scrub_cancel(root); 3003 btrfs_scrub_cancel(root);
2976 3004
2977 /* wait for any defraggers to finish */ 3005 /* wait for any defraggers to finish */
@@ -3054,6 +3082,11 @@ int close_ctree(struct btrfs_root *root)
3054 btrfs_stop_workers(&fs_info->caching_workers); 3082 btrfs_stop_workers(&fs_info->caching_workers);
3055 btrfs_stop_workers(&fs_info->readahead_workers); 3083 btrfs_stop_workers(&fs_info->readahead_workers);
3056 3084
3085#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3086 if (btrfs_test_opt(root, CHECK_INTEGRITY))
3087 btrfsic_unmount(root, fs_info->fs_devices);
3088#endif
3089
3057 btrfs_close_devices(fs_info->fs_devices); 3090 btrfs_close_devices(fs_info->fs_devices);
3058 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3091 btrfs_mapping_tree_free(&fs_info->mapping_tree);
3059 3092
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5fbe576d2ba..700879ed64cf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
618 struct list_head *head = &info->space_info; 618 struct list_head *head = &info->space_info;
619 struct btrfs_space_info *found; 619 struct btrfs_space_info *found;
620 620
621 flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | 621 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
622 BTRFS_BLOCK_GROUP_METADATA;
623 622
624 rcu_read_lock(); 623 rcu_read_lock();
625 list_for_each_entry_rcu(found, head, list) { 624 list_for_each_entry_rcu(found, head, list) {
@@ -1872,20 +1871,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1872int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1871int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1873 struct btrfs_root *root, 1872 struct btrfs_root *root,
1874 u64 bytenr, u64 num_bytes, u64 parent, 1873 u64 bytenr, u64 num_bytes, u64 parent,
1875 u64 root_objectid, u64 owner, u64 offset) 1874 u64 root_objectid, u64 owner, u64 offset, int for_cow)
1876{ 1875{
1877 int ret; 1876 int ret;
1877 struct btrfs_fs_info *fs_info = root->fs_info;
1878
1878 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1879 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1879 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1880 root_objectid == BTRFS_TREE_LOG_OBJECTID);
1880 1881
1881 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1882 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1882 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 1883 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1884 num_bytes,
1883 parent, root_objectid, (int)owner, 1885 parent, root_objectid, (int)owner,
1884 BTRFS_ADD_DELAYED_REF, NULL); 1886 BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1885 } else { 1887 } else {
1886 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 1888 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1889 num_bytes,
1887 parent, root_objectid, owner, offset, 1890 parent, root_objectid, owner, offset,
1888 BTRFS_ADD_DELAYED_REF, NULL); 1891 BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1889 } 1892 }
1890 return ret; 1893 return ret;
1891} 1894}
@@ -2233,6 +2236,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2233 } 2236 }
2234 2237
2235 /* 2238 /*
2239 * locked_ref is the head node, so we have to go one
2240 * node back for any delayed ref updates
2241 */
2242 ref = select_delayed_ref(locked_ref);
2243
2244 if (ref && ref->seq &&
2245 btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
2246 /*
2247 * there are still refs with lower seq numbers in the
2248 * process of being added. Don't run this ref yet.
2249 */
2250 list_del_init(&locked_ref->cluster);
2251 mutex_unlock(&locked_ref->mutex);
2252 locked_ref = NULL;
2253 delayed_refs->num_heads_ready++;
2254 spin_unlock(&delayed_refs->lock);
2255 cond_resched();
2256 spin_lock(&delayed_refs->lock);
2257 continue;
2258 }
2259
2260 /*
2236 * record the must insert reserved flag before we 2261 * record the must insert reserved flag before we
2237 * drop the spin lock. 2262 * drop the spin lock.
2238 */ 2263 */
@@ -2242,11 +2267,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2242 extent_op = locked_ref->extent_op; 2267 extent_op = locked_ref->extent_op;
2243 locked_ref->extent_op = NULL; 2268 locked_ref->extent_op = NULL;
2244 2269
2245 /*
2246 * locked_ref is the head node, so we have to go one
2247 * node back for any delayed ref updates
2248 */
2249 ref = select_delayed_ref(locked_ref);
2250 if (!ref) { 2270 if (!ref) {
2251 /* All delayed refs have been processed, Go ahead 2271 /* All delayed refs have been processed, Go ahead
2252 * and send the head node to run_one_delayed_ref, 2272 * and send the head node to run_one_delayed_ref,
@@ -2267,9 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2267 BUG_ON(ret); 2287 BUG_ON(ret);
2268 kfree(extent_op); 2288 kfree(extent_op);
2269 2289
2270 cond_resched(); 2290 goto next;
2271 spin_lock(&delayed_refs->lock);
2272 continue;
2273 } 2291 }
2274 2292
2275 list_del_init(&locked_ref->cluster); 2293 list_del_init(&locked_ref->cluster);
@@ -2279,7 +2297,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2279 ref->in_tree = 0; 2297 ref->in_tree = 0;
2280 rb_erase(&ref->rb_node, &delayed_refs->root); 2298 rb_erase(&ref->rb_node, &delayed_refs->root);
2281 delayed_refs->num_entries--; 2299 delayed_refs->num_entries--;
2282 2300 /*
2301 * we modified num_entries, but as we're currently running
2302 * delayed refs, skip
2303 * wake_up(&delayed_refs->seq_wait);
2304 * here.
2305 */
2283 spin_unlock(&delayed_refs->lock); 2306 spin_unlock(&delayed_refs->lock);
2284 2307
2285 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2308 ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2289,13 +2312,34 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2289 btrfs_put_delayed_ref(ref); 2312 btrfs_put_delayed_ref(ref);
2290 kfree(extent_op); 2313 kfree(extent_op);
2291 count++; 2314 count++;
2292 2315next:
2316 do_chunk_alloc(trans, root->fs_info->extent_root,
2317 2 * 1024 * 1024,
2318 btrfs_get_alloc_profile(root, 0),
2319 CHUNK_ALLOC_NO_FORCE);
2293 cond_resched(); 2320 cond_resched();
2294 spin_lock(&delayed_refs->lock); 2321 spin_lock(&delayed_refs->lock);
2295 } 2322 }
2296 return count; 2323 return count;
2297} 2324}
2298 2325
2326
2327static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
2328 unsigned long num_refs)
2329{
2330 struct list_head *first_seq = delayed_refs->seq_head.next;
2331
2332 spin_unlock(&delayed_refs->lock);
2333 pr_debug("waiting for more refs (num %ld, first %p)\n",
2334 num_refs, first_seq);
2335 wait_event(delayed_refs->seq_wait,
2336 num_refs != delayed_refs->num_entries ||
2337 delayed_refs->seq_head.next != first_seq);
2338 pr_debug("done waiting for more refs (num %ld, first %p)\n",
2339 delayed_refs->num_entries, delayed_refs->seq_head.next);
2340 spin_lock(&delayed_refs->lock);
2341}
2342
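wait_for_more_refs() sleeps on a two-part staleness predicate; either side going stale means progress is possible again:

/* num_refs  != delayed_refs->num_entries   -- a ref was added or run
 * first_seq != delayed_refs->seq_head.next -- the oldest backref walker
 *              finished (btrfs_put_delayed_seq() issues the wake_up)
 */
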
2299/* 2343/*
2300 * this starts processing the delayed reference count updates and 2344 * this starts processing the delayed reference count updates and
2301 * extent insertions we have queued up so far. count can be 2345 * extent insertions we have queued up so far. count can be
@@ -2311,15 +2355,23 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2311 struct btrfs_delayed_ref_node *ref; 2355 struct btrfs_delayed_ref_node *ref;
2312 struct list_head cluster; 2356 struct list_head cluster;
2313 int ret; 2357 int ret;
2358 u64 delayed_start;
2314 int run_all = count == (unsigned long)-1; 2359 int run_all = count == (unsigned long)-1;
2315 int run_most = 0; 2360 int run_most = 0;
2361 unsigned long num_refs = 0;
2362 int consider_waiting;
2316 2363
2317 if (root == root->fs_info->extent_root) 2364 if (root == root->fs_info->extent_root)
2318 root = root->fs_info->tree_root; 2365 root = root->fs_info->tree_root;
2319 2366
2367 do_chunk_alloc(trans, root->fs_info->extent_root,
2368 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2369 CHUNK_ALLOC_NO_FORCE);
2370
2320 delayed_refs = &trans->transaction->delayed_refs; 2371 delayed_refs = &trans->transaction->delayed_refs;
2321 INIT_LIST_HEAD(&cluster); 2372 INIT_LIST_HEAD(&cluster);
2322again: 2373again:
2374 consider_waiting = 0;
2323 spin_lock(&delayed_refs->lock); 2375 spin_lock(&delayed_refs->lock);
2324 if (count == 0) { 2376 if (count == 0) {
2325 count = delayed_refs->num_entries * 2; 2377 count = delayed_refs->num_entries * 2;
@@ -2336,11 +2388,35 @@ again:
2336 * of refs to process starting at the first one we are able to 2388 * of refs to process starting at the first one we are able to
2337 * lock 2389 * lock
2338 */ 2390 */
2391 delayed_start = delayed_refs->run_delayed_start;
2339 ret = btrfs_find_ref_cluster(trans, &cluster, 2392 ret = btrfs_find_ref_cluster(trans, &cluster,
2340 delayed_refs->run_delayed_start); 2393 delayed_refs->run_delayed_start);
2341 if (ret) 2394 if (ret)
2342 break; 2395 break;
2343 2396
2397 if (delayed_start >= delayed_refs->run_delayed_start) {
2398 if (consider_waiting == 0) {
2399 /*
2400 * btrfs_find_ref_cluster looped. let's do one
2401 * more cycle. if we don't run any delayed ref
2402 * during that cycle (because we can't because
2403 * all of them are blocked) and if the number of
2404 * refs doesn't change, we avoid busy waiting.
2405 */
2406 consider_waiting = 1;
2407 num_refs = delayed_refs->num_entries;
2408 } else {
2409 wait_for_more_refs(delayed_refs, num_refs);
2410 /*
2411 * after waiting, things have changed. we
2412 * dropped the lock and someone else might have
2413 * run some refs, built new clusters and so on.
2414 * therefore, we restart staleness detection.
2415 */
2416 consider_waiting = 0;
2417 }
2418 }
2419
2344 ret = run_clustered_refs(trans, root, &cluster); 2420 ret = run_clustered_refs(trans, root, &cluster);
2345 BUG_ON(ret < 0); 2421 BUG_ON(ret < 0);
2346 2422
@@ -2348,6 +2424,11 @@ again:
2348 2424
2349 if (count == 0) 2425 if (count == 0)
2350 break; 2426 break;
2427
2428 if (ret || delayed_refs->run_delayed_start == 0) {
2429 /* refs were run, let's reset staleness detection */
2430 consider_waiting = 0;
2431 }
2351 } 2432 }
2352 2433
2353 if (run_all) { 2434 if (run_all) {
@@ -2405,7 +2486,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2405 extent_op->update_key = 0; 2486 extent_op->update_key = 0;
2406 extent_op->is_data = is_data ? 1 : 0; 2487 extent_op->is_data = is_data ? 1 : 0;
2407 2488
2408 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); 2489 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2490 num_bytes, extent_op);
2409 if (ret) 2491 if (ret)
2410 kfree(extent_op); 2492 kfree(extent_op);
2411 return ret; 2493 return ret;
@@ -2590,7 +2672,7 @@ out:
2590static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2672static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2591 struct btrfs_root *root, 2673 struct btrfs_root *root,
2592 struct extent_buffer *buf, 2674 struct extent_buffer *buf,
2593 int full_backref, int inc) 2675 int full_backref, int inc, int for_cow)
2594{ 2676{
2595 u64 bytenr; 2677 u64 bytenr;
2596 u64 num_bytes; 2678 u64 num_bytes;
@@ -2603,7 +2685,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2603 int level; 2685 int level;
2604 int ret = 0; 2686 int ret = 0;
2605 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2687 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2606 u64, u64, u64, u64, u64, u64); 2688 u64, u64, u64, u64, u64, u64, int);
2607 2689
2608 ref_root = btrfs_header_owner(buf); 2690 ref_root = btrfs_header_owner(buf);
2609 nritems = btrfs_header_nritems(buf); 2691 nritems = btrfs_header_nritems(buf);
@@ -2640,14 +2722,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2640 key.offset -= btrfs_file_extent_offset(buf, fi); 2722 key.offset -= btrfs_file_extent_offset(buf, fi);
2641 ret = process_func(trans, root, bytenr, num_bytes, 2723 ret = process_func(trans, root, bytenr, num_bytes,
2642 parent, ref_root, key.objectid, 2724 parent, ref_root, key.objectid,
2643 key.offset); 2725 key.offset, for_cow);
2644 if (ret) 2726 if (ret)
2645 goto fail; 2727 goto fail;
2646 } else { 2728 } else {
2647 bytenr = btrfs_node_blockptr(buf, i); 2729 bytenr = btrfs_node_blockptr(buf, i);
2648 num_bytes = btrfs_level_size(root, level - 1); 2730 num_bytes = btrfs_level_size(root, level - 1);
2649 ret = process_func(trans, root, bytenr, num_bytes, 2731 ret = process_func(trans, root, bytenr, num_bytes,
2650 parent, ref_root, level - 1, 0); 2732 parent, ref_root, level - 1, 0,
2733 for_cow);
2651 if (ret) 2734 if (ret)
2652 goto fail; 2735 goto fail;
2653 } 2736 }
@@ -2659,15 +2742,15 @@ fail:
2659} 2742}
2660 2743
2661int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2744int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2662 struct extent_buffer *buf, int full_backref) 2745 struct extent_buffer *buf, int full_backref, int for_cow)
2663{ 2746{
2664 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 2747 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2665} 2748}
2666 2749
2667int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2750int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2668 struct extent_buffer *buf, int full_backref) 2751 struct extent_buffer *buf, int full_backref, int for_cow)
2669{ 2752{
2670 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 2753 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2671} 2754}
2672 2755
2673static int write_one_cache_group(struct btrfs_trans_handle *trans, 2756static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2993,9 +3076,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2993 INIT_LIST_HEAD(&found->block_groups[i]); 3076 INIT_LIST_HEAD(&found->block_groups[i]);
2994 init_rwsem(&found->groups_sem); 3077 init_rwsem(&found->groups_sem);
2995 spin_lock_init(&found->lock); 3078 spin_lock_init(&found->lock);
2996 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | 3079 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
2997 BTRFS_BLOCK_GROUP_SYSTEM |
2998 BTRFS_BLOCK_GROUP_METADATA);
2999 found->total_bytes = total_bytes; 3080 found->total_bytes = total_bytes;
3000 found->disk_total = total_bytes * factor; 3081 found->disk_total = total_bytes * factor;
3001 found->bytes_used = bytes_used; 3082 found->bytes_used = bytes_used;
@@ -3016,20 +3097,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3016 3097
3017static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3098static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3018{ 3099{
3019 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | 3100 u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3020 BTRFS_BLOCK_GROUP_RAID1 | 3101
3021 BTRFS_BLOCK_GROUP_RAID10 | 3102 /* chunk -> extended profile */
3022 BTRFS_BLOCK_GROUP_DUP); 3103 if (extra_flags == 0)
3023 if (extra_flags) { 3104 extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3024 if (flags & BTRFS_BLOCK_GROUP_DATA) 3105
3025 fs_info->avail_data_alloc_bits |= extra_flags; 3106 if (flags & BTRFS_BLOCK_GROUP_DATA)
3026 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3107 fs_info->avail_data_alloc_bits |= extra_flags;
3027 fs_info->avail_metadata_alloc_bits |= extra_flags; 3108 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3028 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3109 fs_info->avail_metadata_alloc_bits |= extra_flags;
3029 fs_info->avail_system_alloc_bits |= extra_flags; 3110 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3030 } 3111 fs_info->avail_system_alloc_bits |= extra_flags;
3031} 3112}
3032 3113
3114/*
3115 * @flags: available profiles in extended format (see ctree.h)
3116 *
3117 * Returns reduced profile in chunk format. If profile changing is in
3118 * progress (either running or paused) picks the target profile (if it's
3119 * already available), otherwise falls back to plain reducing.
3120 */
3033u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3121u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3034{ 3122{
3035 /* 3123 /*
@@ -3040,6 +3128,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3040 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3128 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3041 root->fs_info->fs_devices->missing_devices; 3129 root->fs_info->fs_devices->missing_devices;
3042 3130
3131 /* pick restriper's target profile if it's available */
3132 spin_lock(&root->fs_info->balance_lock);
3133 if (root->fs_info->balance_ctl) {
3134 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
3135 u64 tgt = 0;
3136
3137 if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
3138 (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3139 (flags & bctl->data.target)) {
3140 tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3141 } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
3142 (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3143 (flags & bctl->sys.target)) {
3144 tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3145 } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
3146 (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3147 (flags & bctl->meta.target)) {
3148 tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3149 }
3150
3151 if (tgt) {
3152 spin_unlock(&root->fs_info->balance_lock);
3153 flags = tgt;
3154 goto out;
3155 }
3156 }
3157 spin_unlock(&root->fs_info->balance_lock);
3158
3043 if (num_devices == 1) 3159 if (num_devices == 1)
3044 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3160 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3045 if (num_devices < 4) 3161 if (num_devices < 4)
@@ -3059,22 +3175,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3059 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3175 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3060 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3176 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3061 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3177 (flags & BTRFS_BLOCK_GROUP_RAID10) |
3062 (flags & BTRFS_BLOCK_GROUP_DUP))) 3178 (flags & BTRFS_BLOCK_GROUP_DUP))) {
3063 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3179 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3180 }
3181
3182out:
3183 /* extended -> chunk profile */
3184 flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3064 return flags; 3185 return flags;
3065} 3186}
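
Background for the new masking step: this series keeps profiles in two formats. In the chunk (on-disk) format, "single" is the absence of any RAID bit, so 0 is meaningful; in the extended format, "single" is the explicit BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, letting the avail_*_alloc_bits fields distinguish "single chunks allocated" from "nothing allocated". Conversion is one mask each way, as in the set_avail_alloc_bits() hunk above (sketch):

/* chunk -> extended: make "single" explicit */
if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
	flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;

/* extended -> chunk: drop the virtual bit again */
flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
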
3066 3187
3067static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3188static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3068{ 3189{
3069 if (flags & BTRFS_BLOCK_GROUP_DATA) 3190 if (flags & BTRFS_BLOCK_GROUP_DATA)
3070 flags |= root->fs_info->avail_data_alloc_bits & 3191 flags |= root->fs_info->avail_data_alloc_bits;
3071 root->fs_info->data_alloc_profile;
3072 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3192 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3073 flags |= root->fs_info->avail_system_alloc_bits & 3193 flags |= root->fs_info->avail_system_alloc_bits;
3074 root->fs_info->system_alloc_profile;
3075 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3194 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3076 flags |= root->fs_info->avail_metadata_alloc_bits & 3195 flags |= root->fs_info->avail_metadata_alloc_bits;
3077 root->fs_info->metadata_alloc_profile; 3196
3078 return btrfs_reduce_alloc_profile(root, flags); 3197 return btrfs_reduce_alloc_profile(root, flags);
3079} 3198}
3080 3199
@@ -3191,6 +3310,8 @@ commit_trans:
3191 return -ENOSPC; 3310 return -ENOSPC;
3192 } 3311 }
3193 data_sinfo->bytes_may_use += bytes; 3312 data_sinfo->bytes_may_use += bytes;
3313 trace_btrfs_space_reservation(root->fs_info, "space_info",
3314 (u64)data_sinfo, bytes, 1);
3194 spin_unlock(&data_sinfo->lock); 3315 spin_unlock(&data_sinfo->lock);
3195 3316
3196 return 0; 3317 return 0;
@@ -3210,6 +3331,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3210 data_sinfo = BTRFS_I(inode)->space_info; 3331 data_sinfo = BTRFS_I(inode)->space_info;
3211 spin_lock(&data_sinfo->lock); 3332 spin_lock(&data_sinfo->lock);
3212 data_sinfo->bytes_may_use -= bytes; 3333 data_sinfo->bytes_may_use -= bytes;
3334 trace_btrfs_space_reservation(root->fs_info, "space_info",
3335 (u64)data_sinfo, bytes, 0);
3213 spin_unlock(&data_sinfo->lock); 3336 spin_unlock(&data_sinfo->lock);
3214} 3337}
3215 3338
@@ -3257,27 +3380,15 @@ static int should_alloc_chunk(struct btrfs_root *root,
3257 if (num_bytes - num_allocated < thresh) 3380 if (num_bytes - num_allocated < thresh)
3258 return 1; 3381 return 1;
3259 } 3382 }
3260
3261 /*
3262 * we have two similar checks here, one based on percentage
3263 * and once based on a hard number of 256MB. The idea
3264 * is that if we have a good amount of free
3265 * room, don't allocate a chunk. A good mount is
3266 * less than 80% utilized of the chunks we have allocated,
3267 * or more than 256MB free
3268 */
3269 if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3270 return 0;
3271
3272 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3273 return 0;
3274
3275 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3383 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3276 3384
3277 /* 256MB or 5% of the FS */ 3385 /* 256MB or 2% of the FS */
3278 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3386 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
3387 /* system chunks need a much smaller threshold */
3388 if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3389 thresh = 32 * 1024 * 1024;
3279 3390
3280 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) 3391 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
3281 return 0; 3392 return 0;
3282 return 1; 3393 return 1;
3283} 3394}
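
The rewritten heuristic in numbers: thresh is max(256MB, 2% of the filesystem), or a flat 32MB for system chunks, and the final check refuses a new chunk only when the space_info has already grown past thresh while staying under 80% used (div_factor(x, 8) is x * 8 / 10 in ctree.h). A userspace restatement of that final check, eliding the forced/limited early returns above it:

#include <stdint.h>

static int should_alloc(uint64_t fs_bytes, uint64_t si_total,
			uint64_t si_used, int is_system)
{
	uint64_t thresh = fs_bytes * 2 / 100;		/* 2% of the FS */

	if (thresh < UINT64_C(256) * 1024 * 1024)
		thresh = UINT64_C(256) * 1024 * 1024;	/* 256MB floor */
	if (is_system)
		thresh = UINT64_C(32) * 1024 * 1024;	/* system: 32MB */

	/* space_info already large and mostly empty: don't grow it */
	if (si_total > thresh && si_used < si_total * 8 / 10)
		return 0;
	return 1;
}

On a 1TB filesystem thresh comes to roughly 20GB, so a 30GB metadata space_info that is only half used will not be grown, replacing the old 5% threshold and 30% utilization cutoff.
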
@@ -3291,7 +3402,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3291 int wait_for_alloc = 0; 3402 int wait_for_alloc = 0;
3292 int ret = 0; 3403 int ret = 0;
3293 3404
3294 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3405 BUG_ON(!profile_is_valid(flags, 0));
3295 3406
3296 space_info = __find_space_info(extent_root->fs_info, flags); 3407 space_info = __find_space_info(extent_root->fs_info, flags);
3297 if (!space_info) { 3408 if (!space_info) {
@@ -3582,6 +3693,10 @@ again:
3582 if (used <= space_info->total_bytes) { 3693 if (used <= space_info->total_bytes) {
3583 if (used + orig_bytes <= space_info->total_bytes) { 3694 if (used + orig_bytes <= space_info->total_bytes) {
3584 space_info->bytes_may_use += orig_bytes; 3695 space_info->bytes_may_use += orig_bytes;
3696 trace_btrfs_space_reservation(root->fs_info,
3697 "space_info",
3698 (u64)space_info,
3699 orig_bytes, 1);
3585 ret = 0; 3700 ret = 0;
3586 } else { 3701 } else {
3587 /* 3702 /*
@@ -3649,6 +3764,10 @@ again:
3649 3764
3650 if (used + num_bytes < space_info->total_bytes + avail) { 3765 if (used + num_bytes < space_info->total_bytes + avail) {
3651 space_info->bytes_may_use += orig_bytes; 3766 space_info->bytes_may_use += orig_bytes;
3767 trace_btrfs_space_reservation(root->fs_info,
3768 "space_info",
3769 (u64)space_info,
3770 orig_bytes, 1);
3652 ret = 0; 3771 ret = 0;
3653 } else { 3772 } else {
3654 wait_ordered = true; 3773 wait_ordered = true;
@@ -3755,7 +3874,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3755 spin_unlock(&block_rsv->lock); 3874 spin_unlock(&block_rsv->lock);
3756} 3875}
3757 3876
3758static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, 3877static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
3878 struct btrfs_block_rsv *block_rsv,
3759 struct btrfs_block_rsv *dest, u64 num_bytes) 3879 struct btrfs_block_rsv *dest, u64 num_bytes)
3760{ 3880{
3761 struct btrfs_space_info *space_info = block_rsv->space_info; 3881 struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3791,6 +3911,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3791 if (num_bytes) { 3911 if (num_bytes) {
3792 spin_lock(&space_info->lock); 3912 spin_lock(&space_info->lock);
3793 space_info->bytes_may_use -= num_bytes; 3913 space_info->bytes_may_use -= num_bytes;
3914 trace_btrfs_space_reservation(fs_info, "space_info",
3915 (u64)space_info,
3916 num_bytes, 0);
3794 space_info->reservation_progress++; 3917 space_info->reservation_progress++;
3795 spin_unlock(&space_info->lock); 3918 spin_unlock(&space_info->lock);
3796 } 3919 }
@@ -3947,7 +4070,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
3947 if (global_rsv->full || global_rsv == block_rsv || 4070 if (global_rsv->full || global_rsv == block_rsv ||
3948 block_rsv->space_info != global_rsv->space_info) 4071 block_rsv->space_info != global_rsv->space_info)
3949 global_rsv = NULL; 4072 global_rsv = NULL;
3950 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); 4073 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4074 num_bytes);
3951} 4075}
3952 4076
3953/* 4077/*
@@ -4006,11 +4130,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4006 num_bytes = sinfo->total_bytes - num_bytes; 4130 num_bytes = sinfo->total_bytes - num_bytes;
4007 block_rsv->reserved += num_bytes; 4131 block_rsv->reserved += num_bytes;
4008 sinfo->bytes_may_use += num_bytes; 4132 sinfo->bytes_may_use += num_bytes;
4133 trace_btrfs_space_reservation(fs_info, "space_info",
4134 (u64)sinfo, num_bytes, 1);
4009 } 4135 }
4010 4136
4011 if (block_rsv->reserved >= block_rsv->size) { 4137 if (block_rsv->reserved >= block_rsv->size) {
4012 num_bytes = block_rsv->reserved - block_rsv->size; 4138 num_bytes = block_rsv->reserved - block_rsv->size;
4013 sinfo->bytes_may_use -= num_bytes; 4139 sinfo->bytes_may_use -= num_bytes;
4140 trace_btrfs_space_reservation(fs_info, "space_info",
4141 (u64)sinfo, num_bytes, 0);
4014 sinfo->reservation_progress++; 4142 sinfo->reservation_progress++;
4015 block_rsv->reserved = block_rsv->size; 4143 block_rsv->reserved = block_rsv->size;
4016 block_rsv->full = 1; 4144 block_rsv->full = 1;
@@ -4045,7 +4173,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4045 4173
4046static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4174static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4047{ 4175{
4048 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); 4176 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4177 (u64)-1);
4049 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4178 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4050 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4179 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4051 WARN_ON(fs_info->trans_block_rsv.size > 0); 4180 WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4062,6 +4191,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4062 if (!trans->bytes_reserved) 4191 if (!trans->bytes_reserved)
4063 return; 4192 return;
4064 4193
4194 trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans,
4195 trans->bytes_reserved, 0);
4065 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4196 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4066 trans->bytes_reserved = 0; 4197 trans->bytes_reserved = 0;
4067} 4198}
@@ -4079,6 +4210,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4079 * when we are truly done with the orphan item. 4210 * when we are truly done with the orphan item.
4080 */ 4211 */
4081 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4212 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4213 trace_btrfs_space_reservation(root->fs_info, "orphan",
4214 btrfs_ino(inode), num_bytes, 1);
4082 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4215 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4083} 4216}
4084 4217
@@ -4086,6 +4219,8 @@ void btrfs_orphan_release_metadata(struct inode *inode)
4086{ 4219{
4087 struct btrfs_root *root = BTRFS_I(inode)->root; 4220 struct btrfs_root *root = BTRFS_I(inode)->root;
4088 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4221 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4222 trace_btrfs_space_reservation(root->fs_info, "orphan",
4223 btrfs_ino(inode), num_bytes, 0);
4089 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4224 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4090} 4225}
4091 4226
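
The orphan hunks key their events on btrfs_ino(inode) with the same 1/0 reserve flag, so each reservation can later be matched against its release. A hedged sketch of how such paired events could be checked for leaks offline — the event layout here is an assumption for illustration, not the kernel's exact trace record:

    #include <stdio.h>
    #include <stdint.h>

    struct event { uint64_t ino; uint64_t bytes; int reserve; };

    /* sum reserves minus releases; a nonzero result suggests a leak */
    static int64_t net_reserved(const struct event *ev, int n, uint64_t ino)
    {
        int64_t net = 0;
        for (int i = 0; i < n; i++)
            if (ev[i].ino == ino)
                net += ev[i].reserve ? (int64_t)ev[i].bytes
                                     : -(int64_t)ev[i].bytes;
        return net;
    }

    int main(void)
    {
        struct event log[] = {
            { 257, 16384, 1 },   /* btrfs_orphan_reserve_metadata */
            { 257, 16384, 0 },   /* btrfs_orphan_release_metadata */
        };
        printf("net for ino 257: %lld\n",
               (long long)net_reserved(log, 2, 257));
        return 0;
    }
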
@@ -4213,12 +4348,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4213 /* Need to be holding the i_mutex here if we aren't free space cache */ 4348 /* Need to be holding the i_mutex here if we aren't free space cache */
4214 if (btrfs_is_free_space_inode(root, inode)) 4349 if (btrfs_is_free_space_inode(root, inode))
4215 flush = 0; 4350 flush = 0;
4216 else
4217 WARN_ON(!mutex_is_locked(&inode->i_mutex));
4218 4351
4219 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4352 if (flush && btrfs_transaction_in_commit(root->fs_info))
4220 schedule_timeout(1); 4353 schedule_timeout(1);
4221 4354
4355 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4222 num_bytes = ALIGN(num_bytes, root->sectorsize); 4356 num_bytes = ALIGN(num_bytes, root->sectorsize);
4223 4357
4224 spin_lock(&BTRFS_I(inode)->lock); 4358 spin_lock(&BTRFS_I(inode)->lock);
@@ -4266,8 +4400,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4266 if (dropped) 4400 if (dropped)
4267 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4401 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4268 4402
4269 if (to_free) 4403 if (to_free) {
4270 btrfs_block_rsv_release(root, block_rsv, to_free); 4404 btrfs_block_rsv_release(root, block_rsv, to_free);
4405 trace_btrfs_space_reservation(root->fs_info,
4406 "delalloc",
4407 btrfs_ino(inode),
4408 to_free, 0);
4409 }
4410 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4271 return ret; 4411 return ret;
4272 } 4412 }
4273 4413
@@ -4278,7 +4418,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4278 } 4418 }
4279 BTRFS_I(inode)->reserved_extents += nr_extents; 4419 BTRFS_I(inode)->reserved_extents += nr_extents;
4280 spin_unlock(&BTRFS_I(inode)->lock); 4420 spin_unlock(&BTRFS_I(inode)->lock);
4421 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4281 4422
4423 if (to_reserve)
 4424 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4425 btrfs_ino(inode), to_reserve, 1);
4282 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4426 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4283 4427
4284 return 0; 4428 return 0;
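
The reservation path above drops the i_mutex assertion and instead takes the new per-inode delalloc_mutex around the whole count-update-charge sequence, so two writers reserving against the same inode cannot interleave their extent accounting. A minimal pthread sketch of that pattern, with simplified field names:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    struct inode_state {
        pthread_mutex_t delalloc_mutex; /* serializes reservations */
        uint64_t outstanding_extents;
        uint64_t reserved_extents;
    };

    /* the critical section covers counting and charging together */
    static void reserve_metadata(struct inode_state *in, uint64_t nr)
    {
        pthread_mutex_lock(&in->delalloc_mutex);
        in->outstanding_extents += nr;
        if (in->outstanding_extents > in->reserved_extents)
            in->reserved_extents = in->outstanding_extents;
        pthread_mutex_unlock(&in->delalloc_mutex);
    }

    int main(void)
    {
        struct inode_state in = {
            .delalloc_mutex = PTHREAD_MUTEX_INITIALIZER,
        };
        reserve_metadata(&in, 1);
        printf("reserved extents: %llu\n",
               (unsigned long long)in.reserved_extents);
        return 0;
    }
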
@@ -4308,6 +4452,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4308 if (dropped > 0) 4452 if (dropped > 0)
4309 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4453 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4310 4454
4455 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4456 btrfs_ino(inode), to_free, 0);
4311 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4457 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4312 to_free); 4458 to_free);
4313} 4459}
@@ -4562,7 +4708,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4562 cache->reserved += num_bytes; 4708 cache->reserved += num_bytes;
4563 space_info->bytes_reserved += num_bytes; 4709 space_info->bytes_reserved += num_bytes;
4564 if (reserve == RESERVE_ALLOC) { 4710 if (reserve == RESERVE_ALLOC) {
4565 BUG_ON(space_info->bytes_may_use < num_bytes); 4711 trace_btrfs_space_reservation(cache->fs_info,
4712 "space_info",
4713 (u64)space_info,
4714 num_bytes, 0);
4566 space_info->bytes_may_use -= num_bytes; 4715 space_info->bytes_may_use -= num_bytes;
4567 } 4716 }
4568 } 4717 }
@@ -4928,6 +5077,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4928 rb_erase(&head->node.rb_node, &delayed_refs->root); 5077 rb_erase(&head->node.rb_node, &delayed_refs->root);
4929 5078
4930 delayed_refs->num_entries--; 5079 delayed_refs->num_entries--;
5080 if (waitqueue_active(&delayed_refs->seq_wait))
5081 wake_up(&delayed_refs->seq_wait);
4931 5082
4932 /* 5083 /*
4933 * we don't take a ref on the node because we're removing it from the 5084 * we don't take a ref on the node because we're removing it from the
@@ -4955,16 +5106,17 @@ out:
4955void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5106void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4956 struct btrfs_root *root, 5107 struct btrfs_root *root,
4957 struct extent_buffer *buf, 5108 struct extent_buffer *buf,
4958 u64 parent, int last_ref) 5109 u64 parent, int last_ref, int for_cow)
4959{ 5110{
4960 struct btrfs_block_group_cache *cache = NULL; 5111 struct btrfs_block_group_cache *cache = NULL;
4961 int ret; 5112 int ret;
4962 5113
4963 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5114 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4964 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, 5115 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
4965 parent, root->root_key.objectid, 5116 buf->start, buf->len,
4966 btrfs_header_level(buf), 5117 parent, root->root_key.objectid,
4967 BTRFS_DROP_DELAYED_REF, NULL); 5118 btrfs_header_level(buf),
5119 BTRFS_DROP_DELAYED_REF, NULL, for_cow);
4968 BUG_ON(ret); 5120 BUG_ON(ret);
4969 } 5121 }
4970 5122
@@ -4999,12 +5151,12 @@ out:
4999 btrfs_put_block_group(cache); 5151 btrfs_put_block_group(cache);
5000} 5152}
5001 5153
5002int btrfs_free_extent(struct btrfs_trans_handle *trans, 5154int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5003 struct btrfs_root *root, 5155 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5004 u64 bytenr, u64 num_bytes, u64 parent, 5156 u64 owner, u64 offset, int for_cow)
5005 u64 root_objectid, u64 owner, u64 offset)
5006{ 5157{
5007 int ret; 5158 int ret;
5159 struct btrfs_fs_info *fs_info = root->fs_info;
5008 5160
5009 /* 5161 /*
5010 * tree log blocks never actually go into the extent allocation 5162 * tree log blocks never actually go into the extent allocation
@@ -5016,14 +5168,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
5016 btrfs_pin_extent(root, bytenr, num_bytes, 1); 5168 btrfs_pin_extent(root, bytenr, num_bytes, 1);
5017 ret = 0; 5169 ret = 0;
5018 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5170 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5019 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 5171 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5172 num_bytes,
5020 parent, root_objectid, (int)owner, 5173 parent, root_objectid, (int)owner,
5021 BTRFS_DROP_DELAYED_REF, NULL); 5174 BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5022 BUG_ON(ret); 5175 BUG_ON(ret);
5023 } else { 5176 } else {
5024 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 5177 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5025 parent, root_objectid, owner, 5178 num_bytes,
5026 offset, BTRFS_DROP_DELAYED_REF, NULL); 5179 parent, root_objectid, owner,
5180 offset, BTRFS_DROP_DELAYED_REF,
5181 NULL, for_cow);
5027 BUG_ON(ret); 5182 BUG_ON(ret);
5028 } 5183 }
5029 return ret; 5184 return ret;
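
btrfs_free_extent() and the btrfs_add_delayed_*_ref() helpers now carry fs_info plus a for_cow flag end to end, letting callers mark references created or dropped during COW for the delayed-ref sequencing added elsewhere in this series. A sketch of the signature-threading pattern with toy types, not the kernel API:

    #include <stdio.h>

    struct fs_info { const char *name; };

    /* lowest layer: the flag finally takes effect here */
    static int add_delayed_ref(struct fs_info *fs, unsigned long long bytenr,
                               int for_cow)
    {
        printf("%s: ref at %llu%s\n", fs->name, bytenr,
               for_cow ? " (for_cow)" : "");
        return 0;
    }

    /* middle layer: just forwards the new parameters unchanged */
    static int free_extent(struct fs_info *fs, unsigned long long bytenr,
                           int for_cow)
    {
        return add_delayed_ref(fs, bytenr, for_cow);
    }

    int main(void)
    {
        struct fs_info fs = { "toy" };
        return free_extent(&fs, 4096, 1);
    }
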
@@ -5146,6 +5301,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5146 ins->objectid = 0; 5301 ins->objectid = 0;
5147 ins->offset = 0; 5302 ins->offset = 0;
5148 5303
5304 trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5305
5149 space_info = __find_space_info(root->fs_info, data); 5306 space_info = __find_space_info(root->fs_info, data);
5150 if (!space_info) { 5307 if (!space_info) {
5151 printk(KERN_ERR "No space info for %llu\n", data); 5308 printk(KERN_ERR "No space info for %llu\n", data);
@@ -5295,15 +5452,6 @@ alloc:
5295 if (unlikely(block_group->ro)) 5452 if (unlikely(block_group->ro))
5296 goto loop; 5453 goto loop;
5297 5454
5298 spin_lock(&block_group->free_space_ctl->tree_lock);
5299 if (cached &&
5300 block_group->free_space_ctl->free_space <
5301 num_bytes + empty_cluster + empty_size) {
5302 spin_unlock(&block_group->free_space_ctl->tree_lock);
5303 goto loop;
5304 }
5305 spin_unlock(&block_group->free_space_ctl->tree_lock);
5306
5307 /* 5455 /*
5308 * Ok we want to try and use the cluster allocator, so 5456 * Ok we want to try and use the cluster allocator, so
5309 * lets look there 5457 * lets look there
@@ -5331,6 +5479,8 @@ alloc:
5331 if (offset) { 5479 if (offset) {
5332 /* we have a block, we're done */ 5480 /* we have a block, we're done */
5333 spin_unlock(&last_ptr->refill_lock); 5481 spin_unlock(&last_ptr->refill_lock);
5482 trace_btrfs_reserve_extent_cluster(root,
5483 block_group, search_start, num_bytes);
5334 goto checks; 5484 goto checks;
5335 } 5485 }
5336 5486
@@ -5349,8 +5499,15 @@ refill_cluster:
5349 * plenty of times and not have found 5499 * plenty of times and not have found
5350 * anything, so we are likely way too 5500 * anything, so we are likely way too
5351 * fragmented for the clustering stuff to find 5501 * fragmented for the clustering stuff to find
5352 * anything. */ 5502 * anything.
5353 if (loop >= LOOP_NO_EMPTY_SIZE) { 5503 *
5504 * However, if the cluster is taken from the
5505 * current block group, release the cluster
5506 * first, so that we stand a better chance of
5507 * succeeding in the unclustered
5508 * allocation. */
5509 if (loop >= LOOP_NO_EMPTY_SIZE &&
5510 last_ptr->block_group != block_group) {
5354 spin_unlock(&last_ptr->refill_lock); 5511 spin_unlock(&last_ptr->refill_lock);
5355 goto unclustered_alloc; 5512 goto unclustered_alloc;
5356 } 5513 }
@@ -5361,6 +5518,11 @@ refill_cluster:
5361 */ 5518 */
5362 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5519 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5363 5520
5521 if (loop >= LOOP_NO_EMPTY_SIZE) {
5522 spin_unlock(&last_ptr->refill_lock);
5523 goto unclustered_alloc;
5524 }
5525
5364 /* allocate a cluster in this block group */ 5526 /* allocate a cluster in this block group */
5365 ret = btrfs_find_space_cluster(trans, root, 5527 ret = btrfs_find_space_cluster(trans, root,
5366 block_group, last_ptr, 5528 block_group, last_ptr,
@@ -5377,6 +5539,9 @@ refill_cluster:
5377 if (offset) { 5539 if (offset) {
5378 /* we found one, proceed */ 5540 /* we found one, proceed */
5379 spin_unlock(&last_ptr->refill_lock); 5541 spin_unlock(&last_ptr->refill_lock);
5542 trace_btrfs_reserve_extent_cluster(root,
5543 block_group, search_start,
5544 num_bytes);
5380 goto checks; 5545 goto checks;
5381 } 5546 }
5382 } else if (!cached && loop > LOOP_CACHING_NOWAIT 5547 } else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5401,6 +5566,15 @@ refill_cluster:
5401 } 5566 }
5402 5567
5403unclustered_alloc: 5568unclustered_alloc:
5569 spin_lock(&block_group->free_space_ctl->tree_lock);
5570 if (cached &&
5571 block_group->free_space_ctl->free_space <
5572 num_bytes + empty_cluster + empty_size) {
5573 spin_unlock(&block_group->free_space_ctl->tree_lock);
5574 goto loop;
5575 }
5576 spin_unlock(&block_group->free_space_ctl->tree_lock);
5577
5404 offset = btrfs_find_space_for_alloc(block_group, search_start, 5578 offset = btrfs_find_space_for_alloc(block_group, search_start,
5405 num_bytes, empty_size); 5579 num_bytes, empty_size);
5406 /* 5580 /*
@@ -5438,9 +5612,6 @@ checks:
5438 goto loop; 5612 goto loop;
5439 } 5613 }
5440 5614
5441 ins->objectid = search_start;
5442 ins->offset = num_bytes;
5443
5444 if (offset < search_start) 5615 if (offset < search_start)
5445 btrfs_add_free_space(used_block_group, offset, 5616 btrfs_add_free_space(used_block_group, offset,
5446 search_start - offset); 5617 search_start - offset);
@@ -5457,6 +5628,8 @@ checks:
5457 ins->objectid = search_start; 5628 ins->objectid = search_start;
5458 ins->offset = num_bytes; 5629 ins->offset = num_bytes;
5459 5630
5631 trace_btrfs_reserve_extent(orig_root, block_group,
5632 search_start, num_bytes);
5460 if (offset < search_start) 5633 if (offset < search_start)
5461 btrfs_add_free_space(used_block_group, offset, 5634 btrfs_add_free_space(used_block_group, offset,
5462 search_start - offset); 5635 search_start - offset);
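
Two allocator changes land together in the hunks above: the free-space shortcut check moves from the clustered path down to unclustered_alloc (so a cluster that already holds enough space stays usable even when the block group's unclustered free space looks too small), and trace_btrfs_reserve_extent*() events record where each reservation was satisfied. A sketch of the reordered check, with heavily simplified fields:

    #include <stdint.h>
    #include <stdio.h>

    struct group {
        uint64_t cluster_free;    /* space already held by the cluster */
        uint64_t unclustered_free;
    };

    /* try the cluster first; only gate on free space for the fallback */
    static int alloc(struct group *g, uint64_t need, uint64_t empty)
    {
        if (g->cluster_free >= need)
            return 0;                       /* clustered hit */
        if (g->unclustered_free < need + empty)
            return -1;                      /* loop to next group */
        return 1;                           /* unclustered hit */
    }

    int main(void)
    {
        struct group g = { .cluster_free = 8192, .unclustered_free = 0 };
        printf("result: %d\n", alloc(&g, 4096, 4096)); /* 0: cluster wins */
        return 0;
    }
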
@@ -5842,9 +6015,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5842 6015
5843 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 6016 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5844 6017
5845 ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, 6018 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
5846 0, root_objectid, owner, offset, 6019 ins->offset, 0,
5847 BTRFS_ADD_DELAYED_EXTENT, NULL); 6020 root_objectid, owner, offset,
6021 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
5848 return ret; 6022 return ret;
5849} 6023}
5850 6024
@@ -5997,10 +6171,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5997 return ERR_PTR(-ENOSPC); 6171 return ERR_PTR(-ENOSPC);
5998} 6172}
5999 6173
6000static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) 6174static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6175 struct btrfs_block_rsv *block_rsv, u32 blocksize)
6001{ 6176{
6002 block_rsv_add_bytes(block_rsv, blocksize, 0); 6177 block_rsv_add_bytes(block_rsv, blocksize, 0);
6003 block_rsv_release_bytes(block_rsv, NULL, 0); 6178 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6004} 6179}
6005 6180
6006/* 6181/*
@@ -6014,7 +6189,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6014 struct btrfs_root *root, u32 blocksize, 6189 struct btrfs_root *root, u32 blocksize,
6015 u64 parent, u64 root_objectid, 6190 u64 parent, u64 root_objectid,
6016 struct btrfs_disk_key *key, int level, 6191 struct btrfs_disk_key *key, int level,
6017 u64 hint, u64 empty_size) 6192 u64 hint, u64 empty_size, int for_cow)
6018{ 6193{
6019 struct btrfs_key ins; 6194 struct btrfs_key ins;
6020 struct btrfs_block_rsv *block_rsv; 6195 struct btrfs_block_rsv *block_rsv;
@@ -6030,7 +6205,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6030 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, 6205 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6031 empty_size, hint, (u64)-1, &ins, 0); 6206 empty_size, hint, (u64)-1, &ins, 0);
6032 if (ret) { 6207 if (ret) {
6033 unuse_block_rsv(block_rsv, blocksize); 6208 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6034 return ERR_PTR(ret); 6209 return ERR_PTR(ret);
6035 } 6210 }
6036 6211
@@ -6058,10 +6233,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6058 extent_op->update_flags = 1; 6233 extent_op->update_flags = 1;
6059 extent_op->is_data = 0; 6234 extent_op->is_data = 0;
6060 6235
6061 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, 6236 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6237 ins.objectid,
6062 ins.offset, parent, root_objectid, 6238 ins.offset, parent, root_objectid,
6063 level, BTRFS_ADD_DELAYED_EXTENT, 6239 level, BTRFS_ADD_DELAYED_EXTENT,
6064 extent_op); 6240 extent_op, for_cow);
6065 BUG_ON(ret); 6241 BUG_ON(ret);
6066 } 6242 }
6067 return buf; 6243 return buf;
@@ -6078,6 +6254,7 @@ struct walk_control {
6078 int keep_locks; 6254 int keep_locks;
6079 int reada_slot; 6255 int reada_slot;
6080 int reada_count; 6256 int reada_count;
6257 int for_reloc;
6081}; 6258};
6082 6259
6083#define DROP_REFERENCE 1 6260#define DROP_REFERENCE 1
@@ -6216,9 +6393,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6216 /* wc->stage == UPDATE_BACKREF */ 6393 /* wc->stage == UPDATE_BACKREF */
6217 if (!(wc->flags[level] & flag)) { 6394 if (!(wc->flags[level] & flag)) {
6218 BUG_ON(!path->locks[level]); 6395 BUG_ON(!path->locks[level]);
6219 ret = btrfs_inc_ref(trans, root, eb, 1); 6396 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6220 BUG_ON(ret); 6397 BUG_ON(ret);
6221 ret = btrfs_dec_ref(trans, root, eb, 0); 6398 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6222 BUG_ON(ret); 6399 BUG_ON(ret);
6223 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 6400 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6224 eb->len, flag, 0); 6401 eb->len, flag, 0);
@@ -6362,7 +6539,7 @@ skip:
6362 } 6539 }
6363 6540
6364 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 6541 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6365 root->root_key.objectid, level - 1, 0); 6542 root->root_key.objectid, level - 1, 0, 0);
6366 BUG_ON(ret); 6543 BUG_ON(ret);
6367 } 6544 }
6368 btrfs_tree_unlock(next); 6545 btrfs_tree_unlock(next);
@@ -6436,9 +6613,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6436 if (wc->refs[level] == 1) { 6613 if (wc->refs[level] == 1) {
6437 if (level == 0) { 6614 if (level == 0) {
6438 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 6615 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6439 ret = btrfs_dec_ref(trans, root, eb, 1); 6616 ret = btrfs_dec_ref(trans, root, eb, 1,
6617 wc->for_reloc);
6440 else 6618 else
6441 ret = btrfs_dec_ref(trans, root, eb, 0); 6619 ret = btrfs_dec_ref(trans, root, eb, 0,
6620 wc->for_reloc);
6442 BUG_ON(ret); 6621 BUG_ON(ret);
6443 } 6622 }
6444 /* make block locked assertion in clean_tree_block happy */ 6623 /* make block locked assertion in clean_tree_block happy */
@@ -6465,7 +6644,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6465 btrfs_header_owner(path->nodes[level + 1])); 6644 btrfs_header_owner(path->nodes[level + 1]));
6466 } 6645 }
6467 6646
6468 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 6647 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
6469out: 6648out:
6470 wc->refs[level] = 0; 6649 wc->refs[level] = 0;
6471 wc->flags[level] = 0; 6650 wc->flags[level] = 0;
@@ -6549,7 +6728,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6549 * blocks are properly updated. 6728 * blocks are properly updated.
6550 */ 6729 */
6551void btrfs_drop_snapshot(struct btrfs_root *root, 6730void btrfs_drop_snapshot(struct btrfs_root *root,
6552 struct btrfs_block_rsv *block_rsv, int update_ref) 6731 struct btrfs_block_rsv *block_rsv, int update_ref,
6732 int for_reloc)
6553{ 6733{
6554 struct btrfs_path *path; 6734 struct btrfs_path *path;
6555 struct btrfs_trans_handle *trans; 6735 struct btrfs_trans_handle *trans;
@@ -6637,6 +6817,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
6637 wc->stage = DROP_REFERENCE; 6817 wc->stage = DROP_REFERENCE;
6638 wc->update_ref = update_ref; 6818 wc->update_ref = update_ref;
6639 wc->keep_locks = 0; 6819 wc->keep_locks = 0;
6820 wc->for_reloc = for_reloc;
6640 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6821 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6641 6822
6642 while (1) { 6823 while (1) {
@@ -6721,6 +6902,7 @@ out:
6721 * drop subtree rooted at tree block 'node'. 6902 * drop subtree rooted at tree block 'node'.
6722 * 6903 *
6723 * NOTE: this function will unlock and release tree block 'node' 6904 * NOTE: this function will unlock and release tree block 'node'
6905 * only used by relocation code
6724 */ 6906 */
6725int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 6907int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6726 struct btrfs_root *root, 6908 struct btrfs_root *root,
@@ -6765,6 +6947,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6765 wc->stage = DROP_REFERENCE; 6947 wc->stage = DROP_REFERENCE;
6766 wc->update_ref = 0; 6948 wc->update_ref = 0;
6767 wc->keep_locks = 1; 6949 wc->keep_locks = 1;
6950 wc->for_reloc = 1;
6768 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6951 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6769 6952
6770 while (1) { 6953 while (1) {
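
walk_control grows a for_reloc field: btrfs_drop_snapshot() passes its new argument through, while btrfs_drop_subtree() hard-codes 1 since, per the updated comment, it is only used by relocation. The field then rides along into the btrfs_inc_ref()/btrfs_dec_ref() calls made during the walk. A toy sketch of carrying a flag in a walk context:

    #include <stdio.h>

    struct walk_control {
        int update_ref;
        int keep_locks;
        int for_reloc;   /* new: relocation-driven walk */
    };

    static void dec_ref(int full_backref, int for_reloc)
    {
        printf("dec_ref full=%d for_reloc=%d\n", full_backref, for_reloc);
    }

    /* every ref update inside the walk sees the caller's intent */
    static void walk_up(struct walk_control *wc, int full_backref)
    {
        dec_ref(full_backref, wc->for_reloc);
    }

    int main(void)
    {
        struct walk_control wc = { .update_ref = 0, .keep_locks = 1,
                                   .for_reloc = 1 }; /* drop_subtree case */
        walk_up(&wc, 0);
        return 0;
    }
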
@@ -6792,6 +6975,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6792 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 6975 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
6793 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 6976 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
6794 6977
6978 if (root->fs_info->balance_ctl) {
6979 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
6980 u64 tgt = 0;
6981
6982 /* pick restriper's target profile and return */
6983 if (flags & BTRFS_BLOCK_GROUP_DATA &&
6984 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6985 tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
6986 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
6987 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6988 tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
6989 } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
6990 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6991 tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
6992 }
6993
6994 if (tgt) {
6995 /* extended -> chunk profile */
6996 tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
6997 return tgt;
6998 }
6999 }
7000
6795 /* 7001 /*
6796 * we add in the count of missing devices because we want 7002 * we add in the count of missing devices because we want
6797 * to make sure that any RAID levels on a degraded FS 7003 * to make sure that any RAID levels on a degraded FS
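
When a balance with a convert filter is running, update_block_group_flags() now short-circuits to the restriper's target profile instead of computing a reduced one, converting from the extended representation back to a chunk profile by clearing BTRFS_AVAIL_ALLOC_BIT_SINGLE. A sketch of that bit manipulation — the constants here are illustrative values, not the kernel's actual bit layout:

    #include <stdint.h>
    #include <stdio.h>

    /* illustrative bit layout only */
    #define BG_DATA        (1ULL << 0)
    #define BG_RAID1       (1ULL << 4)
    #define AVAIL_SINGLE   (1ULL << 48)  /* stand-in for the extended bit */

    /* extended -> chunk profile: drop the "single" marker bit */
    static uint64_t to_chunk_profile(uint64_t tgt)
    {
        return tgt & ~AVAIL_SINGLE;
    }

    int main(void)
    {
        uint64_t tgt = BG_DATA | BG_RAID1 | AVAIL_SINGLE;
        printf("target 0x%llx -> 0x%llx\n",
               (unsigned long long)tgt,
               (unsigned long long)to_chunk_profile(tgt));
        return 0;
    }
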
@@ -7085,7 +7291,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7085 * space to fit our block group in. 7291 * space to fit our block group in.
7086 */ 7292 */
7087 if (device->total_bytes > device->bytes_used + min_free) { 7293 if (device->total_bytes > device->bytes_used + min_free) {
7088 ret = find_free_dev_extent(NULL, device, min_free, 7294 ret = find_free_dev_extent(device, min_free,
7089 &dev_offset, NULL); 7295 &dev_offset, NULL);
7090 if (!ret) 7296 if (!ret)
7091 dev_nr++; 7297 dev_nr++;
@@ -7447,6 +7653,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7447 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7653 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7448 &cache->space_info); 7654 &cache->space_info);
7449 BUG_ON(ret); 7655 BUG_ON(ret);
7656 update_global_block_rsv(root->fs_info);
7450 7657
7451 spin_lock(&cache->space_info->lock); 7658 spin_lock(&cache->space_info->lock);
7452 cache->space_info->bytes_readonly += cache->bytes_super; 7659 cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7466,6 +7673,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7466 return 0; 7673 return 0;
7467} 7674}
7468 7675
7676static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7677{
7678 u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
7679
7680 /* chunk -> extended profile */
7681 if (extra_flags == 0)
7682 extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
7683
7684 if (flags & BTRFS_BLOCK_GROUP_DATA)
7685 fs_info->avail_data_alloc_bits &= ~extra_flags;
7686 if (flags & BTRFS_BLOCK_GROUP_METADATA)
7687 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7688 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7689 fs_info->avail_system_alloc_bits &= ~extra_flags;
7690}
7691
7469int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 7692int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7470 struct btrfs_root *root, u64 group_start) 7693 struct btrfs_root *root, u64 group_start)
7471{ 7694{
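
clear_avail_alloc_bits() is the inverse of the profile bookkeeping done at block-group creation: it masks the group's redundancy bits out of the relevant avail_*_alloc_bits word, treating a profile with no redundancy bits set as "single" via the extended marker bit. A sketch of the masking with simplified constants:

    #include <stdint.h>
    #include <stdio.h>

    #define PROFILE_MASK  0x3fULL        /* illustrative redundancy bits */
    #define AVAIL_SINGLE  (1ULL << 48)   /* illustrative "single" marker */

    static void clear_avail(uint64_t *avail, uint64_t group_flags)
    {
        uint64_t extra = group_flags & PROFILE_MASK;

        if (extra == 0)               /* chunk -> extended profile */
            extra = AVAIL_SINGLE;
        *avail &= ~extra;
    }

    int main(void)
    {
        uint64_t avail = AVAIL_SINGLE | 0x10;
        clear_avail(&avail, 0);       /* removing a "single" group */
        printf("avail now 0x%llx\n", (unsigned long long)avail);
        return 0;
    }
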
@@ -7476,6 +7699,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7476 struct btrfs_key key; 7699 struct btrfs_key key;
7477 struct inode *inode; 7700 struct inode *inode;
7478 int ret; 7701 int ret;
7702 int index;
7479 int factor; 7703 int factor;
7480 7704
7481 root = root->fs_info->extent_root; 7705 root = root->fs_info->extent_root;
@@ -7491,6 +7715,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7491 free_excluded_extents(root, block_group); 7715 free_excluded_extents(root, block_group);
7492 7716
7493 memcpy(&key, &block_group->key, sizeof(key)); 7717 memcpy(&key, &block_group->key, sizeof(key));
7718 index = get_block_group_index(block_group);
7494 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 7719 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7495 BTRFS_BLOCK_GROUP_RAID1 | 7720 BTRFS_BLOCK_GROUP_RAID1 |
7496 BTRFS_BLOCK_GROUP_RAID10)) 7721 BTRFS_BLOCK_GROUP_RAID10))
@@ -7565,6 +7790,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7565 * are still on the list after taking the semaphore 7790 * are still on the list after taking the semaphore
7566 */ 7791 */
7567 list_del_init(&block_group->list); 7792 list_del_init(&block_group->list);
7793 if (list_empty(&block_group->space_info->block_groups[index]))
7794 clear_avail_alloc_bits(root->fs_info, block_group->flags);
7568 up_write(&block_group->space_info->groups_sem); 7795 up_write(&block_group->space_info->groups_sem);
7569 7796
7570 if (block_group->cached == BTRFS_CACHE_STARTED) 7797 if (block_group->cached == BTRFS_CACHE_STARTED)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 49f3c9dc09f4..9d09a4f81875 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -18,6 +18,7 @@
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h" 20#include "volumes.h"
21#include "check-integrity.h"
21 22
22static struct kmem_cache *extent_state_cache; 23static struct kmem_cache *extent_state_cache;
23static struct kmem_cache *extent_buffer_cache; 24static struct kmem_cache *extent_buffer_cache;
@@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1895 } 1896 }
1896 bio->bi_bdev = dev->bdev; 1897 bio->bi_bdev = dev->bdev;
1897 bio_add_page(bio, page, length, start-page_offset(page)); 1898 bio_add_page(bio, page, length, start-page_offset(page));
1898 submit_bio(WRITE_SYNC, bio); 1899 btrfsic_submit_bio(WRITE_SYNC, bio);
1899 wait_for_completion(&compl); 1900 wait_for_completion(&compl);
1900 1901
1901 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 1902 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
2393 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 2394 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2394 mirror_num, bio_flags, start); 2395 mirror_num, bio_flags, start);
2395 else 2396 else
2396 submit_bio(rw, bio); 2397 btrfsic_submit_bio(rw, bio);
2397 2398
2398 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2399 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2399 ret = -EOPNOTSUPP; 2400 ret = -EOPNOTSUPP;
@@ -3579,6 +3580,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3579 atomic_set(&eb->blocking_writers, 0); 3580 atomic_set(&eb->blocking_writers, 0);
3580 atomic_set(&eb->spinning_readers, 0); 3581 atomic_set(&eb->spinning_readers, 0);
3581 atomic_set(&eb->spinning_writers, 0); 3582 atomic_set(&eb->spinning_writers, 0);
3583 eb->lock_nested = 0;
3582 init_waitqueue_head(&eb->write_lock_wq); 3584 init_waitqueue_head(&eb->write_lock_wq);
3583 init_waitqueue_head(&eb->read_lock_wq); 3585 init_waitqueue_head(&eb->read_lock_wq);
3584 3586
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7604c3001322..bc6a042cb6fc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -129,6 +129,7 @@ struct extent_buffer {
129 struct list_head leak_list; 129 struct list_head leak_list;
130 struct rcu_head rcu_head; 130 struct rcu_head rcu_head;
131 atomic_t refs; 131 atomic_t refs;
132 pid_t lock_owner;
132 133
133 /* count of read lock holders on the extent buffer */ 134 /* count of read lock holders on the extent buffer */
134 atomic_t write_locks; 135 atomic_t write_locks;
@@ -137,6 +138,7 @@ struct extent_buffer {
137 atomic_t blocking_readers; 138 atomic_t blocking_readers;
138 atomic_t spinning_readers; 139 atomic_t spinning_readers;
139 atomic_t spinning_writers; 140 atomic_t spinning_writers;
141 int lock_nested;
140 142
141 /* protects write locks */ 143 /* protects write locks */
142 rwlock_t lock; 144 rwlock_t lock;
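
The extent_buffer gains lock_owner and lock_nested so the locking code (reworked in fs/btrfs/locking.c, not shown in this excerpt) can let a task that already holds the write lock take a nested read lock without deadlocking against itself. A userspace sketch of owner-based recursion detection, with pthread primitives standing in for the kernel's and none of the real code's memory-ordering care:

    #include <pthread.h>
    #include <stdio.h>

    struct eb {
        pthread_rwlock_t lock;
        pthread_t lock_owner;   /* valid while write-locked */
        int write_locked;
        int lock_nested;        /* reader is the write-lock owner */
    };

    static void read_lock(struct eb *eb)
    {
        /* current write-lock holder may recurse instead of blocking */
        if (eb->write_locked &&
            pthread_equal(eb->lock_owner, pthread_self())) {
            eb->lock_nested = 1;
            return;
        }
        pthread_rwlock_rdlock(&eb->lock);
    }

    int main(void)
    {
        struct eb eb = { .lock = PTHREAD_RWLOCK_INITIALIZER };
        pthread_rwlock_wrlock(&eb.lock);
        eb.lock_owner = pthread_self();
        eb.write_locked = 1;
        read_lock(&eb);           /* recurses, no deadlock */
        printf("nested=%d\n", eb.lock_nested);
        return 0;
    }
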
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 034d98503229..859ba2dd8890 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -678,7 +678,7 @@ next_slot:
678 disk_bytenr, num_bytes, 0, 678 disk_bytenr, num_bytes, 0,
679 root->root_key.objectid, 679 root->root_key.objectid,
680 new_key.objectid, 680 new_key.objectid,
681 start - extent_offset); 681 start - extent_offset, 0);
682 BUG_ON(ret); 682 BUG_ON(ret);
683 *hint_byte = disk_bytenr; 683 *hint_byte = disk_bytenr;
684 } 684 }
@@ -753,7 +753,7 @@ next_slot:
753 disk_bytenr, num_bytes, 0, 753 disk_bytenr, num_bytes, 0,
754 root->root_key.objectid, 754 root->root_key.objectid,
755 key.objectid, key.offset - 755 key.objectid, key.offset -
756 extent_offset); 756 extent_offset, 0);
757 BUG_ON(ret); 757 BUG_ON(ret);
758 inode_sub_bytes(inode, 758 inode_sub_bytes(inode,
759 extent_end - key.offset); 759 extent_end - key.offset);
@@ -962,7 +962,7 @@ again:
962 962
963 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 963 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
964 root->root_key.objectid, 964 root->root_key.objectid,
965 ino, orig_offset); 965 ino, orig_offset, 0);
966 BUG_ON(ret); 966 BUG_ON(ret);
967 967
968 if (split == start) { 968 if (split == start) {
@@ -989,7 +989,7 @@ again:
989 del_nr++; 989 del_nr++;
990 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 990 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
991 0, root->root_key.objectid, 991 0, root->root_key.objectid,
992 ino, orig_offset); 992 ino, orig_offset, 0);
993 BUG_ON(ret); 993 BUG_ON(ret);
994 } 994 }
995 other_start = 0; 995 other_start = 0;
@@ -1006,7 +1006,7 @@ again:
1006 del_nr++; 1006 del_nr++;
1007 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1007 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1008 0, root->root_key.objectid, 1008 0, root->root_key.objectid,
1009 ino, orig_offset); 1009 ino, orig_offset, 0);
1010 BUG_ON(ret); 1010 BUG_ON(ret);
1011 } 1011 }
1012 if (del_nr == 0) { 1012 if (del_nr == 0) {
@@ -1274,7 +1274,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1274 dirty_pages); 1274 dirty_pages);
1275 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1275 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1276 btrfs_btree_balance_dirty(root, 1); 1276 btrfs_btree_balance_dirty(root, 1);
1277 btrfs_throttle(root);
1278 1277
1279 pos += copied; 1278 pos += copied;
1280 num_written += copied; 1279 num_written += copied;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9a897bf79538..d20ff87ca603 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
319 io_ctl_unmap_page(io_ctl); 319 io_ctl_unmap_page(io_ctl);
320 320
321 for (i = 0; i < io_ctl->num_pages; i++) { 321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]); 322 if (io_ctl->pages[i]) {
323 unlock_page(io_ctl->pages[i]); 323 ClearPageChecked(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]); 324 unlock_page(io_ctl->pages[i]);
325 page_cache_release(io_ctl->pages[i]);
326 }
325 } 327 }
326} 328}
327 329
@@ -635,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
635 if (!num_entries) 637 if (!num_entries)
636 return 0; 638 return 0;
637 639
638 io_ctl_init(&io_ctl, inode, root); 640 ret = io_ctl_init(&io_ctl, inode, root);
641 if (ret)
642 return ret;
643
639 ret = readahead_cache(inode); 644 ret = readahead_cache(inode);
640 if (ret) 645 if (ret)
641 goto out; 646 goto out;
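
io_ctl_init() can now fail, so both callers check its return value, and io_ctl_drop_pages() above tolerates NULL page slots left behind by a partial setup. The general shape of that error-hardened init/teardown pairing, sketched in userspace:

    #include <stdlib.h>

    struct io_ctl { void **pages; int num_pages; };

    static int io_ctl_init(struct io_ctl *io, int n)
    {
        io->num_pages = n;
        io->pages = calloc(n, sizeof(*io->pages)); /* all slots NULL */
        return io->pages ? 0 : -1;                 /* caller must check */
    }

    static void io_ctl_drop_pages(struct io_ctl *io)
    {
        for (int i = 0; i < io->num_pages; i++)
            if (io->pages[i])      /* skip slots never populated */
                free(io->pages[i]);
        free(io->pages);
    }

    int main(void)
    {
        struct io_ctl io;
        if (io_ctl_init(&io, 4))
            return 1;              /* propagate, as the callers now do */
        io_ctl_drop_pages(&io);
        return 0;
    }
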
@@ -838,7 +843,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
838 struct io_ctl io_ctl; 843 struct io_ctl io_ctl;
839 struct list_head bitmap_list; 844 struct list_head bitmap_list;
840 struct btrfs_key key; 845 struct btrfs_key key;
841 u64 start, end, len; 846 u64 start, extent_start, extent_end, len;
842 int entries = 0; 847 int entries = 0;
843 int bitmaps = 0; 848 int bitmaps = 0;
844 int ret; 849 int ret;
@@ -849,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
849 if (!i_size_read(inode)) 854 if (!i_size_read(inode))
850 return -1; 855 return -1;
851 856
852 io_ctl_init(&io_ctl, inode, root); 857 ret = io_ctl_init(&io_ctl, inode, root);
858 if (ret)
859 return -1;
853 860
854 /* Get the cluster for this block_group if it exists */ 861 /* Get the cluster for this block_group if it exists */
855 if (block_group && !list_empty(&block_group->cluster_list)) 862 if (block_group && !list_empty(&block_group->cluster_list))
@@ -857,25 +864,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
857 struct btrfs_free_cluster, 864 struct btrfs_free_cluster,
858 block_group_list); 865 block_group_list);
859 866
860 /*
861 * We shouldn't have switched the pinned extents yet so this is the
862 * right one
863 */
864 unpin = root->fs_info->pinned_extents;
865
866 /* Lock all pages first so we can lock the extent safely. */ 867 /* Lock all pages first so we can lock the extent safely. */
867 io_ctl_prepare_pages(&io_ctl, inode, 0); 868 io_ctl_prepare_pages(&io_ctl, inode, 0);
868 869
869 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 870 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
870 0, &cached_state, GFP_NOFS); 871 0, &cached_state, GFP_NOFS);
871 872
872 /*
873 * When searching for pinned extents, we need to start at our start
874 * offset.
875 */
876 if (block_group)
877 start = block_group->key.objectid;
878
879 node = rb_first(&ctl->free_space_offset); 873 node = rb_first(&ctl->free_space_offset);
880 if (!node && cluster) { 874 if (!node && cluster) {
881 node = rb_first(&cluster->root); 875 node = rb_first(&cluster->root);
@@ -918,9 +912,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
918 * We want to add any pinned extents to our free space cache 912 * We want to add any pinned extents to our free space cache
919 * so we don't leak the space 913 * so we don't leak the space
920 */ 914 */
915
916 /*
917 * We shouldn't have switched the pinned extents yet so this is the
918 * right one
919 */
920 unpin = root->fs_info->pinned_extents;
921
922 if (block_group)
923 start = block_group->key.objectid;
924
921 while (block_group && (start < block_group->key.objectid + 925 while (block_group && (start < block_group->key.objectid +
922 block_group->key.offset)) { 926 block_group->key.offset)) {
923 ret = find_first_extent_bit(unpin, start, &start, &end, 927 ret = find_first_extent_bit(unpin, start,
928 &extent_start, &extent_end,
924 EXTENT_DIRTY); 929 EXTENT_DIRTY);
925 if (ret) { 930 if (ret) {
926 ret = 0; 931 ret = 0;
@@ -928,20 +933,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
928 } 933 }
929 934
930 /* This pinned extent is out of our range */ 935 /* This pinned extent is out of our range */
931 if (start >= block_group->key.objectid + 936 if (extent_start >= block_group->key.objectid +
932 block_group->key.offset) 937 block_group->key.offset)
933 break; 938 break;
934 939
935 len = block_group->key.objectid + 940 extent_start = max(extent_start, start);
936 block_group->key.offset - start; 941 extent_end = min(block_group->key.objectid +
937 len = min(len, end + 1 - start); 942 block_group->key.offset, extent_end + 1);
943 len = extent_end - extent_start;
938 944
939 entries++; 945 entries++;
940 ret = io_ctl_add_entry(&io_ctl, start, len, NULL); 946 ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
941 if (ret) 947 if (ret)
942 goto out_nospc; 948 goto out_nospc;
943 949
944 start = end + 1; 950 start = extent_end;
945 } 951 }
946 952
947 /* Write out the bitmaps */ 953 /* Write out the bitmaps */
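
The pinned-extent walk previously reused start/end directly from find_first_extent_bit(), which could emit cache entries extending past the block group. The rewrite clamps each hit to the intersection of the extent and the group before writing it out. The clamping arithmetic, in isolation:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* intersect [ext_start, ext_end] with [start, group_end) */
    static uint64_t clamp_len(uint64_t start, uint64_t group_end,
                              uint64_t *ext_start, uint64_t ext_end)
    {
        *ext_start = MAX(*ext_start, start);
        ext_end = MIN(group_end, ext_end + 1);
        return ext_end - *ext_start;
    }

    int main(void)
    {
        uint64_t es = 0, len;
        /* extent [0,8191] against group cursor 4096, group end 8192 */
        len = clamp_len(4096, 8192, &es, 8191);
        printf("entry at %llu len %llu\n",
               (unsigned long long)es, (unsigned long long)len);
        return 0;
    }
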
@@ -2283,23 +2289,23 @@ out:
2283static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, 2289static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2284 struct btrfs_free_space *entry, 2290 struct btrfs_free_space *entry,
2285 struct btrfs_free_cluster *cluster, 2291 struct btrfs_free_cluster *cluster,
2286 u64 offset, u64 bytes, u64 min_bytes) 2292 u64 offset, u64 bytes,
2293 u64 cont1_bytes, u64 min_bytes)
2287{ 2294{
2288 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2295 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2289 unsigned long next_zero; 2296 unsigned long next_zero;
2290 unsigned long i; 2297 unsigned long i;
2291 unsigned long search_bits; 2298 unsigned long want_bits;
2292 unsigned long total_bits; 2299 unsigned long min_bits;
2293 unsigned long found_bits; 2300 unsigned long found_bits;
2294 unsigned long start = 0; 2301 unsigned long start = 0;
2295 unsigned long total_found = 0; 2302 unsigned long total_found = 0;
2296 int ret; 2303 int ret;
2297 bool found = false;
2298 2304
2299 i = offset_to_bit(entry->offset, block_group->sectorsize, 2305 i = offset_to_bit(entry->offset, block_group->sectorsize,
2300 max_t(u64, offset, entry->offset)); 2306 max_t(u64, offset, entry->offset));
2301 search_bits = bytes_to_bits(bytes, block_group->sectorsize); 2307 want_bits = bytes_to_bits(bytes, block_group->sectorsize);
2302 total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2308 min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
2303 2309
2304again: 2310again:
2305 found_bits = 0; 2311 found_bits = 0;
@@ -2308,7 +2314,7 @@ again:
2308 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { 2314 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
2309 next_zero = find_next_zero_bit(entry->bitmap, 2315 next_zero = find_next_zero_bit(entry->bitmap,
2310 BITS_PER_BITMAP, i); 2316 BITS_PER_BITMAP, i);
2311 if (next_zero - i >= search_bits) { 2317 if (next_zero - i >= min_bits) {
2312 found_bits = next_zero - i; 2318 found_bits = next_zero - i;
2313 break; 2319 break;
2314 } 2320 }
@@ -2318,10 +2324,9 @@ again:
2318 if (!found_bits) 2324 if (!found_bits)
2319 return -ENOSPC; 2325 return -ENOSPC;
2320 2326
2321 if (!found) { 2327 if (!total_found) {
2322 start = i; 2328 start = i;
2323 cluster->max_size = 0; 2329 cluster->max_size = 0;
2324 found = true;
2325 } 2330 }
2326 2331
2327 total_found += found_bits; 2332 total_found += found_bits;
@@ -2329,13 +2334,8 @@ again:
2329 if (cluster->max_size < found_bits * block_group->sectorsize) 2334 if (cluster->max_size < found_bits * block_group->sectorsize)
2330 cluster->max_size = found_bits * block_group->sectorsize; 2335 cluster->max_size = found_bits * block_group->sectorsize;
2331 2336
2332 if (total_found < total_bits) { 2337 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2333 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); 2338 i = next_zero + 1;
2334 if (i - start > total_bits * 2) {
2335 total_found = 0;
2336 cluster->max_size = 0;
2337 found = false;
2338 }
2339 goto again; 2339 goto again;
2340 } 2340 }
2341 2341
@@ -2346,28 +2346,31 @@ again:
2346 &entry->offset_index, 1); 2346 &entry->offset_index, 1);
2347 BUG_ON(ret); 2347 BUG_ON(ret);
2348 2348
2349 trace_btrfs_setup_cluster(block_group, cluster,
2350 total_found * block_group->sectorsize, 1);
2349 return 0; 2351 return 0;
2350} 2352}
2351 2353
2352/* 2354/*
2353 * This searches the block group for just extents to fill the cluster with. 2355 * This searches the block group for just extents to fill the cluster with.
2356 * Try to find a cluster with at least bytes total bytes, at least one
2357 * extent of cont1_bytes, and other clusters of at least min_bytes.
2354 */ 2358 */
2355static noinline int 2359static noinline int
2356setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, 2360setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2357 struct btrfs_free_cluster *cluster, 2361 struct btrfs_free_cluster *cluster,
2358 struct list_head *bitmaps, u64 offset, u64 bytes, 2362 struct list_head *bitmaps, u64 offset, u64 bytes,
2359 u64 min_bytes) 2363 u64 cont1_bytes, u64 min_bytes)
2360{ 2364{
2361 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2365 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2362 struct btrfs_free_space *first = NULL; 2366 struct btrfs_free_space *first = NULL;
2363 struct btrfs_free_space *entry = NULL; 2367 struct btrfs_free_space *entry = NULL;
2364 struct btrfs_free_space *prev = NULL;
2365 struct btrfs_free_space *last; 2368 struct btrfs_free_space *last;
2366 struct rb_node *node; 2369 struct rb_node *node;
2367 u64 window_start; 2370 u64 window_start;
2368 u64 window_free; 2371 u64 window_free;
2369 u64 max_extent; 2372 u64 max_extent;
2370 u64 max_gap = 128 * 1024; 2373 u64 total_size = 0;
2371 2374
2372 entry = tree_search_offset(ctl, offset, 0, 1); 2375 entry = tree_search_offset(ctl, offset, 0, 1);
2373 if (!entry) 2376 if (!entry)
@@ -2377,8 +2380,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2377 * We don't want bitmaps, so just move along until we find a normal 2380 * We don't want bitmaps, so just move along until we find a normal
2378 * extent entry. 2381 * extent entry.
2379 */ 2382 */
2380 while (entry->bitmap) { 2383 while (entry->bitmap || entry->bytes < min_bytes) {
2381 if (list_empty(&entry->list)) 2384 if (entry->bitmap && list_empty(&entry->list))
2382 list_add_tail(&entry->list, bitmaps); 2385 list_add_tail(&entry->list, bitmaps);
2383 node = rb_next(&entry->offset_index); 2386 node = rb_next(&entry->offset_index);
2384 if (!node) 2387 if (!node)
@@ -2391,12 +2394,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2391 max_extent = entry->bytes; 2394 max_extent = entry->bytes;
2392 first = entry; 2395 first = entry;
2393 last = entry; 2396 last = entry;
2394 prev = entry;
2395 2397
2396 while (window_free <= min_bytes) { 2398 for (node = rb_next(&entry->offset_index); node;
2397 node = rb_next(&entry->offset_index); 2399 node = rb_next(&entry->offset_index)) {
2398 if (!node)
2399 return -ENOSPC;
2400 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2400 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2401 2401
2402 if (entry->bitmap) { 2402 if (entry->bitmap) {
@@ -2405,26 +2405,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2405 continue; 2405 continue;
2406 } 2406 }
2407 2407
2408 /* 2408 if (entry->bytes < min_bytes)
2409 * we haven't filled the empty size and the window is 2409 continue;
2410 * very large. reset and try again 2410
2411 */ 2411 last = entry;
2412 if (entry->offset - (prev->offset + prev->bytes) > max_gap || 2412 window_free += entry->bytes;
2413 entry->offset - window_start > (min_bytes * 2)) { 2413 if (entry->bytes > max_extent)
2414 first = entry;
2415 window_start = entry->offset;
2416 window_free = entry->bytes;
2417 last = entry;
2418 max_extent = entry->bytes; 2414 max_extent = entry->bytes;
2419 } else {
2420 last = entry;
2421 window_free += entry->bytes;
2422 if (entry->bytes > max_extent)
2423 max_extent = entry->bytes;
2424 }
2425 prev = entry;
2426 } 2415 }
2427 2416
2417 if (window_free < bytes || max_extent < cont1_bytes)
2418 return -ENOSPC;
2419
2428 cluster->window_start = first->offset; 2420 cluster->window_start = first->offset;
2429 2421
2430 node = &first->offset_index; 2422 node = &first->offset_index;
@@ -2438,17 +2430,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2438 2430
2439 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2431 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2440 node = rb_next(&entry->offset_index); 2432 node = rb_next(&entry->offset_index);
2441 if (entry->bitmap) 2433 if (entry->bitmap || entry->bytes < min_bytes)
2442 continue; 2434 continue;
2443 2435
2444 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2436 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2445 ret = tree_insert_offset(&cluster->root, entry->offset, 2437 ret = tree_insert_offset(&cluster->root, entry->offset,
2446 &entry->offset_index, 0); 2438 &entry->offset_index, 0);
2439 total_size += entry->bytes;
2447 BUG_ON(ret); 2440 BUG_ON(ret);
2448 } while (node && entry != last); 2441 } while (node && entry != last);
2449 2442
2450 cluster->max_size = max_extent; 2443 cluster->max_size = max_extent;
2451 2444 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2452 return 0; 2445 return 0;
2453} 2446}
2454 2447
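
setup_cluster_no_bitmap() no longer resets the window on large gaps: it scans every extent entry of at least min_bytes once, accumulating window_free and tracking max_extent, and succeeds only if the window totals at least bytes with one extent of at least cont1_bytes. The core of that single-pass scan, reduced to plain arrays:

    #include <stdint.h>
    #include <stdio.h>

    /* one pass over candidate extent sizes; no window resets */
    static int scan(const uint64_t *sizes, int n, uint64_t min_bytes,
                    uint64_t bytes, uint64_t cont1_bytes)
    {
        uint64_t window_free = 0, max_extent = 0;

        for (int i = 0; i < n; i++) {
            if (sizes[i] < min_bytes)
                continue;            /* too small to be worth tracking */
            window_free += sizes[i];
            if (sizes[i] > max_extent)
                max_extent = sizes[i];
        }
        if (window_free < bytes || max_extent < cont1_bytes)
            return -1;               /* -ENOSPC in the kernel */
        return 0;
    }

    int main(void)
    {
        uint64_t sizes[] = { 4096, 131072, 8192 };
        printf("%d\n", scan(sizes, 3, 4096, 65536, 131072)); /* 0 */
        return 0;
    }
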
@@ -2460,7 +2453,7 @@ static noinline int
2460setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, 2453setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2461 struct btrfs_free_cluster *cluster, 2454 struct btrfs_free_cluster *cluster,
2462 struct list_head *bitmaps, u64 offset, u64 bytes, 2455 struct list_head *bitmaps, u64 offset, u64 bytes,
2463 u64 min_bytes) 2456 u64 cont1_bytes, u64 min_bytes)
2464{ 2457{
2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2458 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2466 struct btrfs_free_space *entry; 2459 struct btrfs_free_space *entry;
@@ -2485,7 +2478,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2485 if (entry->bytes < min_bytes) 2478 if (entry->bytes < min_bytes)
2486 continue; 2479 continue;
2487 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 2480 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2488 bytes, min_bytes); 2481 bytes, cont1_bytes, min_bytes);
2489 if (!ret) 2482 if (!ret)
2490 return 0; 2483 return 0;
2491 } 2484 }
@@ -2499,7 +2492,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2499 2492
2500/* 2493/*
2501 * here we try to find a cluster of blocks in a block group. The goal 2494 * here we try to find a cluster of blocks in a block group. The goal
2502 * is to find at least bytes free and up to empty_size + bytes free. 2495 * is to find at least bytes+empty_size.
2503 * We might not find them all in one contiguous area. 2496 * We might not find them all in one contiguous area.
2504 * 2497 *
2505 * returns zero and sets up cluster if things worked out, otherwise 2498 * returns zero and sets up cluster if things worked out, otherwise
@@ -2515,23 +2508,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2515 struct btrfs_free_space *entry, *tmp; 2508 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps); 2509 LIST_HEAD(bitmaps);
2517 u64 min_bytes; 2510 u64 min_bytes;
2511 u64 cont1_bytes;
2518 int ret; 2512 int ret;
2519 2513
2520 /* for metadata, allow allocates with more holes */ 2514 /*
2515 * Choose the minimum extent size we'll require for this
2516 * cluster. For SSD_SPREAD, don't allow any fragmentation.
2517 * For metadata, allow allocates with smaller extents. For
2518 * data, keep it dense.
2519 */
2521 if (btrfs_test_opt(root, SSD_SPREAD)) { 2520 if (btrfs_test_opt(root, SSD_SPREAD)) {
2522 min_bytes = bytes + empty_size; 2521 cont1_bytes = min_bytes = bytes + empty_size;
2523 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 2522 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2524 /* 2523 cont1_bytes = bytes;
2525 * we want to do larger allocations when we are 2524 min_bytes = block_group->sectorsize;
2526 * flushing out the delayed refs, it helps prevent 2525 } else {
2527 * making more work as we go along. 2526 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
2528 */ 2527 min_bytes = block_group->sectorsize;
2529 if (trans->transaction->delayed_refs.flushing) 2528 }
2530 min_bytes = max(bytes, (bytes + empty_size) >> 1);
2531 else
2532 min_bytes = max(bytes, (bytes + empty_size) >> 4);
2533 } else
2534 min_bytes = max(bytes, (bytes + empty_size) >> 2);
2535 2529
2536 spin_lock(&ctl->tree_lock); 2530 spin_lock(&ctl->tree_lock);
2537 2531
@@ -2539,7 +2533,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2539 * If we know we don't have enough space to make a cluster don't even 2533 * If we know we don't have enough space to make a cluster don't even
2540 * bother doing all the work to try and find one. 2534 * bother doing all the work to try and find one.
2541 */ 2535 */
2542 if (ctl->free_space < min_bytes) { 2536 if (ctl->free_space < bytes) {
2543 spin_unlock(&ctl->tree_lock); 2537 spin_unlock(&ctl->tree_lock);
2544 return -ENOSPC; 2538 return -ENOSPC;
2545 } 2539 }
@@ -2552,11 +2546,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2552 goto out; 2546 goto out;
2553 } 2547 }
2554 2548
2549 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
2550 min_bytes);
2551
2552 INIT_LIST_HEAD(&bitmaps);
2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2553 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2556 bytes, min_bytes); 2554 bytes + empty_size,
2555 cont1_bytes, min_bytes);
2557 if (ret) 2556 if (ret)
2558 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 2557 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
2559 offset, bytes, min_bytes); 2558 offset, bytes + empty_size,
2559 cont1_bytes, min_bytes);
2560 2560
2561 /* Clear our temporary list */ 2561 /* Clear our temporary list */
2562 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 2562 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
@@ -2567,6 +2567,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2567 list_add_tail(&cluster->block_group_list, 2567 list_add_tail(&cluster->block_group_list,
2568 &block_group->cluster_list); 2568 &block_group->cluster_list);
2569 cluster->block_group = block_group; 2569 cluster->block_group = block_group;
2570 } else {
2571 trace_btrfs_failed_cluster_setup(block_group);
2570 } 2572 }
2571out: 2573out:
2572 spin_unlock(&cluster->lock); 2574 spin_unlock(&cluster->lock);
@@ -2588,17 +2590,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2588 cluster->block_group = NULL; 2590 cluster->block_group = NULL;
2589} 2591}
2590 2592
2591int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, 2593static int do_trimming(struct btrfs_block_group_cache *block_group,
2592 u64 *trimmed, u64 start, u64 end, u64 minlen) 2594 u64 *total_trimmed, u64 start, u64 bytes,
2595 u64 reserved_start, u64 reserved_bytes)
2593{ 2596{
2594 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2597 struct btrfs_space_info *space_info = block_group->space_info;
2595 struct btrfs_free_space *entry = NULL;
2596 struct btrfs_fs_info *fs_info = block_group->fs_info; 2598 struct btrfs_fs_info *fs_info = block_group->fs_info;
2597 u64 bytes = 0; 2599 int ret;
2598 u64 actually_trimmed; 2600 int update = 0;
2599 int ret = 0; 2601 u64 trimmed = 0;
2600 2602
2601 *trimmed = 0; 2603 spin_lock(&space_info->lock);
2604 spin_lock(&block_group->lock);
2605 if (!block_group->ro) {
2606 block_group->reserved += reserved_bytes;
2607 space_info->bytes_reserved += reserved_bytes;
2608 update = 1;
2609 }
2610 spin_unlock(&block_group->lock);
2611 spin_unlock(&space_info->lock);
2612
2613 ret = btrfs_error_discard_extent(fs_info->extent_root,
2614 start, bytes, &trimmed);
2615 if (!ret)
2616 *total_trimmed += trimmed;
2617
2618 btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
2619
2620 if (update) {
2621 spin_lock(&space_info->lock);
2622 spin_lock(&block_group->lock);
2623 if (block_group->ro)
2624 space_info->bytes_readonly += reserved_bytes;
2625 block_group->reserved -= reserved_bytes;
2626 space_info->bytes_reserved -= reserved_bytes;
2627 spin_unlock(&space_info->lock);
2628 spin_unlock(&block_group->lock);
2629 }
2630
2631 return ret;
2632}
2633
2634static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2635 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
2636{
2637 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2638 struct btrfs_free_space *entry;
2639 struct rb_node *node;
2640 int ret = 0;
2641 u64 extent_start;
2642 u64 extent_bytes;
2643 u64 bytes;
2602 2644
2603 while (start < end) { 2645 while (start < end) {
2604 spin_lock(&ctl->tree_lock); 2646 spin_lock(&ctl->tree_lock);
@@ -2609,81 +2651,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2609 } 2651 }
2610 2652
2611 entry = tree_search_offset(ctl, start, 0, 1); 2653 entry = tree_search_offset(ctl, start, 0, 1);
2612 if (!entry) 2654 if (!entry) {
2613 entry = tree_search_offset(ctl,
2614 offset_to_bitmap(ctl, start),
2615 1, 1);
2616
2617 if (!entry || entry->offset >= end) {
2618 spin_unlock(&ctl->tree_lock); 2655 spin_unlock(&ctl->tree_lock);
2619 break; 2656 break;
2620 } 2657 }
2621 2658
2622 if (entry->bitmap) { 2659 /* skip bitmaps */
2623 ret = search_bitmap(ctl, entry, &start, &bytes); 2660 while (entry->bitmap) {
2624 if (!ret) { 2661 node = rb_next(&entry->offset_index);
2625 if (start >= end) { 2662 if (!node) {
2626 spin_unlock(&ctl->tree_lock);
2627 break;
2628 }
2629 bytes = min(bytes, end - start);
2630 bitmap_clear_bits(ctl, entry, start, bytes);
2631 if (entry->bytes == 0)
2632 free_bitmap(ctl, entry);
2633 } else {
2634 start = entry->offset + BITS_PER_BITMAP *
2635 block_group->sectorsize;
2636 spin_unlock(&ctl->tree_lock); 2663 spin_unlock(&ctl->tree_lock);
2637 ret = 0; 2664 goto out;
2638 continue;
2639 } 2665 }
2640 } else { 2666 entry = rb_entry(node, struct btrfs_free_space,
2641 start = entry->offset; 2667 offset_index);
2642 bytes = min(entry->bytes, end - start);
2643 unlink_free_space(ctl, entry);
2644 kmem_cache_free(btrfs_free_space_cachep, entry);
2645 } 2668 }
2646 2669
2670 if (entry->offset >= end) {
2671 spin_unlock(&ctl->tree_lock);
2672 break;
2673 }
2674
2675 extent_start = entry->offset;
2676 extent_bytes = entry->bytes;
2677 start = max(start, extent_start);
2678 bytes = min(extent_start + extent_bytes, end) - start;
2679 if (bytes < minlen) {
2680 spin_unlock(&ctl->tree_lock);
2681 goto next;
2682 }
2683
2684 unlink_free_space(ctl, entry);
2685 kmem_cache_free(btrfs_free_space_cachep, entry);
2686
2647 spin_unlock(&ctl->tree_lock); 2687 spin_unlock(&ctl->tree_lock);
2648 2688
2649 if (bytes >= minlen) { 2689 ret = do_trimming(block_group, total_trimmed, start, bytes,
2650 struct btrfs_space_info *space_info; 2690 extent_start, extent_bytes);
2651 int update = 0; 2691 if (ret)
2652 2692 break;
2653 space_info = block_group->space_info; 2693next:
2654 spin_lock(&space_info->lock); 2694 start += bytes;
2655 spin_lock(&block_group->lock);
2656 if (!block_group->ro) {
2657 block_group->reserved += bytes;
2658 space_info->bytes_reserved += bytes;
2659 update = 1;
2660 }
2661 spin_unlock(&block_group->lock);
2662 spin_unlock(&space_info->lock);
2663
2664 ret = btrfs_error_discard_extent(fs_info->extent_root,
2665 start,
2666 bytes,
2667 &actually_trimmed);
2668
2669 btrfs_add_free_space(block_group, start, bytes);
2670 if (update) {
2671 spin_lock(&space_info->lock);
2672 spin_lock(&block_group->lock);
2673 if (block_group->ro)
2674 space_info->bytes_readonly += bytes;
2675 block_group->reserved -= bytes;
2676 space_info->bytes_reserved -= bytes;
2677 spin_unlock(&space_info->lock);
2678 spin_unlock(&block_group->lock);
2679 }
2680 2695
2681 if (ret) 2696 if (fatal_signal_pending(current)) {
2682 break; 2697 ret = -ERESTARTSYS;
2683 *trimmed += actually_trimmed; 2698 break;
2699 }
2700
2701 cond_resched();
2702 }
2703out:
2704 return ret;
2705}
2706
2707static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
2708 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
2709{
2710 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2711 struct btrfs_free_space *entry;
2712 int ret = 0;
2713 int ret2;
2714 u64 bytes;
2715 u64 offset = offset_to_bitmap(ctl, start);
2716
2717 while (offset < end) {
2718 bool next_bitmap = false;
2719
2720 spin_lock(&ctl->tree_lock);
2721
2722 if (ctl->free_space < minlen) {
2723 spin_unlock(&ctl->tree_lock);
2724 break;
2725 }
2726
2727 entry = tree_search_offset(ctl, offset, 1, 0);
2728 if (!entry) {
2729 spin_unlock(&ctl->tree_lock);
2730 next_bitmap = true;
2731 goto next;
2732 }
2733
2734 bytes = minlen;
2735 ret2 = search_bitmap(ctl, entry, &start, &bytes);
2736 if (ret2 || start >= end) {
2737 spin_unlock(&ctl->tree_lock);
2738 next_bitmap = true;
2739 goto next;
2740 }
2741
2742 bytes = min(bytes, end - start);
2743 if (bytes < minlen) {
2744 spin_unlock(&ctl->tree_lock);
2745 goto next;
2746 }
2747
2748 bitmap_clear_bits(ctl, entry, start, bytes);
2749 if (entry->bytes == 0)
2750 free_bitmap(ctl, entry);
2751
2752 spin_unlock(&ctl->tree_lock);
2753
2754 ret = do_trimming(block_group, total_trimmed, start, bytes,
2755 start, bytes);
2756 if (ret)
2757 break;
2758next:
2759 if (next_bitmap) {
2760 offset += BITS_PER_BITMAP * ctl->unit;
2761 } else {
2762 start += bytes;
2763 if (start >= offset + BITS_PER_BITMAP * ctl->unit)
2764 offset += BITS_PER_BITMAP * ctl->unit;
2684 } 2765 }
2685 start += bytes;
2686 bytes = 0;
2687 2766
2688 if (fatal_signal_pending(current)) { 2767 if (fatal_signal_pending(current)) {
2689 ret = -ERESTARTSYS; 2768 ret = -ERESTARTSYS;
@@ -2696,6 +2775,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2696 return ret; 2775 return ret;
2697} 2776}
2698 2777
2778int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2779 u64 *trimmed, u64 start, u64 end, u64 minlen)
2780{
2781 int ret;
2782
2783 *trimmed = 0;
2784
2785 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
2786 if (ret)
2787 return ret;
2788
2789 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
2790
2791 return ret;
2792}
2793
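From userspace, the trim path above is normally reached through the generic FITRIM ioctl rather than any btrfs-specific interface; btrfs_trim_block_group() is invoked once per block group from that handler. A minimal sketch, assuming a btrfs filesystem mounted at /mnt/btrfs (the path and minlen value are illustrative only):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	struct fstrim_range range;
	int fd = open("/mnt/btrfs", O_RDONLY);	/* any path on the fs */

	if (fd < 0)
		return 1;
	memset(&range, 0, sizeof(range));
	range.len = (__u64)-1;		/* trim the whole filesystem */
	range.minlen = 4096;		/* skip free-space runs below this */
	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");
	else	/* the kernel writes back the bytes actually discarded */
		printf("trimmed %llu bytes\n",
		       (unsigned long long)range.len);
	close(fd);
	return 0;
}

The per-block-group totals accumulated in *trimmed above are what end up summed into range.len.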
2699/* 2794/*
2700 * Find the left-most item in the cache tree, and then return the 2795 * Find the left-most item in the cache tree, and then return the
2701 * smallest inode number in the item. 2796 * smallest inode number in the item.
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f8962a957d65..213ffa86ce1b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
438 trans->bytes_reserved); 438 trans->bytes_reserved);
439 if (ret) 439 if (ret)
440 goto out; 440 goto out;
441 trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
442 trans->bytes_reserved, 1);
441again: 443again:
442 inode = lookup_free_ino_inode(root, path); 444 inode = lookup_free_ino_inode(root, path);
443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 445 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -498,6 +500,8 @@ again:
498out_put: 500out_put:
499 iput(inode); 501 iput(inode);
500out_release: 502out_release:
503 trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
504 trans->bytes_reserved, 0);
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 505 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
502out: 506out:
503 trans->block_rsv = rsv; 507 trans->block_rsv = rsv;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81b235a61f8c..0da19a0ea00d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1951,12 +1951,28 @@ enum btrfs_orphan_cleanup_state {
1951void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 1951void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
1952 struct btrfs_root *root) 1952 struct btrfs_root *root)
1953{ 1953{
1954 struct btrfs_block_rsv *block_rsv;
1954 int ret; 1955 int ret;
1955 1956
1956 if (!list_empty(&root->orphan_list) || 1957 if (!list_empty(&root->orphan_list) ||
1957 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 1958 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
1958 return; 1959 return;
1959 1960
1961 spin_lock(&root->orphan_lock);
1962 if (!list_empty(&root->orphan_list)) {
1963 spin_unlock(&root->orphan_lock);
1964 return;
1965 }
1966
1967 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
1968 spin_unlock(&root->orphan_lock);
1969 return;
1970 }
1971
1972 block_rsv = root->orphan_block_rsv;
1973 root->orphan_block_rsv = NULL;
1974 spin_unlock(&root->orphan_lock);
1975
1960 if (root->orphan_item_inserted && 1976 if (root->orphan_item_inserted &&
1961 btrfs_root_refs(&root->root_item) > 0) { 1977 btrfs_root_refs(&root->root_item) > 0) {
1962 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 1978 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
@@ -1965,10 +1981,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
1965 root->orphan_item_inserted = 0; 1981 root->orphan_item_inserted = 0;
1966 } 1982 }
1967 1983
1968 if (root->orphan_block_rsv) { 1984 if (block_rsv) {
1969 WARN_ON(root->orphan_block_rsv->size > 0); 1985 WARN_ON(block_rsv->size > 0);
1970 btrfs_free_block_rsv(root, root->orphan_block_rsv); 1986 btrfs_free_block_rsv(root, block_rsv);
1971 root->orphan_block_rsv = NULL;
1972 } 1987 }
1973} 1988}
1974 1989
@@ -2224,14 +2239,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2224 continue; 2239 continue;
2225 } 2240 }
2226 nr_truncate++; 2241 nr_truncate++;
2227 /*
2228 * Need to hold the imutex for reservation purposes, not
2229 * a huge deal here but I have a WARN_ON in
2230 * btrfs_delalloc_reserve_space to catch offenders.
2231 */
2232 mutex_lock(&inode->i_mutex);
2233 ret = btrfs_truncate(inode); 2242 ret = btrfs_truncate(inode);
2234 mutex_unlock(&inode->i_mutex);
2235 } else { 2243 } else {
2236 nr_unlink++; 2244 nr_unlink++;
2237 } 2245 }
@@ -2845,7 +2853,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2845 BUG_ON(!root->fs_info->enospc_unlink); 2853 BUG_ON(!root->fs_info->enospc_unlink);
2846 root->fs_info->enospc_unlink = 0; 2854 root->fs_info->enospc_unlink = 0;
2847 } 2855 }
2848 btrfs_end_transaction_throttle(trans, root); 2856 btrfs_end_transaction(trans, root);
2849} 2857}
2850 2858
2851static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2859static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3009,7 +3017,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3009 int pending_del_nr = 0; 3017 int pending_del_nr = 0;
3010 int pending_del_slot = 0; 3018 int pending_del_slot = 0;
3011 int extent_type = -1; 3019 int extent_type = -1;
3012 int encoding;
3013 int ret; 3020 int ret;
3014 int err = 0; 3021 int err = 0;
3015 u64 ino = btrfs_ino(inode); 3022 u64 ino = btrfs_ino(inode);
@@ -3059,7 +3066,6 @@ search_again:
3059 leaf = path->nodes[0]; 3066 leaf = path->nodes[0];
3060 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3067 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3061 found_type = btrfs_key_type(&found_key); 3068 found_type = btrfs_key_type(&found_key);
3062 encoding = 0;
3063 3069
3064 if (found_key.objectid != ino) 3070 if (found_key.objectid != ino)
3065 break; 3071 break;
@@ -3072,10 +3078,6 @@ search_again:
3072 fi = btrfs_item_ptr(leaf, path->slots[0], 3078 fi = btrfs_item_ptr(leaf, path->slots[0],
3073 struct btrfs_file_extent_item); 3079 struct btrfs_file_extent_item);
3074 extent_type = btrfs_file_extent_type(leaf, fi); 3080 extent_type = btrfs_file_extent_type(leaf, fi);
3075 encoding = btrfs_file_extent_compression(leaf, fi);
3076 encoding |= btrfs_file_extent_encryption(leaf, fi);
3077 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
3078
3079 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3081 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3080 item_end += 3082 item_end +=
3081 btrfs_file_extent_num_bytes(leaf, fi); 3083 btrfs_file_extent_num_bytes(leaf, fi);
@@ -3103,7 +3105,7 @@ search_again:
3103 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3105 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3104 u64 num_dec; 3106 u64 num_dec;
3105 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3107 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
3106 if (!del_item && !encoding) { 3108 if (!del_item) {
3107 u64 orig_num_bytes = 3109 u64 orig_num_bytes =
3108 btrfs_file_extent_num_bytes(leaf, fi); 3110 btrfs_file_extent_num_bytes(leaf, fi);
3109 extent_num_bytes = new_size - 3111 extent_num_bytes = new_size -
@@ -3179,7 +3181,7 @@ delete:
3179 ret = btrfs_free_extent(trans, root, extent_start, 3181 ret = btrfs_free_extent(trans, root, extent_start,
3180 extent_num_bytes, 0, 3182 extent_num_bytes, 0,
3181 btrfs_header_owner(leaf), 3183 btrfs_header_owner(leaf),
3182 ino, extent_offset); 3184 ino, extent_offset, 0);
3183 BUG_ON(ret); 3185 BUG_ON(ret);
3184 } 3186 }
3185 3187
@@ -3434,7 +3436,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3434 i_size_write(inode, newsize); 3436 i_size_write(inode, newsize);
3435 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 3437 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3436 ret = btrfs_update_inode(trans, root, inode); 3438 ret = btrfs_update_inode(trans, root, inode);
3437 btrfs_end_transaction_throttle(trans, root); 3439 btrfs_end_transaction(trans, root);
3438 } else { 3440 } else {
3439 3441
3440 /* 3442 /*
@@ -4655,7 +4657,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4655 } 4657 }
4656out_unlock: 4658out_unlock:
4657 nr = trans->blocks_used; 4659 nr = trans->blocks_used;
4658 btrfs_end_transaction_throttle(trans, root); 4660 btrfs_end_transaction(trans, root);
4659 btrfs_btree_balance_dirty(root, nr); 4661 btrfs_btree_balance_dirty(root, nr);
4660 if (drop_inode) { 4662 if (drop_inode) {
4661 inode_dec_link_count(inode); 4663 inode_dec_link_count(inode);
@@ -4723,7 +4725,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4723 } 4725 }
4724out_unlock: 4726out_unlock:
4725 nr = trans->blocks_used; 4727 nr = trans->blocks_used;
4726 btrfs_end_transaction_throttle(trans, root); 4728 btrfs_end_transaction(trans, root);
4727 if (drop_inode) { 4729 if (drop_inode) {
4728 inode_dec_link_count(inode); 4730 inode_dec_link_count(inode);
4729 iput(inode); 4731 iput(inode);
@@ -4782,7 +4784,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4782 } 4784 }
4783 4785
4784 nr = trans->blocks_used; 4786 nr = trans->blocks_used;
4785 btrfs_end_transaction_throttle(trans, root); 4787 btrfs_end_transaction(trans, root);
4786fail: 4788fail:
4787 if (drop_inode) { 4789 if (drop_inode) {
4788 inode_dec_link_count(inode); 4790 inode_dec_link_count(inode);
@@ -4848,7 +4850,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4848 4850
4849out_fail: 4851out_fail:
4850 nr = trans->blocks_used; 4852 nr = trans->blocks_used;
4851 btrfs_end_transaction_throttle(trans, root); 4853 btrfs_end_transaction(trans, root);
4852 if (drop_on_err) 4854 if (drop_on_err)
4853 iput(inode); 4855 iput(inode);
4854 btrfs_btree_balance_dirty(root, nr); 4856 btrfs_btree_balance_dirty(root, nr);
@@ -5121,7 +5123,7 @@ again:
5121 } 5123 }
5122 flush_dcache_page(page); 5124 flush_dcache_page(page);
5123 } else if (create && PageUptodate(page)) { 5125 } else if (create && PageUptodate(page)) {
5124 WARN_ON(1); 5126 BUG();
5125 if (!trans) { 5127 if (!trans) {
5126 kunmap(page); 5128 kunmap(page);
5127 free_extent_map(em); 5129 free_extent_map(em);
@@ -6402,10 +6404,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6402 u64 page_start; 6404 u64 page_start;
6403 u64 page_end; 6405 u64 page_end;
6404 6406
6405 /* Need this to keep space reservations serialized */
6406 mutex_lock(&inode->i_mutex);
6407 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6407 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6408 mutex_unlock(&inode->i_mutex);
6409 if (!ret) 6408 if (!ret)
6410 ret = btrfs_update_time(vma->vm_file); 6409 ret = btrfs_update_time(vma->vm_file);
6411 if (ret) { 6410 if (ret) {
@@ -6494,8 +6493,8 @@ out_unlock:
6494 if (!ret) 6493 if (!ret)
6495 return VM_FAULT_LOCKED; 6494 return VM_FAULT_LOCKED;
6496 unlock_page(page); 6495 unlock_page(page);
6497 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6498out: 6496out:
6497 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6499 return ret; 6498 return ret;
6500} 6499}
6501 6500
@@ -6668,7 +6667,7 @@ end_trans:
6668 err = ret; 6667 err = ret;
6669 6668
6670 nr = trans->blocks_used; 6669 nr = trans->blocks_used;
6671 ret = btrfs_end_transaction_throttle(trans, root); 6670 ret = btrfs_end_transaction(trans, root);
6672 btrfs_btree_balance_dirty(root, nr); 6671 btrfs_btree_balance_dirty(root, nr);
6673 } 6672 }
6674 6673
@@ -6749,6 +6748,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6749 extent_io_tree_init(&ei->io_tree, &inode->i_data); 6748 extent_io_tree_init(&ei->io_tree, &inode->i_data);
6750 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 6749 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
6751 mutex_init(&ei->log_mutex); 6750 mutex_init(&ei->log_mutex);
6751 mutex_init(&ei->delalloc_mutex);
6752 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6752 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
6753 INIT_LIST_HEAD(&ei->i_orphan); 6753 INIT_LIST_HEAD(&ei->i_orphan);
6754 INIT_LIST_HEAD(&ei->delalloc_inodes); 6754 INIT_LIST_HEAD(&ei->delalloc_inodes);
@@ -7074,7 +7074,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7074 btrfs_end_log_trans(root); 7074 btrfs_end_log_trans(root);
7075 } 7075 }
7076out_fail: 7076out_fail:
7077 btrfs_end_transaction_throttle(trans, root); 7077 btrfs_end_transaction(trans, root);
7078out_notrans: 7078out_notrans:
7079 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 7079 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
7080 up_read(&root->fs_info->subvol_sem); 7080 up_read(&root->fs_info->subvol_sem);
@@ -7246,7 +7246,7 @@ out_unlock:
7246 if (!err) 7246 if (!err)
7247 d_instantiate(dentry, inode); 7247 d_instantiate(dentry, inode);
7248 nr = trans->blocks_used; 7248 nr = trans->blocks_used;
7249 btrfs_end_transaction_throttle(trans, root); 7249 btrfs_end_transaction(trans, root);
7250 if (drop_inode) { 7250 if (drop_inode) {
7251 inode_dec_link_count(inode); 7251 inode_dec_link_count(inode);
7252 iput(inode); 7252 iput(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5441ff1480fd..2db7c1455c7f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
176 struct btrfs_trans_handle *trans; 176 struct btrfs_trans_handle *trans;
177 unsigned int flags, oldflags; 177 unsigned int flags, oldflags;
178 int ret; 178 int ret;
179 u64 ip_oldflags;
180 unsigned int i_oldflags;
179 181
180 if (btrfs_root_readonly(root)) 182 if (btrfs_root_readonly(root))
181 return -EROFS; 183 return -EROFS;
@@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
192 194
193 mutex_lock(&inode->i_mutex); 195 mutex_lock(&inode->i_mutex);
194 196
197 ip_oldflags = ip->flags;
198 i_oldflags = inode->i_flags;
199
195 flags = btrfs_mask_flags(inode->i_mode, flags); 200 flags = btrfs_mask_flags(inode->i_mode, flags);
196 oldflags = btrfs_flags_to_ioctl(ip->flags); 201 oldflags = btrfs_flags_to_ioctl(ip->flags);
197 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 202 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -249,19 +254,24 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
249 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 254 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
250 } 255 }
251 256
252 trans = btrfs_join_transaction(root); 257 trans = btrfs_start_transaction(root, 1);
253 BUG_ON(IS_ERR(trans)); 258 if (IS_ERR(trans)) {
259 ret = PTR_ERR(trans);
260 goto out_drop;
261 }
254 262
255 btrfs_update_iflags(inode); 263 btrfs_update_iflags(inode);
256 inode->i_ctime = CURRENT_TIME; 264 inode->i_ctime = CURRENT_TIME;
257 ret = btrfs_update_inode(trans, root, inode); 265 ret = btrfs_update_inode(trans, root, inode);
258 BUG_ON(ret);
259 266
260 btrfs_end_transaction(trans, root); 267 btrfs_end_transaction(trans, root);
268 out_drop:
269 if (ret) {
270 ip->flags = ip_oldflags;
271 inode->i_flags = i_oldflags;
272 }
261 273
262 mnt_drop_write_file(file); 274 mnt_drop_write_file(file);
263
264 ret = 0;
265 out_unlock: 275 out_unlock:
266 mutex_unlock(&inode->i_mutex); 276 mutex_unlock(&inode->i_mutex);
267 return ret; 277 return ret;
@@ -358,7 +368,7 @@ static noinline int create_subvol(struct btrfs_root *root,
358 return PTR_ERR(trans); 368 return PTR_ERR(trans);
359 369
360 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 370 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
361 0, objectid, NULL, 0, 0, 0); 371 0, objectid, NULL, 0, 0, 0, 0);
362 if (IS_ERR(leaf)) { 372 if (IS_ERR(leaf)) {
363 ret = PTR_ERR(leaf); 373 ret = PTR_ERR(leaf);
364 goto fail; 374 goto fail;
@@ -858,10 +868,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
858 return 0; 868 return 0;
859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 869 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
860 870
861 mutex_lock(&inode->i_mutex);
862 ret = btrfs_delalloc_reserve_space(inode, 871 ret = btrfs_delalloc_reserve_space(inode,
863 num_pages << PAGE_CACHE_SHIFT); 872 num_pages << PAGE_CACHE_SHIFT);
864 mutex_unlock(&inode->i_mutex);
865 if (ret) 873 if (ret)
866 return ret; 874 return ret;
867again: 875again:
@@ -1203,13 +1211,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1203 if (!capable(CAP_SYS_ADMIN)) 1211 if (!capable(CAP_SYS_ADMIN))
1204 return -EPERM; 1212 return -EPERM;
1205 1213
1214 mutex_lock(&root->fs_info->volume_mutex);
1215 if (root->fs_info->balance_ctl) {
1216 printk(KERN_INFO "btrfs: balance in progress\n");
1217 ret = -EINVAL;
1218 goto out;
1219 }
1220
1206 vol_args = memdup_user(arg, sizeof(*vol_args)); 1221 vol_args = memdup_user(arg, sizeof(*vol_args));
1207 if (IS_ERR(vol_args)) 1222 if (IS_ERR(vol_args)) {
1208 return PTR_ERR(vol_args); 1223 ret = PTR_ERR(vol_args);
1224 goto out;
1225 }
1209 1226
1210 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 1227 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1211 1228
1212 mutex_lock(&root->fs_info->volume_mutex);
1213 sizestr = vol_args->name; 1229 sizestr = vol_args->name;
1214 devstr = strchr(sizestr, ':'); 1230 devstr = strchr(sizestr, ':');
1215 if (devstr) { 1231 if (devstr) {
@@ -1226,7 +1242,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1226 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1242 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1227 (unsigned long long)devid); 1243 (unsigned long long)devid);
1228 ret = -EINVAL; 1244 ret = -EINVAL;
1229 goto out_unlock; 1245 goto out_free;
1230 } 1246 }
1231 if (!strcmp(sizestr, "max")) 1247 if (!strcmp(sizestr, "max"))
1232 new_size = device->bdev->bd_inode->i_size; 1248 new_size = device->bdev->bd_inode->i_size;
@@ -1241,7 +1257,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1241 new_size = memparse(sizestr, NULL); 1257 new_size = memparse(sizestr, NULL);
1242 if (new_size == 0) { 1258 if (new_size == 0) {
1243 ret = -EINVAL; 1259 ret = -EINVAL;
1244 goto out_unlock; 1260 goto out_free;
1245 } 1261 }
1246 } 1262 }
1247 1263
@@ -1250,7 +1266,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1250 if (mod < 0) { 1266 if (mod < 0) {
1251 if (new_size > old_size) { 1267 if (new_size > old_size) {
1252 ret = -EINVAL; 1268 ret = -EINVAL;
1253 goto out_unlock; 1269 goto out_free;
1254 } 1270 }
1255 new_size = old_size - new_size; 1271 new_size = old_size - new_size;
1256 } else if (mod > 0) { 1272 } else if (mod > 0) {
@@ -1259,11 +1275,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1259 1275
1260 if (new_size < 256 * 1024 * 1024) { 1276 if (new_size < 256 * 1024 * 1024) {
1261 ret = -EINVAL; 1277 ret = -EINVAL;
1262 goto out_unlock; 1278 goto out_free;
1263 } 1279 }
1264 if (new_size > device->bdev->bd_inode->i_size) { 1280 if (new_size > device->bdev->bd_inode->i_size) {
1265 ret = -EFBIG; 1281 ret = -EFBIG;
1266 goto out_unlock; 1282 goto out_free;
1267 } 1283 }
1268 1284
1269 do_div(new_size, root->sectorsize); 1285 do_div(new_size, root->sectorsize);
@@ -1276,7 +1292,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1276 trans = btrfs_start_transaction(root, 0); 1292 trans = btrfs_start_transaction(root, 0);
1277 if (IS_ERR(trans)) { 1293 if (IS_ERR(trans)) {
1278 ret = PTR_ERR(trans); 1294 ret = PTR_ERR(trans);
1279 goto out_unlock; 1295 goto out_free;
1280 } 1296 }
1281 ret = btrfs_grow_device(trans, device, new_size); 1297 ret = btrfs_grow_device(trans, device, new_size);
1282 btrfs_commit_transaction(trans, root); 1298 btrfs_commit_transaction(trans, root);
@@ -1284,9 +1300,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1284 ret = btrfs_shrink_device(device, new_size); 1300 ret = btrfs_shrink_device(device, new_size);
1285 } 1301 }
1286 1302
1287out_unlock: 1303out_free:
1288 mutex_unlock(&root->fs_info->volume_mutex);
1289 kfree(vol_args); 1304 kfree(vol_args);
1305out:
1306 mutex_unlock(&root->fs_info->volume_mutex);
1290 return ret; 1307 return ret;
1291} 1308}
1292 1309
@@ -2052,14 +2069,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2052 if (!capable(CAP_SYS_ADMIN)) 2069 if (!capable(CAP_SYS_ADMIN))
2053 return -EPERM; 2070 return -EPERM;
2054 2071
2072 mutex_lock(&root->fs_info->volume_mutex);
2073 if (root->fs_info->balance_ctl) {
2074 printk(KERN_INFO "btrfs: balance in progress\n");
2075 ret = -EINVAL;
2076 goto out;
2077 }
2078
2055 vol_args = memdup_user(arg, sizeof(*vol_args)); 2079 vol_args = memdup_user(arg, sizeof(*vol_args));
2056 if (IS_ERR(vol_args)) 2080 if (IS_ERR(vol_args)) {
2057 return PTR_ERR(vol_args); 2081 ret = PTR_ERR(vol_args);
2082 goto out;
2083 }
2058 2084
2059 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2085 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2060 ret = btrfs_init_new_device(root, vol_args->name); 2086 ret = btrfs_init_new_device(root, vol_args->name);
2061 2087
2062 kfree(vol_args); 2088 kfree(vol_args);
2089out:
2090 mutex_unlock(&root->fs_info->volume_mutex);
2063 return ret; 2091 return ret;
2064} 2092}
2065 2093
@@ -2074,14 +2102,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2074 if (root->fs_info->sb->s_flags & MS_RDONLY) 2102 if (root->fs_info->sb->s_flags & MS_RDONLY)
2075 return -EROFS; 2103 return -EROFS;
2076 2104
2105 mutex_lock(&root->fs_info->volume_mutex);
2106 if (root->fs_info->balance_ctl) {
2107 printk(KERN_INFO "btrfs: balance in progress\n");
2108 ret = -EINVAL;
2109 goto out;
2110 }
2111
2077 vol_args = memdup_user(arg, sizeof(*vol_args)); 2112 vol_args = memdup_user(arg, sizeof(*vol_args));
2078 if (IS_ERR(vol_args)) 2113 if (IS_ERR(vol_args)) {
2079 return PTR_ERR(vol_args); 2114 ret = PTR_ERR(vol_args);
2115 goto out;
2116 }
2080 2117
2081 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2118 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2082 ret = btrfs_rm_device(root, vol_args->name); 2119 ret = btrfs_rm_device(root, vol_args->name);
2083 2120
2084 kfree(vol_args); 2121 kfree(vol_args);
2122out:
2123 mutex_unlock(&root->fs_info->volume_mutex);
2085 return ret; 2124 return ret;
2086} 2125}
2087 2126
@@ -2427,7 +2466,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2427 disko, diskl, 0, 2466 disko, diskl, 0,
2428 root->root_key.objectid, 2467 root->root_key.objectid,
2429 btrfs_ino(inode), 2468 btrfs_ino(inode),
2430 new_key.offset - datao); 2469 new_key.offset - datao,
2470 0);
2431 BUG_ON(ret); 2471 BUG_ON(ret);
2432 } 2472 }
2433 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 2473 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -2977,7 +3017,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2977{ 3017{
2978 int ret = 0; 3018 int ret = 0;
2979 int size; 3019 int size;
2980 u64 extent_offset; 3020 u64 extent_item_pos;
2981 struct btrfs_ioctl_logical_ino_args *loi; 3021 struct btrfs_ioctl_logical_ino_args *loi;
2982 struct btrfs_data_container *inodes = NULL; 3022 struct btrfs_data_container *inodes = NULL;
2983 struct btrfs_path *path = NULL; 3023 struct btrfs_path *path = NULL;
@@ -3008,15 +3048,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3008 } 3048 }
3009 3049
3010 ret = extent_from_logical(root->fs_info, loi->logical, path, &key); 3050 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3051 btrfs_release_path(path);
3011 3052
3012 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) 3053 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3013 ret = -ENOENT; 3054 ret = -ENOENT;
3014 if (ret < 0) 3055 if (ret < 0)
3015 goto out; 3056 goto out;
3016 3057
3017 extent_offset = loi->logical - key.objectid; 3058 extent_item_pos = loi->logical - key.objectid;
3018 ret = iterate_extent_inodes(root->fs_info, path, key.objectid, 3059 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3019 extent_offset, build_ino_list, inodes); 3060 extent_item_pos, build_ino_list,
3061 inodes);
3020 3062
3021 if (ret < 0) 3063 if (ret < 0)
3022 goto out; 3064 goto out;
@@ -3034,6 +3076,163 @@ out:
3034 return ret; 3076 return ret;
3035} 3077}
3036 3078
3079void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
3080 struct btrfs_ioctl_balance_args *bargs)
3081{
3082 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3083
3084 bargs->flags = bctl->flags;
3085
3086 if (atomic_read(&fs_info->balance_running))
3087 bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
3088 if (atomic_read(&fs_info->balance_pause_req))
3089 bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
3090 if (atomic_read(&fs_info->balance_cancel_req))
3091 bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
3092
3093 memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
3094 memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
3095 memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
3096
3097 if (lock) {
3098 spin_lock(&fs_info->balance_lock);
3099 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3100 spin_unlock(&fs_info->balance_lock);
3101 } else {
3102 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3103 }
3104}
3105
3106static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
3107{
3108 struct btrfs_fs_info *fs_info = root->fs_info;
3109 struct btrfs_ioctl_balance_args *bargs;
3110 struct btrfs_balance_control *bctl;
3111 int ret;
3112
3113 if (!capable(CAP_SYS_ADMIN))
3114 return -EPERM;
3115
3116 if (fs_info->sb->s_flags & MS_RDONLY)
3117 return -EROFS;
3118
3119 mutex_lock(&fs_info->volume_mutex);
3120 mutex_lock(&fs_info->balance_mutex);
3121
3122 if (arg) {
3123 bargs = memdup_user(arg, sizeof(*bargs));
3124 if (IS_ERR(bargs)) {
3125 ret = PTR_ERR(bargs);
3126 goto out;
3127 }
3128
3129 if (bargs->flags & BTRFS_BALANCE_RESUME) {
3130 if (!fs_info->balance_ctl) {
3131 ret = -ENOTCONN;
3132 goto out_bargs;
3133 }
3134
3135 bctl = fs_info->balance_ctl;
3136 spin_lock(&fs_info->balance_lock);
3137 bctl->flags |= BTRFS_BALANCE_RESUME;
3138 spin_unlock(&fs_info->balance_lock);
3139
3140 goto do_balance;
3141 }
3142 } else {
3143 bargs = NULL;
3144 }
3145
3146 if (fs_info->balance_ctl) {
3147 ret = -EINPROGRESS;
3148 goto out_bargs;
3149 }
3150
3151 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3152 if (!bctl) {
3153 ret = -ENOMEM;
3154 goto out_bargs;
3155 }
3156
3157 bctl->fs_info = fs_info;
3158 if (arg) {
3159 memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
3160 memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
3161 memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
3162
3163 bctl->flags = bargs->flags;
3164 } else {
3165 /* balance everything - no filters */
3166 bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
3167 }
3168
3169do_balance:
3170 ret = btrfs_balance(bctl, bargs);
3171 /*
3172 * bctl is freed in __cancel_balance or in free_fs_info if
3173 * restriper was paused all the way until unmount
3174 */
3175 if (arg) {
3176 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3177 ret = -EFAULT;
3178 }
3179
3180out_bargs:
3181 kfree(bargs);
3182out:
3183 mutex_unlock(&fs_info->balance_mutex);
3184 mutex_unlock(&fs_info->volume_mutex);
3185 return ret;
3186}
3187
3188static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
3189{
3190 if (!capable(CAP_SYS_ADMIN))
3191 return -EPERM;
3192
3193 switch (cmd) {
3194 case BTRFS_BALANCE_CTL_PAUSE:
3195 return btrfs_pause_balance(root->fs_info);
3196 case BTRFS_BALANCE_CTL_CANCEL:
3197 return btrfs_cancel_balance(root->fs_info);
3198 }
3199
3200 return -EINVAL;
3201}
3202
3203static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
3204 void __user *arg)
3205{
3206 struct btrfs_fs_info *fs_info = root->fs_info;
3207 struct btrfs_ioctl_balance_args *bargs;
3208 int ret = 0;
3209
3210 if (!capable(CAP_SYS_ADMIN))
3211 return -EPERM;
3212
3213 mutex_lock(&fs_info->balance_mutex);
3214 if (!fs_info->balance_ctl) {
3215 ret = -ENOTCONN;
3216 goto out;
3217 }
3218
3219 bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
3220 if (!bargs) {
3221 ret = -ENOMEM;
3222 goto out;
3223 }
3224
3225 update_ioctl_balance_args(fs_info, 1, bargs);
3226
3227 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3228 ret = -EFAULT;
3229
3230 kfree(bargs);
3231out:
3232 mutex_unlock(&fs_info->balance_mutex);
3233 return ret;
3234}
3235
3037long btrfs_ioctl(struct file *file, unsigned int 3236long btrfs_ioctl(struct file *file, unsigned int
3038 cmd, unsigned long arg) 3237 cmd, unsigned long arg)
3039{ 3238{
@@ -3078,7 +3277,7 @@ long btrfs_ioctl(struct file *file, unsigned int
3078 case BTRFS_IOC_DEV_INFO: 3277 case BTRFS_IOC_DEV_INFO:
3079 return btrfs_ioctl_dev_info(root, argp); 3278 return btrfs_ioctl_dev_info(root, argp);
3080 case BTRFS_IOC_BALANCE: 3279 case BTRFS_IOC_BALANCE:
3081 return btrfs_balance(root->fs_info->dev_root); 3280 return btrfs_ioctl_balance(root, NULL);
3082 case BTRFS_IOC_CLONE: 3281 case BTRFS_IOC_CLONE:
3083 return btrfs_ioctl_clone(file, arg, 0, 0, 0); 3282 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
3084 case BTRFS_IOC_CLONE_RANGE: 3283 case BTRFS_IOC_CLONE_RANGE:
@@ -3110,6 +3309,12 @@ long btrfs_ioctl(struct file *file, unsigned int
3110 return btrfs_ioctl_scrub_cancel(root, argp); 3309 return btrfs_ioctl_scrub_cancel(root, argp);
3111 case BTRFS_IOC_SCRUB_PROGRESS: 3310 case BTRFS_IOC_SCRUB_PROGRESS:
3112 return btrfs_ioctl_scrub_progress(root, argp); 3311 return btrfs_ioctl_scrub_progress(root, argp);
3312 case BTRFS_IOC_BALANCE_V2:
3313 return btrfs_ioctl_balance(root, argp);
3314 case BTRFS_IOC_BALANCE_CTL:
3315 return btrfs_ioctl_balance_ctl(root, arg);
3316 case BTRFS_IOC_BALANCE_PROGRESS:
3317 return btrfs_ioctl_balance_progress(root, argp);
3113 } 3318 }
3114 3319
3115 return -ENOTTY; 3320 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 252ae9915de8..4f69028a68c4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args {
109 __u64 reserved[124]; /* pad to 1k */ 109 __u64 reserved[124]; /* pad to 1k */
110}; 110};
111 111
112/* balance control ioctl modes */
113#define BTRFS_BALANCE_CTL_PAUSE 1
114#define BTRFS_BALANCE_CTL_CANCEL 2
115
116/*
117 * this is packed, because it should be exactly the same as its disk
118 * byte order counterpart (struct btrfs_disk_balance_args)
119 */
120struct btrfs_balance_args {
121 __u64 profiles;
122 __u64 usage;
123 __u64 devid;
124 __u64 pstart;
125 __u64 pend;
126 __u64 vstart;
127 __u64 vend;
128
129 __u64 target;
130
131 __u64 flags;
132
133 __u64 unused[8];
134} __attribute__ ((__packed__));
135
136/* report balance progress to userspace */
137struct btrfs_balance_progress {
138 __u64 expected; /* estimated # of chunks that will be
139 * relocated to fulfill the request */
140 __u64 considered; /* # of chunks we have considered so far */
141 __u64 completed; /* # of chunks relocated so far */
142};
143
144#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
145#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
146#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
147
148struct btrfs_ioctl_balance_args {
149 __u64 flags; /* in/out */
150 __u64 state; /* out */
151
152 struct btrfs_balance_args data; /* in/out */
153 struct btrfs_balance_args meta; /* in/out */
154 struct btrfs_balance_args sys; /* in/out */
155
156 struct btrfs_balance_progress stat; /* out */
157
158 __u64 unused[72]; /* pad to 1k */
159};
160
112#define BTRFS_INO_LOOKUP_PATH_MAX 4080 161#define BTRFS_INO_LOOKUP_PATH_MAX 4080
113struct btrfs_ioctl_ino_lookup_args { 162struct btrfs_ioctl_ino_lookup_args {
114 __u64 treeid; 163 __u64 treeid;
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args {
272 struct btrfs_ioctl_dev_info_args) 321 struct btrfs_ioctl_dev_info_args)
273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 322#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
274 struct btrfs_ioctl_fs_info_args) 323 struct btrfs_ioctl_fs_info_args)
324#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
325 struct btrfs_ioctl_balance_args)
326#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
327#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
328 struct btrfs_ioctl_balance_args)
275#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ 329#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
276 struct btrfs_ioctl_ino_path_args) 330 struct btrfs_ioctl_ino_path_args)
277#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ 331#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
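Taken together, BTRFS_IOC_BALANCE_V2, BTRFS_IOC_BALANCE_CTL and BTRFS_IOC_BALANCE_PROGRESS give userspace start/resume, pause/cancel and query control over the restriper. A hedged sketch of the pause-and-query side, assuming the definitions above are available as ioctl.h and the filesystem is mounted at /mnt/btrfs (error handling abbreviated):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "ioctl.h"	/* the btrfs ioctl definitions above */

int main(void)
{
	struct btrfs_ioctl_balance_args args;
	int fd = open("/mnt/btrfs", O_RDONLY);

	if (fd < 0)
		return 1;
	/* ask the restriper to pause; it acknowledges asynchronously */
	if (ioctl(fd, BTRFS_IOC_BALANCE_CTL, BTRFS_BALANCE_CTL_PAUSE) < 0)
		perror("BTRFS_IOC_BALANCE_CTL");
	/* -ENOTCONN here simply means no balance is in flight */
	if (ioctl(fd, BTRFS_IOC_BALANCE_PROGRESS, &args) == 0)
		printf("considered %llu, completed %llu of ~%llu chunks\n",
		       (unsigned long long)args.stat.considered,
		       (unsigned long long)args.stat.completed,
		       (unsigned long long)args.stat.expected);
	close(fd);
	return 0;
}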
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d77b67c4b275..5e178d8f7167 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
33 */ 33 */
34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) 34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
35{ 35{
36 if (eb->lock_nested) {
37 read_lock(&eb->lock);
38 if (eb->lock_nested && current->pid == eb->lock_owner) {
39 read_unlock(&eb->lock);
40 return;
41 }
42 read_unlock(&eb->lock);
43 }
36 if (rw == BTRFS_WRITE_LOCK) { 44 if (rw == BTRFS_WRITE_LOCK) {
37 if (atomic_read(&eb->blocking_writers) == 0) { 45 if (atomic_read(&eb->blocking_writers) == 0) {
38 WARN_ON(atomic_read(&eb->spinning_writers) != 1); 46 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
57 */ 65 */
58void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) 66void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
59{ 67{
68 if (eb->lock_nested) {
69 read_lock(&eb->lock);
 70 if (eb->lock_nested && current->pid == eb->lock_owner) {
71 read_unlock(&eb->lock);
72 return;
73 }
74 read_unlock(&eb->lock);
75 }
60 if (rw == BTRFS_WRITE_LOCK_BLOCKING) { 76 if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
61 BUG_ON(atomic_read(&eb->blocking_writers) != 1); 77 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
62 write_lock(&eb->lock); 78 write_lock(&eb->lock);
@@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
81void btrfs_tree_read_lock(struct extent_buffer *eb) 97void btrfs_tree_read_lock(struct extent_buffer *eb)
82{ 98{
83again: 99again:
100 read_lock(&eb->lock);
101 if (atomic_read(&eb->blocking_writers) &&
102 current->pid == eb->lock_owner) {
103 /*
104 * This extent is already write-locked by our thread. We allow
105 * an additional read lock to be added because it's for the same
106 * thread. btrfs_find_all_roots() depends on this as it may be
107 * called on a partly (write-)locked tree.
108 */
109 BUG_ON(eb->lock_nested);
110 eb->lock_nested = 1;
111 read_unlock(&eb->lock);
112 return;
113 }
114 read_unlock(&eb->lock);
84 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); 115 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
85 read_lock(&eb->lock); 116 read_lock(&eb->lock);
86 if (atomic_read(&eb->blocking_writers)) { 117 if (atomic_read(&eb->blocking_writers)) {
87 read_unlock(&eb->lock); 118 read_unlock(&eb->lock);
88 wait_event(eb->write_lock_wq,
89 atomic_read(&eb->blocking_writers) == 0);
90 goto again; 119 goto again;
91 } 120 }
92 atomic_inc(&eb->read_locks); 121 atomic_inc(&eb->read_locks);
@@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
129 } 158 }
130 atomic_inc(&eb->write_locks); 159 atomic_inc(&eb->write_locks);
131 atomic_inc(&eb->spinning_writers); 160 atomic_inc(&eb->spinning_writers);
161 eb->lock_owner = current->pid;
132 return 1; 162 return 1;
133} 163}
134 164
@@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
137 */ 167 */
138void btrfs_tree_read_unlock(struct extent_buffer *eb) 168void btrfs_tree_read_unlock(struct extent_buffer *eb)
139{ 169{
170 if (eb->lock_nested) {
171 read_lock(&eb->lock);
172 if (eb->lock_nested && current->pid == eb->lock_owner) {
173 eb->lock_nested = 0;
174 read_unlock(&eb->lock);
175 return;
176 }
177 read_unlock(&eb->lock);
178 }
140 btrfs_assert_tree_read_locked(eb); 179 btrfs_assert_tree_read_locked(eb);
141 WARN_ON(atomic_read(&eb->spinning_readers) == 0); 180 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
142 atomic_dec(&eb->spinning_readers); 181 atomic_dec(&eb->spinning_readers);
@@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
149 */ 188 */
150void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) 189void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
151{ 190{
191 if (eb->lock_nested) {
192 read_lock(&eb->lock);
193 if (eb->lock_nested && current->pid == eb->lock_owner) {
194 eb->lock_nested = 0;
195 read_unlock(&eb->lock);
196 return;
197 }
198 read_unlock(&eb->lock);
199 }
152 btrfs_assert_tree_read_locked(eb); 200 btrfs_assert_tree_read_locked(eb);
153 WARN_ON(atomic_read(&eb->blocking_readers) == 0); 201 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
154 if (atomic_dec_and_test(&eb->blocking_readers)) 202 if (atomic_dec_and_test(&eb->blocking_readers))
@@ -181,6 +229,7 @@ again:
181 WARN_ON(atomic_read(&eb->spinning_writers)); 229 WARN_ON(atomic_read(&eb->spinning_writers));
182 atomic_inc(&eb->spinning_writers); 230 atomic_inc(&eb->spinning_writers);
183 atomic_inc(&eb->write_locks); 231 atomic_inc(&eb->write_locks);
232 eb->lock_owner = current->pid;
184 return 0; 233 return 0;
185} 234}
186 235
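The effect of the lock_nested plumbing above is easiest to see schematically. The sketch below is not code from this patch, only the single-thread pattern it makes legal (one level of nesting, enforced by the BUG_ON(eb->lock_nested) before the flag is set):

/*
 * Schematic: a thread that already write-holds eb in blocking mode
 * may take one nested read lock; the matching unlock sees
 * lock_nested set for its own pid and returns without touching the
 * reader counts.
 */
static void nested_read_under_write(struct extent_buffer *eb)
{
	btrfs_tree_lock(eb);			/* records lock_owner */
	btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);

	btrfs_tree_read_lock(eb);		/* same pid: lock_nested = 1 */
	/* read-only walk, e.g. from btrfs_find_all_roots() */
	btrfs_tree_read_unlock(eb);		/* just clears lock_nested */

	btrfs_tree_unlock(eb);
}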
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cfb55434a469..8c1aae2c845d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
1604 ret = btrfs_inc_extent_ref(trans, root, new_bytenr, 1604 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1605 num_bytes, parent, 1605 num_bytes, parent,
1606 btrfs_header_owner(leaf), 1606 btrfs_header_owner(leaf),
1607 key.objectid, key.offset); 1607 key.objectid, key.offset, 1);
1608 BUG_ON(ret); 1608 BUG_ON(ret);
1609 1609
1610 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1610 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1611 parent, btrfs_header_owner(leaf), 1611 parent, btrfs_header_owner(leaf),
1612 key.objectid, key.offset); 1612 key.objectid, key.offset, 1);
1613 BUG_ON(ret); 1613 BUG_ON(ret);
1614 } 1614 }
1615 if (dirty) 1615 if (dirty)
@@ -1778,21 +1778,23 @@ again:
1778 1778
1779 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, 1779 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1780 path->nodes[level]->start, 1780 path->nodes[level]->start,
1781 src->root_key.objectid, level - 1, 0); 1781 src->root_key.objectid, level - 1, 0,
1782 1);
1782 BUG_ON(ret); 1783 BUG_ON(ret);
1783 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 1784 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1784 0, dest->root_key.objectid, level - 1, 1785 0, dest->root_key.objectid, level - 1,
1785 0); 1786 0, 1);
1786 BUG_ON(ret); 1787 BUG_ON(ret);
1787 1788
1788 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, 1789 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1789 path->nodes[level]->start, 1790 path->nodes[level]->start,
1790 src->root_key.objectid, level - 1, 0); 1791 src->root_key.objectid, level - 1, 0,
1792 1);
1791 BUG_ON(ret); 1793 BUG_ON(ret);
1792 1794
1793 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 1795 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1794 0, dest->root_key.objectid, level - 1, 1796 0, dest->root_key.objectid, level - 1,
1795 0); 1797 0, 1);
1796 BUG_ON(ret); 1798 BUG_ON(ret);
1797 1799
1798 btrfs_unlock_up_safe(path, 0); 1800 btrfs_unlock_up_safe(path, 0);
@@ -2244,7 +2246,7 @@ again:
2244 } else { 2246 } else {
2245 list_del_init(&reloc_root->root_list); 2247 list_del_init(&reloc_root->root_list);
2246 } 2248 }
2247 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); 2249 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2248 } 2250 }
2249 2251
2250 if (found) { 2252 if (found) {
@@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2558 node->eb->start, blocksize, 2560 node->eb->start, blocksize,
2559 upper->eb->start, 2561 upper->eb->start,
2560 btrfs_header_owner(upper->eb), 2562 btrfs_header_owner(upper->eb),
2561 node->level, 0); 2563 node->level, 0, 1);
2562 BUG_ON(ret); 2564 BUG_ON(ret);
2563 2565
2564 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2566 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2947,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2949 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2950 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2949 while (index <= last_index) { 2951 while (index <= last_index) {
2950 mutex_lock(&inode->i_mutex);
2951 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); 2952 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2952 mutex_unlock(&inode->i_mutex);
2953 if (ret) 2953 if (ret)
2954 goto out; 2954 goto out;
2955 2955
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ddf2c90d3fc0..9770cc5bfb76 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "backref.h" 26#include "backref.h"
27#include "extent_io.h" 27#include "extent_io.h"
28#include "check-integrity.h"
28 29
29/* 30/*
30 * This is only the first step towards a full-features scrub. It reads all 31 * This is only the first step towards a full-features scrub. It reads all
@@ -309,7 +310,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
309 u8 ref_level; 310 u8 ref_level;
310 unsigned long ptr = 0; 311 unsigned long ptr = 0;
311 const int bufsize = 4096; 312 const int bufsize = 4096;
312 u64 extent_offset; 313 u64 extent_item_pos;
313 314
314 path = btrfs_alloc_path(); 315 path = btrfs_alloc_path();
315 316
@@ -329,12 +330,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
329 if (ret < 0) 330 if (ret < 0)
330 goto out; 331 goto out;
331 332
332 extent_offset = swarn.logical - found_key.objectid; 333 extent_item_pos = swarn.logical - found_key.objectid;
333 swarn.extent_item_size = found_key.offset; 334 swarn.extent_item_size = found_key.offset;
334 335
335 eb = path->nodes[0]; 336 eb = path->nodes[0];
336 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 337 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
337 item_size = btrfs_item_size_nr(eb, path->slots[0]); 338 item_size = btrfs_item_size_nr(eb, path->slots[0]);
339 btrfs_release_path(path);
338 340
339 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 341 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
340 do { 342 do {
@@ -351,7 +353,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
351 } else { 353 } else {
352 swarn.path = path; 354 swarn.path = path;
353 iterate_extent_inodes(fs_info, path, found_key.objectid, 355 iterate_extent_inodes(fs_info, path, found_key.objectid,
354 extent_offset, 356 extent_item_pos,
355 scrub_print_warning_inode, &swarn); 357 scrub_print_warning_inode, &swarn);
356 } 358 }
357 359
@@ -732,7 +734,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
732 bio_add_page(bio, page, PAGE_SIZE, 0); 734 bio_add_page(bio, page, PAGE_SIZE, 0);
733 bio->bi_end_io = scrub_fixup_end_io; 735 bio->bi_end_io = scrub_fixup_end_io;
734 bio->bi_private = &complete; 736 bio->bi_private = &complete;
735 submit_bio(rw, bio); 737 btrfsic_submit_bio(rw, bio);
736 738
737 /* this will also unplug the queue */ 739 /* this will also unplug the queue */
738 wait_for_completion(&complete); 740 wait_for_completion(&complete);
@@ -958,7 +960,7 @@ static int scrub_submit(struct scrub_dev *sdev)
958 sdev->curr = -1; 960 sdev->curr = -1;
959 atomic_inc(&sdev->in_flight); 961 atomic_inc(&sdev->in_flight);
960 962
961 submit_bio(READ, sbio->bio); 963 btrfsic_submit_bio(READ, sbio->bio);
962 964
963 return 0; 965 return 0;
964} 966}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ae488aa1966a..8ffaaa8e3df8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -163,8 +163,11 @@ enum {
163 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 163 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, 167 Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
168 Opt_check_integrity, Opt_check_integrity_including_extent_data,
169 Opt_check_integrity_print_mask,
170 Opt_err,
168}; 171};
169 172
170static match_table_t tokens = { 173static match_table_t tokens = {
@@ -199,6 +202,10 @@ static match_table_t tokens = {
199 {Opt_inode_cache, "inode_cache"}, 202 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "nospace_cache"}, 203 {Opt_no_space_cache, "nospace_cache"},
201 {Opt_recovery, "recovery"}, 204 {Opt_recovery, "recovery"},
205 {Opt_skip_balance, "skip_balance"},
206 {Opt_check_integrity, "check_int"},
207 {Opt_check_integrity_including_extent_data, "check_int_data"},
208 {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
202 {Opt_err, NULL}, 209 {Opt_err, NULL},
203}; 210};
204 211
@@ -397,6 +404,40 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
397 printk(KERN_INFO "btrfs: enabling auto recovery"); 404 printk(KERN_INFO "btrfs: enabling auto recovery");
398 btrfs_set_opt(info->mount_opt, RECOVERY); 405 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break; 406 break;
407 case Opt_skip_balance:
408 btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
409 break;
410#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
411 case Opt_check_integrity_including_extent_data:
412 printk(KERN_INFO "btrfs: enabling check integrity"
413 " including extent data\n");
414 btrfs_set_opt(info->mount_opt,
415 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
416 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
417 break;
418 case Opt_check_integrity:
419 printk(KERN_INFO "btrfs: enabling check integrity\n");
420 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
421 break;
422 case Opt_check_integrity_print_mask:
423 intarg = 0;
424 match_int(&args[0], &intarg);
425 if (intarg) {
426 info->check_integrity_print_mask = intarg;
427 printk(KERN_INFO "btrfs:"
428 " check_integrity_print_mask 0x%x\n",
429 info->check_integrity_print_mask);
430 }
431 break;
432#else
433 case Opt_check_integrity_including_extent_data:
434 case Opt_check_integrity:
435 case Opt_check_integrity_print_mask:
436 printk(KERN_ERR "btrfs: support for check_integrity*"
437 " not compiled in!\n");
438 ret = -EINVAL;
439 goto out;
440#endif
400 case Opt_err: 441 case Opt_err:
401 printk(KERN_INFO "btrfs: unrecognized mount option " 442 printk(KERN_INFO "btrfs: unrecognized mount option "
402 "'%s'\n", p); 443 "'%s'\n", p);
@@ -722,6 +763,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
722 seq_puts(seq, ",autodefrag"); 763 seq_puts(seq, ",autodefrag");
723 if (btrfs_test_opt(root, INODE_MAP_CACHE)) 764 if (btrfs_test_opt(root, INODE_MAP_CACHE))
724 seq_puts(seq, ",inode_cache"); 765 seq_puts(seq, ",inode_cache");
766 if (btrfs_test_opt(root, SKIP_BALANCE))
767 seq_puts(seq, ",skip_balance");
725 return 0; 768 return 0;
726} 769}
727 770
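For reference, the new mount options can be driven straight from mount(2); a sketch with /dev/sdb1 and /mnt/btrfs as placeholders (check_int additionally requires a kernel built with CONFIG_BTRFS_FS_CHECK_INTEGRITY, per the #else branch above):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* skip_balance: don't auto-resume an interrupted balance at mount;
	 * check_int: route block writes through the integrity checker */
	if (mount("/dev/sdb1", "/mnt/btrfs", "btrfs", 0,
		  "skip_balance,check_int") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}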
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 81376d94cd3c..287a6728b1ad 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
36 WARN_ON(atomic_read(&transaction->use_count) == 0); 36 WARN_ON(atomic_read(&transaction->use_count) == 0);
37 if (atomic_dec_and_test(&transaction->use_count)) { 37 if (atomic_dec_and_test(&transaction->use_count)) {
38 BUG_ON(!list_empty(&transaction->list)); 38 BUG_ON(!list_empty(&transaction->list));
39 WARN_ON(transaction->delayed_refs.root.rb_node);
40 WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
39 memset(transaction, 0, sizeof(*transaction)); 41 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction); 42 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 } 43 }
@@ -108,8 +110,11 @@ loop:
108 cur_trans->delayed_refs.num_heads = 0; 110 cur_trans->delayed_refs.num_heads = 0;
109 cur_trans->delayed_refs.flushing = 0; 111 cur_trans->delayed_refs.flushing = 0;
110 cur_trans->delayed_refs.run_delayed_start = 0; 112 cur_trans->delayed_refs.run_delayed_start = 0;
113 cur_trans->delayed_refs.seq = 1;
114 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
111 spin_lock_init(&cur_trans->commit_lock); 115 spin_lock_init(&cur_trans->commit_lock);
112 spin_lock_init(&cur_trans->delayed_refs.lock); 116 spin_lock_init(&cur_trans->delayed_refs.lock);
117 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
113 118
114 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 119 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
115 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 120 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
@@ -321,6 +326,8 @@ again:
321 } 326 }
322 327
323 if (num_bytes) { 328 if (num_bytes) {
329 trace_btrfs_space_reservation(root->fs_info, "transaction",
330 (u64)h, num_bytes, 1);
324 h->block_rsv = &root->fs_info->trans_block_rsv; 331 h->block_rsv = &root->fs_info->trans_block_rsv;
325 h->bytes_reserved = num_bytes; 332 h->bytes_reserved = num_bytes;
326 } 333 }
@@ -467,19 +474,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
467 474
468 btrfs_trans_release_metadata(trans, root); 475 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL; 476 trans->block_rsv = NULL;
470 while (count < 4) { 477 while (count < 2) {
471 unsigned long cur = trans->delayed_ref_updates; 478 unsigned long cur = trans->delayed_ref_updates;
472 trans->delayed_ref_updates = 0; 479 trans->delayed_ref_updates = 0;
473 if (cur && 480 if (cur &&
474 trans->transaction->delayed_refs.num_heads_ready > 64) { 481 trans->transaction->delayed_refs.num_heads_ready > 64) {
475 trans->delayed_ref_updates = 0; 482 trans->delayed_ref_updates = 0;
476
477 /*
478 * do a full flush if the transaction is trying
479 * to close
480 */
481 if (trans->transaction->delayed_refs.flushing)
482 cur = 0;
483 btrfs_run_delayed_refs(trans, root, cur); 483 btrfs_run_delayed_refs(trans, root, cur);
484 } else { 484 } else {
485 break; 485 break;
@@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1393 1393
1394 if (btrfs_header_backref_rev(root->node) < 1394 if (btrfs_header_backref_rev(root->node) <
1395 BTRFS_MIXED_BACKREF_REV) 1395 BTRFS_MIXED_BACKREF_REV)
1396 btrfs_drop_snapshot(root, NULL, 0); 1396 btrfs_drop_snapshot(root, NULL, 0, 0);
1397 else 1397 else
1398 btrfs_drop_snapshot(root, NULL, 1); 1398 btrfs_drop_snapshot(root, NULL, 1, 0);
1399 } 1399 }
1400 return 0; 1400 return 0;
1401} 1401}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3568374d419d..cb877e0886a7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
589 ret = btrfs_inc_extent_ref(trans, root, 589 ret = btrfs_inc_extent_ref(trans, root,
590 ins.objectid, ins.offset, 590 ins.objectid, ins.offset,
591 0, root->root_key.objectid, 591 0, root->root_key.objectid,
592 key->objectid, offset); 592 key->objectid, offset, 0);
593 BUG_ON(ret); 593 BUG_ON(ret);
594 } else { 594 } else {
595 /* 595 /*
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644
index 000000000000..12f5147bd2b1
--- /dev/null
+++ b/fs/btrfs/ulist.c
@@ -0,0 +1,220 @@
1/*
2 * Copyright (C) 2011 STRATO AG
3 * written by Arne Jansen <sensille@gmx.net>
4 * Distributed under the GNU GPL license version 2.
5 */
6
7#include <linux/slab.h>
8#include <linux/module.h>
9#include "ulist.h"
10
11/*
12 * ulist is a generic data structure to hold a collection of unique u64
 13 * values. The only operations it supports are adding to the list and
14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key.
16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 *
21 * A sample usage for ulists is the enumeration of directed graphs without
22 * visiting a node twice. The pseudo-code could look like this:
23 *
24 * ulist = ulist_alloc();
25 * ulist_add(ulist, root);
26 * elem = NULL;
27 *
 28 * while ((elem = ulist_next(ulist, elem))) {
29 * for (all child nodes n in elem)
30 * ulist_add(ulist, n);
31 * do something useful with the node;
32 * }
33 * ulist_free(ulist);
34 *
 35 * This assumes the graph nodes are addressable by u64. This stems from the
36 * usage for tree enumeration in btrfs, where the logical addresses are
37 * 64 bit.
38 *
39 * It is also useful for tree enumeration which could be done elegantly
40 * recursively, but is not possible due to kernel stack limitations. The
41 * loop would be similar to the above.
42 */
43
44/**
45 * ulist_init - freshly initialize a ulist
46 * @ulist: the ulist to initialize
47 *
 48 * Note: don't use this function to init an already used ulist; use
 49 * ulist_reinit instead.
50 */
51void ulist_init(struct ulist *ulist)
52{
53 ulist->nnodes = 0;
54 ulist->nodes = ulist->int_nodes;
55 ulist->nodes_alloced = ULIST_SIZE;
56}
57EXPORT_SYMBOL(ulist_init);
58
59/**
60 * ulist_fini - free up additionally allocated memory for the ulist
61 * @ulist: the ulist from which to free the additional memory
62 *
63 * This is useful in cases where the base 'struct ulist' has been statically
64 * allocated.
65 */
66void ulist_fini(struct ulist *ulist)
67{
68 /*
69 * The first ULIST_SIZE elements are stored inline in struct ulist.
 70 * Only if more elements were allocated do they need to be freed.
71 */
72 if (ulist->nodes_alloced > ULIST_SIZE)
73 kfree(ulist->nodes);
74 ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */
75}
76EXPORT_SYMBOL(ulist_fini);
77
78/**
79 * ulist_reinit - prepare a ulist for reuse
80 * @ulist: ulist to be reused
81 *
82 * Free up all additional memory allocated for the list elements and reinit
83 * the ulist.
84 */
85void ulist_reinit(struct ulist *ulist)
86{
87 ulist_fini(ulist);
88 ulist_init(ulist);
89}
90EXPORT_SYMBOL(ulist_reinit);
91
92/**
93 * ulist_alloc - dynamically allocate a ulist
94 * @gfp_mask: allocation flags to use for the base allocation
95 *
96 * The allocated ulist will be returned in an initialized state.
97 */
98struct ulist *ulist_alloc(unsigned long gfp_mask)
99{
100 struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
101
102 if (!ulist)
103 return NULL;
104
105 ulist_init(ulist);
106
107 return ulist;
108}
109EXPORT_SYMBOL(ulist_alloc);
110
111/**
112 * ulist_free - free dynamically allocated ulist
113 * @ulist: ulist to free
114 *
115 * It is not necessary to call ulist_fini before.
116 */
117void ulist_free(struct ulist *ulist)
118{
119 if (!ulist)
120 return;
121 ulist_fini(ulist);
122 kfree(ulist);
123}
124EXPORT_SYMBOL(ulist_free);
125
126/**
127 * ulist_add - add an element to the ulist
128 * @ulist: ulist to add the element to
129 * @val: value to add to ulist
130 * @aux: auxiliary value to store along with val
131 * @gfp_mask: flags to use for allocation
132 *
133 * Note: locking must be provided by the caller. In case of rwlocks, write
134 * locking is needed.
135 *
136 * Add an element to a ulist. The @val will only be added if it doesn't
137 * already exist. If it is added, the auxiliary value @aux is stored along with
138 * it. In case @val already exists in the ulist, @aux is ignored, even if
139 * it differs from the already stored value.
140 *
141 * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
142 * inserted.
143 * In case of allocation failure -ENOMEM is returned and the ulist stays
144 * unaltered.
145 */
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
147 unsigned long gfp_mask)
148{
149 int i;
150
151 for (i = 0; i < ulist->nnodes; ++i) {
152 if (ulist->nodes[i].val == val)
153 return 0;
154 }
155
156 if (ulist->nnodes >= ulist->nodes_alloced) {
157 u64 new_alloced = ulist->nodes_alloced + 128;
158 struct ulist_node *new_nodes;
159 void *old = NULL;
160
161 /*
162 * if nodes_alloced == ULIST_SIZE no memory has been allocated
163 * yet, so pass NULL to krealloc
164 */
165 if (ulist->nodes_alloced > ULIST_SIZE)
166 old = ulist->nodes;
167
168 new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced,
169 gfp_mask);
170 if (!new_nodes)
171 return -ENOMEM;
172
173 if (!old)
174 memcpy(new_nodes, ulist->int_nodes,
175 sizeof(ulist->int_nodes));
176
177 ulist->nodes = new_nodes;
178 ulist->nodes_alloced = new_alloced;
179 }
180 ulist->nodes[ulist->nnodes].val = val;
181 ulist->nodes[ulist->nnodes].aux = aux;
182 ++ulist->nnodes;
183
184 return 1;
185}
186EXPORT_SYMBOL(ulist_add);
187
188/**
189 * ulist_next - iterate ulist
190 * @ulist: ulist to iterate
191 * @prev: previously returned element or %NULL to start iteration
192 *
193 * Note: locking must be provided by the caller. In case of rwlocks, only read
194 * locking is needed.
195 *
196 * This function is used to iterate a ulist. The iteration is started with
197 * @prev = %NULL. It returns the next element from the ulist or %NULL when the
198 * end is reached. No guarantee is made with respect to the order in which
199 * the elements are returned. They might neither be returned in order of
200 * addition nor in ascending order.
201 * It is allowed to call ulist_add during an enumeration. Newly added items
202 * are guaranteed to show up in the running enumeration.
203 */
204struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
205{
206 int next;
207
208 if (ulist->nnodes == 0)
209 return NULL;
210
211 if (!prev)
212 return &ulist->nodes[0];
213
214 next = (prev - ulist->nodes) + 1;
215 if (next < 0 || next >= ulist->nnodes)
216 return NULL;
217
218 return &ulist->nodes[next];
219}
220EXPORT_SYMBOL(ulist_next);
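
For reference, a minimal sketch of how the ulist API above can be driven for the graph-enumeration pattern described in the header comment. This assumes kernel context; get_first_child() and get_next_child() are hypothetical graph accessors, not part of this patch:

#include <linux/slab.h>
#include "ulist.h"

/* Visit every node of a u64-addressable graph exactly once.
 * get_first_child()/get_next_child() (hypothetical) yield the u64
 * address of a child node, or 0 when there are no more children.
 */
static int visit_all_nodes(u64 root_addr)
{
	struct ulist *seen;
	struct ulist_node *elem = NULL;
	int ret;

	seen = ulist_alloc(GFP_NOFS);
	if (!seen)
		return -ENOMEM;

	ret = ulist_add(seen, root_addr, 0, GFP_NOFS);
	if (ret < 0)
		goto out;

	/* items added during the enumeration are guaranteed to show up */
	while ((elem = ulist_next(seen, elem))) {
		u64 child = get_first_child(elem->val);

		while (child) {
			/* returns 0 if already present, 1 if inserted */
			ret = ulist_add(seen, child, 0, GFP_NOFS);
			if (ret < 0)
				goto out;
			child = get_next_child(elem->val, child);
		}
		/* do something useful with elem->val here */
	}
	ret = 0;
out:
	ulist_free(seen);
	return ret;
}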
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644
index 000000000000..2e25dec58ec0
--- /dev/null
+++ b/fs/btrfs/ulist.h
@@ -0,0 +1,68 @@
1/*
2 * Copyright (C) 2011 STRATO AG
3 * written by Arne Jansen <sensille@gmx.net>
4 * Distributed under the GNU GPL license version 2.
5 *
6 */
7
8#ifndef __ULIST__
9#define __ULIST__
10
11/*
12 * ulist is a generic data structure to hold a collection of unique u64
13 * values. The only operations it supports are adding to the list and
14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key.
16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 */
21
22/*
23 * number of elements statically allocated inside struct ulist
24 */
25#define ULIST_SIZE 16
26
27/*
28 * element of the list
29 */
30struct ulist_node {
31 u64 val; /* value to store */
32 unsigned long aux; /* auxiliary value saved along with the val */
33};
34
35struct ulist {
36 /*
37 * number of elements stored in list
38 */
39 unsigned long nnodes;
40
41 /*
42 * number of nodes we already have room for
43 */
44 unsigned long nodes_alloced;
45
46 /*
47 * pointer to the array storing the elements. The first ULIST_SIZE
48 * elements are stored inline. In this case it points to int_nodes.
49 * After exceeding ULIST_SIZE, dynamic memory is allocated.
50 */
51 struct ulist_node *nodes;
52
53 /*
54 * inline storage space for the first ULIST_SIZE entries
55 */
56 struct ulist_node int_nodes[ULIST_SIZE];
57};
58
59void ulist_init(struct ulist *ulist);
60void ulist_fini(struct ulist *ulist);
61void ulist_reinit(struct ulist *ulist);
62struct ulist *ulist_alloc(unsigned long gfp_mask);
63void ulist_free(struct ulist *ulist);
64int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
65 unsigned long gfp_mask);
66struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
67
68#endif
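
A second sketch (again assuming kernel context; the values are arbitrary and error handling is elided) illustrating the deduplication, growth, and reuse semantics of a statically allocated ulist:

#include <linux/slab.h>
#include "ulist.h"

static void ulist_semantics_demo(void)
{
	struct ulist ul;
	u64 i;
	int ret;

	ulist_init(&ul);	/* backed by the inline int_nodes[] array */

	ret = ulist_add(&ul, 42, 0, GFP_NOFS);	/* ret == 1: inserted */
	ret = ulist_add(&ul, 42, 7, GFP_NOFS);	/* ret == 0: duplicate, aux=7 ignored */

	/* adding more than ULIST_SIZE (16) distinct values transparently
	 * moves the backing store to a krealloc'ed array */
	for (i = 0; i < 32; i++)
		ulist_add(&ul, i, 0, GFP_NOFS);

	ulist_reinit(&ul);	/* free dynamic memory; list is empty and reusable */
	ulist_fini(&ul);	/* final teardown (safe even right after reinit) */
}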
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4b839fd3c9d..7ffdb154daec 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/kthread.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27#include "compat.h" 28#include "compat.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -32,6 +33,7 @@
32#include "print-tree.h" 33#include "print-tree.h"
33#include "volumes.h" 34#include "volumes.h"
34#include "async-thread.h" 35#include "async-thread.h"
36#include "check-integrity.h"
35 37
36static int init_first_rw_device(struct btrfs_trans_handle *trans, 38static int init_first_rw_device(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 39 struct btrfs_root *root,
@@ -246,7 +248,7 @@ loop_lock:
246 sync_pending = 0; 248 sync_pending = 0;
247 } 249 }
248 250
249 submit_bio(cur->bi_rw, cur); 251 btrfsic_submit_bio(cur->bi_rw, cur);
250 num_run++; 252 num_run++;
251 batch_run++; 253 batch_run++;
252 if (need_resched()) 254 if (need_resched())
@@ -829,7 +831,6 @@ out:
829 831
830/* 832/*
831 * find_free_dev_extent - find free space in the specified device 833 * find_free_dev_extent - find free space in the specified device
832 * @trans: transaction handler
833 * @device: the device which we search the free space in 834 * @device: the device which we search the free space in
834 * @num_bytes: the size of the free space that we need 835 * @num_bytes: the size of the free space that we need
835 * @start: store the start of the free space. 836 * @start: store the start of the free space.
@@ -848,8 +849,7 @@ out:
848 * But if we don't find suitable free space, it is used to store the size of 849 * But if we don't find suitable free space, it is used to store the size of
849 * the max free space. 850 * the max free space.
850 */ 851 */
851int find_free_dev_extent(struct btrfs_trans_handle *trans, 852int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
852 struct btrfs_device *device, u64 num_bytes,
853 u64 *start, u64 *len) 853 u64 *start, u64 *len)
854{ 854{
855 struct btrfs_key key; 855 struct btrfs_key key;
@@ -893,7 +893,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
893 key.offset = search_start; 893 key.offset = search_start;
894 key.type = BTRFS_DEV_EXTENT_KEY; 894 key.type = BTRFS_DEV_EXTENT_KEY;
895 895
896 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 896 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
897 if (ret < 0) 897 if (ret < 0)
898 goto out; 898 goto out;
899 if (ret > 0) { 899 if (ret > 0) {
@@ -1282,7 +1282,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1282 bool clear_super = false; 1282 bool clear_super = false;
1283 1283
1284 mutex_lock(&uuid_mutex); 1284 mutex_lock(&uuid_mutex);
1285 mutex_lock(&root->fs_info->volume_mutex);
1286 1285
1287 all_avail = root->fs_info->avail_data_alloc_bits | 1286 all_avail = root->fs_info->avail_data_alloc_bits |
1288 root->fs_info->avail_system_alloc_bits | 1287 root->fs_info->avail_system_alloc_bits |
@@ -1452,7 +1451,6 @@ error_close:
1452 if (bdev) 1451 if (bdev)
1453 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1452 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1454out: 1453out:
1455 mutex_unlock(&root->fs_info->volume_mutex);
1456 mutex_unlock(&uuid_mutex); 1454 mutex_unlock(&uuid_mutex);
1457 return ret; 1455 return ret;
1458error_undo: 1456error_undo:
@@ -1469,8 +1467,7 @@ error_undo:
1469/* 1467/*
1470 * does all the dirty work required for changing file system's UUID. 1468 * does all the dirty work required for changing file system's UUID.
1471 */ 1469 */
1472static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, 1470static int btrfs_prepare_sprout(struct btrfs_root *root)
1473 struct btrfs_root *root)
1474{ 1471{
1475 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1472 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1476 struct btrfs_fs_devices *old_devices; 1473 struct btrfs_fs_devices *old_devices;
@@ -1629,7 +1626,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1629 } 1626 }
1630 1627
1631 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1628 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1632 mutex_lock(&root->fs_info->volume_mutex);
1633 1629
1634 devices = &root->fs_info->fs_devices->devices; 1630 devices = &root->fs_info->fs_devices->devices;
1635 /* 1631 /*
@@ -1695,7 +1691,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1695 1691
1696 if (seeding_dev) { 1692 if (seeding_dev) {
1697 sb->s_flags &= ~MS_RDONLY; 1693 sb->s_flags &= ~MS_RDONLY;
1698 ret = btrfs_prepare_sprout(trans, root); 1694 ret = btrfs_prepare_sprout(root);
1699 BUG_ON(ret); 1695 BUG_ON(ret);
1700 } 1696 }
1701 1697
@@ -1757,8 +1753,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1757 ret = btrfs_relocate_sys_chunks(root); 1753 ret = btrfs_relocate_sys_chunks(root);
1758 BUG_ON(ret); 1754 BUG_ON(ret);
1759 } 1755 }
1760out: 1756
1761 mutex_unlock(&root->fs_info->volume_mutex);
1762 return ret; 1757 return ret;
1763error: 1758error:
1764 blkdev_put(bdev, FMODE_EXCL); 1759 blkdev_put(bdev, FMODE_EXCL);
@@ -1766,7 +1761,7 @@ error:
1766 mutex_unlock(&uuid_mutex); 1761 mutex_unlock(&uuid_mutex);
1767 up_write(&sb->s_umount); 1762 up_write(&sb->s_umount);
1768 } 1763 }
1769 goto out; 1764 return ret;
1770} 1765}
1771 1766
1772static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1767static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2077,6 +2072,362 @@ error:
2077 return ret; 2072 return ret;
2078} 2073}
2079 2074
2075static int insert_balance_item(struct btrfs_root *root,
2076 struct btrfs_balance_control *bctl)
2077{
2078 struct btrfs_trans_handle *trans;
2079 struct btrfs_balance_item *item;
2080 struct btrfs_disk_balance_args disk_bargs;
2081 struct btrfs_path *path;
2082 struct extent_buffer *leaf;
2083 struct btrfs_key key;
2084 int ret, err;
2085
2086 path = btrfs_alloc_path();
2087 if (!path)
2088 return -ENOMEM;
2089
2090 trans = btrfs_start_transaction(root, 0);
2091 if (IS_ERR(trans)) {
2092 btrfs_free_path(path);
2093 return PTR_ERR(trans);
2094 }
2095
2096 key.objectid = BTRFS_BALANCE_OBJECTID;
2097 key.type = BTRFS_BALANCE_ITEM_KEY;
2098 key.offset = 0;
2099
2100 ret = btrfs_insert_empty_item(trans, root, path, &key,
2101 sizeof(*item));
2102 if (ret)
2103 goto out;
2104
2105 leaf = path->nodes[0];
2106 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2107
2108 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2109
2110 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2111 btrfs_set_balance_data(leaf, item, &disk_bargs);
2112 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2113 btrfs_set_balance_meta(leaf, item, &disk_bargs);
2114 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2115 btrfs_set_balance_sys(leaf, item, &disk_bargs);
2116
2117 btrfs_set_balance_flags(leaf, item, bctl->flags);
2118
2119 btrfs_mark_buffer_dirty(leaf);
2120out:
2121 btrfs_free_path(path);
2122 err = btrfs_commit_transaction(trans, root);
2123 if (err && !ret)
2124 ret = err;
2125 return ret;
2126}
2127
2128static int del_balance_item(struct btrfs_root *root)
2129{
2130 struct btrfs_trans_handle *trans;
2131 struct btrfs_path *path;
2132 struct btrfs_key key;
2133 int ret, err;
2134
2135 path = btrfs_alloc_path();
2136 if (!path)
2137 return -ENOMEM;
2138
2139 trans = btrfs_start_transaction(root, 0);
2140 if (IS_ERR(trans)) {
2141 btrfs_free_path(path);
2142 return PTR_ERR(trans);
2143 }
2144
2145 key.objectid = BTRFS_BALANCE_OBJECTID;
2146 key.type = BTRFS_BALANCE_ITEM_KEY;
2147 key.offset = 0;
2148
2149 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2150 if (ret < 0)
2151 goto out;
2152 if (ret > 0) {
2153 ret = -ENOENT;
2154 goto out;
2155 }
2156
2157 ret = btrfs_del_item(trans, root, path);
2158out:
2159 btrfs_free_path(path);
2160 err = btrfs_commit_transaction(trans, root);
2161 if (err && !ret)
2162 ret = err;
2163 return ret;
2164}
2165
2166/*
2167 * This is a heuristic used to reduce the number of chunks balanced on
2168 * resume after balance was interrupted.
2169 */
2170static void update_balance_args(struct btrfs_balance_control *bctl)
2171{
2172 /*
2173 * Turn on soft mode for chunk types that were being converted.
2174 */
2175 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2176 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2177 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2178 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2179 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2180 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2181
2182 /*
2183 * Turn on usage filter if it is not already used. The idea is
2184 * that chunks that we have already balanced should be
2185 * reasonably full. Don't do it for chunks that are being
2186 * converted - that will keep us from relocating unconverted
2187 * (albeit full) chunks.
2188 */
2189 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2190 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2191 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2192 bctl->data.usage = 90;
2193 }
2194 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2195 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2196 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2197 bctl->sys.usage = 90;
2198 }
2199 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2200 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2201 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2202 bctl->meta.usage = 90;
2203 }
2204}
2205
2206/*
2207 * Should be called with both balance and volume mutexes held to
2208 * serialize other volume operations (add_dev/rm_dev/resize) with
2209 * restriper. Same goes for unset_balance_control.
2210 */
2211static void set_balance_control(struct btrfs_balance_control *bctl)
2212{
2213 struct btrfs_fs_info *fs_info = bctl->fs_info;
2214
2215 BUG_ON(fs_info->balance_ctl);
2216
2217 spin_lock(&fs_info->balance_lock);
2218 fs_info->balance_ctl = bctl;
2219 spin_unlock(&fs_info->balance_lock);
2220}
2221
2222static void unset_balance_control(struct btrfs_fs_info *fs_info)
2223{
2224 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2225
2226 BUG_ON(!fs_info->balance_ctl);
2227
2228 spin_lock(&fs_info->balance_lock);
2229 fs_info->balance_ctl = NULL;
2230 spin_unlock(&fs_info->balance_lock);
2231
2232 kfree(bctl);
2233}
2234
2235/*
2236 * Balance filters. Return 1 if chunk should be filtered out
2237 * (should not be balanced).
2238 */
2239static int chunk_profiles_filter(u64 chunk_profile,
2240 struct btrfs_balance_args *bargs)
2241{
2242 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2243
2244 if (chunk_profile == 0)
2245 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2246
2247 if (bargs->profiles & chunk_profile)
2248 return 0;
2249
2250 return 1;
2251}
2252
2253static u64 div_factor_fine(u64 num, int factor)
2254{
2255 if (factor <= 0)
2256 return 0;
2257 if (factor >= 100)
2258 return num;
2259
2260 num *= factor;
2261 do_div(num, 100);
2262 return num;
2263}
2264
2265static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2266 struct btrfs_balance_args *bargs)
2267{
2268 struct btrfs_block_group_cache *cache;
2269 u64 chunk_used, user_thresh;
2270 int ret = 1;
2271
2272 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2273 chunk_used = btrfs_block_group_used(&cache->item);
2274
2275 user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2276 if (chunk_used < user_thresh)
2277 ret = 0;
2278
2279 btrfs_put_block_group(cache);
2280 return ret;
2281}
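
Worked example with illustrative numbers: for a 1 GiB chunk (cache->key.offset == 1073741824) and a usage filter of 90, user_thresh = div_factor_fine(1073741824, 90) == 966367641. The chunk is relocated only if its used bytes fall below that threshold; fuller chunks are filtered out and left in place.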
2282
2283static int chunk_devid_filter(struct extent_buffer *leaf,
2284 struct btrfs_chunk *chunk,
2285 struct btrfs_balance_args *bargs)
2286{
2287 struct btrfs_stripe *stripe;
2288 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2289 int i;
2290
2291 for (i = 0; i < num_stripes; i++) {
2292 stripe = btrfs_stripe_nr(chunk, i);
2293 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2294 return 0;
2295 }
2296
2297 return 1;
2298}
2299
2300/* [pstart, pend) */
2301static int chunk_drange_filter(struct extent_buffer *leaf,
2302 struct btrfs_chunk *chunk,
2303 u64 chunk_offset,
2304 struct btrfs_balance_args *bargs)
2305{
2306 struct btrfs_stripe *stripe;
2307 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2308 u64 stripe_offset;
2309 u64 stripe_length;
2310 int factor;
2311 int i;
2312
2313 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2314 return 0;
2315
2316 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2317 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2318 factor = 2;
2319 else
2320 factor = 1;
2321 factor = num_stripes / factor;
2322
2323 for (i = 0; i < num_stripes; i++) {
2324 stripe = btrfs_stripe_nr(chunk, i);
2325 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2326 continue;
2327
2328 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2329 stripe_length = btrfs_chunk_length(leaf, chunk);
2330 do_div(stripe_length, factor);
2331
2332 if (stripe_offset < bargs->pend &&
2333 stripe_offset + stripe_length > bargs->pstart)
2334 return 0;
2335 }
2336
2337 return 1;
2338}
2339
2340/* [vstart, vend) */
2341static int chunk_vrange_filter(struct extent_buffer *leaf,
2342 struct btrfs_chunk *chunk,
2343 u64 chunk_offset,
2344 struct btrfs_balance_args *bargs)
2345{
2346 if (chunk_offset < bargs->vend &&
2347 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2348 /* at least part of the chunk is inside this vrange */
2349 return 0;
2350
2351 return 1;
2352}
2353
2354static int chunk_soft_convert_filter(u64 chunk_profile,
2355 struct btrfs_balance_args *bargs)
2356{
2357 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2358 return 0;
2359
2360 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2361
2362 if (chunk_profile == 0)
2363 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2364
2365 if (bargs->target & chunk_profile)
2366 return 1;
2367
2368 return 0;
2369}
2370
2371static int should_balance_chunk(struct btrfs_root *root,
2372 struct extent_buffer *leaf,
2373 struct btrfs_chunk *chunk, u64 chunk_offset)
2374{
2375 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2376 struct btrfs_balance_args *bargs = NULL;
2377 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2378
2379 /* type filter */
2380 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2381 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2382 return 0;
2383 }
2384
2385 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2386 bargs = &bctl->data;
2387 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2388 bargs = &bctl->sys;
2389 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2390 bargs = &bctl->meta;
2391
2392 /* profiles filter */
2393 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2394 chunk_profiles_filter(chunk_type, bargs)) {
2395 return 0;
2396 }
2397
2398 /* usage filter */
2399 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2400 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2401 return 0;
2402 }
2403
2404 /* devid filter */
2405 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2406 chunk_devid_filter(leaf, chunk, bargs)) {
2407 return 0;
2408 }
2409
2410 /* drange filter, makes sense only with devid filter */
2411 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2412 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2413 return 0;
2414 }
2415
2416 /* vrange filter */
2417 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2418 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2419 return 0;
2420 }
2421
2422 /* soft profile changing mode */
2423 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2424 chunk_soft_convert_filter(chunk_type, bargs)) {
2425 return 0;
2426 }
2427
2428 return 1;
2429}
2430
2080static u64 div_factor(u64 num, int factor) 2431static u64 div_factor(u64 num, int factor)
2081{ 2432{
2082 if (factor == 10) 2433 if (factor == 10)
@@ -2086,29 +2437,28 @@ static u64 div_factor(u64 num, int factor)
2086 return num; 2437 return num;
2087} 2438}
2088 2439
2089int btrfs_balance(struct btrfs_root *dev_root) 2440static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2090{ 2441{
2091 int ret; 2442 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2092 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 2443 struct btrfs_root *chunk_root = fs_info->chunk_root;
2444 struct btrfs_root *dev_root = fs_info->dev_root;
2445 struct list_head *devices;
2093 struct btrfs_device *device; 2446 struct btrfs_device *device;
2094 u64 old_size; 2447 u64 old_size;
2095 u64 size_to_free; 2448 u64 size_to_free;
2449 struct btrfs_chunk *chunk;
2096 struct btrfs_path *path; 2450 struct btrfs_path *path;
2097 struct btrfs_key key; 2451 struct btrfs_key key;
2098 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
2099 struct btrfs_trans_handle *trans;
2100 struct btrfs_key found_key; 2452 struct btrfs_key found_key;
2101 2453 struct btrfs_trans_handle *trans;
2102 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2454 struct extent_buffer *leaf;
2103 return -EROFS; 2455 int slot;
2104 2456 int ret;
2105 if (!capable(CAP_SYS_ADMIN)) 2457 int enospc_errors = 0;
2106 return -EPERM; 2458 bool counting = true;
2107
2108 mutex_lock(&dev_root->fs_info->volume_mutex);
2109 dev_root = dev_root->fs_info->dev_root;
2110 2459
2111 /* step one make some room on all the devices */ 2460 /* step one make some room on all the devices */
2461 devices = &fs_info->fs_devices->devices;
2112 list_for_each_entry(device, devices, dev_list) { 2462 list_for_each_entry(device, devices, dev_list) {
2113 old_size = device->total_bytes; 2463 old_size = device->total_bytes;
2114 size_to_free = div_factor(old_size, 1); 2464 size_to_free = div_factor(old_size, 1);
@@ -2137,11 +2487,23 @@ int btrfs_balance(struct btrfs_root *dev_root)
2137 ret = -ENOMEM; 2487 ret = -ENOMEM;
2138 goto error; 2488 goto error;
2139 } 2489 }
2490
2491 /* zero out stat counters */
2492 spin_lock(&fs_info->balance_lock);
2493 memset(&bctl->stat, 0, sizeof(bctl->stat));
2494 spin_unlock(&fs_info->balance_lock);
2495again:
2140 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2496 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2141 key.offset = (u64)-1; 2497 key.offset = (u64)-1;
2142 key.type = BTRFS_CHUNK_ITEM_KEY; 2498 key.type = BTRFS_CHUNK_ITEM_KEY;
2143 2499
2144 while (1) { 2500 while (1) {
2501 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2502 atomic_read(&fs_info->balance_cancel_req)) {
2503 ret = -ECANCELED;
2504 goto error;
2505 }
2506
2145 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2507 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2146 if (ret < 0) 2508 if (ret < 0)
2147 goto error; 2509 goto error;
@@ -2151,15 +2513,19 @@ int btrfs_balance(struct btrfs_root *dev_root)
2151 * failed 2513 * failed
2152 */ 2514 */
2153 if (ret == 0) 2515 if (ret == 0)
2154 break; 2516 BUG(); /* FIXME break ? */
2155 2517
2156 ret = btrfs_previous_item(chunk_root, path, 0, 2518 ret = btrfs_previous_item(chunk_root, path, 0,
2157 BTRFS_CHUNK_ITEM_KEY); 2519 BTRFS_CHUNK_ITEM_KEY);
2158 if (ret) 2520 if (ret) {
2521 ret = 0;
2159 break; 2522 break;
2523 }
2524
2525 leaf = path->nodes[0];
2526 slot = path->slots[0];
2527 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2160 2528
2161 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2162 path->slots[0]);
2163 if (found_key.objectid != key.objectid) 2529 if (found_key.objectid != key.objectid)
2164 break; 2530 break;
2165 2531
@@ -2167,22 +2533,375 @@ int btrfs_balance(struct btrfs_root *dev_root)
2167 if (found_key.offset == 0) 2533 if (found_key.offset == 0)
2168 break; 2534 break;
2169 2535
2536 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2537
2538 if (!counting) {
2539 spin_lock(&fs_info->balance_lock);
2540 bctl->stat.considered++;
2541 spin_unlock(&fs_info->balance_lock);
2542 }
2543
2544 ret = should_balance_chunk(chunk_root, leaf, chunk,
2545 found_key.offset);
2170 btrfs_release_path(path); 2546 btrfs_release_path(path);
2547 if (!ret)
2548 goto loop;
2549
2550 if (counting) {
2551 spin_lock(&fs_info->balance_lock);
2552 bctl->stat.expected++;
2553 spin_unlock(&fs_info->balance_lock);
2554 goto loop;
2555 }
2556
2171 ret = btrfs_relocate_chunk(chunk_root, 2557 ret = btrfs_relocate_chunk(chunk_root,
2172 chunk_root->root_key.objectid, 2558 chunk_root->root_key.objectid,
2173 found_key.objectid, 2559 found_key.objectid,
2174 found_key.offset); 2560 found_key.offset);
2175 if (ret && ret != -ENOSPC) 2561 if (ret && ret != -ENOSPC)
2176 goto error; 2562 goto error;
2563 if (ret == -ENOSPC) {
2564 enospc_errors++;
2565 } else {
2566 spin_lock(&fs_info->balance_lock);
2567 bctl->stat.completed++;
2568 spin_unlock(&fs_info->balance_lock);
2569 }
2570loop:
2177 key.offset = found_key.offset - 1; 2571 key.offset = found_key.offset - 1;
2178 } 2572 }
2179 ret = 0; 2573
2574 if (counting) {
2575 btrfs_release_path(path);
2576 counting = false;
2577 goto again;
2578 }
2180error: 2579error:
2181 btrfs_free_path(path); 2580 btrfs_free_path(path);
2182 mutex_unlock(&dev_root->fs_info->volume_mutex); 2581 if (enospc_errors) {
2582 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2583 enospc_errors);
2584 if (!ret)
2585 ret = -ENOSPC;
2586 }
2587
2588 return ret;
2589}
2590
2591static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2592{
2593 /* cancel requested || normal exit path */
2594 return atomic_read(&fs_info->balance_cancel_req) ||
2595 (atomic_read(&fs_info->balance_pause_req) == 0 &&
2596 atomic_read(&fs_info->balance_cancel_req) == 0);
2597}
2598
2599static void __cancel_balance(struct btrfs_fs_info *fs_info)
2600{
2601 int ret;
2602
2603 unset_balance_control(fs_info);
2604 ret = del_balance_item(fs_info->tree_root);
2605 BUG_ON(ret);
2606}
2607
2608void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2609 struct btrfs_ioctl_balance_args *bargs);
2610
2611/*
2612 * Should be called with both balance and volume mutexes held
2613 */
2614int btrfs_balance(struct btrfs_balance_control *bctl,
2615 struct btrfs_ioctl_balance_args *bargs)
2616{
2617 struct btrfs_fs_info *fs_info = bctl->fs_info;
2618 u64 allowed;
2619 int ret;
2620
2621 if (btrfs_fs_closing(fs_info) ||
2622 atomic_read(&fs_info->balance_pause_req) ||
2623 atomic_read(&fs_info->balance_cancel_req)) {
2624 ret = -EINVAL;
2625 goto out;
2626 }
2627
2628 /*
2629 * In case of mixed groups both data and meta should be picked,
2630 * and identical options should be given for both of them.
2631 */
2632 allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2633 if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2634 (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
2635 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2636 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2637 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2638 printk(KERN_ERR "btrfs: with mixed groups data and "
2639 "metadata balance options must be the same\n");
2640 ret = -EINVAL;
2641 goto out;
2642 }
2643 }
2644
2645 /*
2646 * Profile changing sanity checks. Skip them if a simple
2647 * balance is requested.
2648 */
2649 if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
2650 BTRFS_BALANCE_ARGS_CONVERT))
2651 goto do_balance;
2652
2653 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2654 if (fs_info->fs_devices->num_devices == 1)
2655 allowed |= BTRFS_BLOCK_GROUP_DUP;
2656 else if (fs_info->fs_devices->num_devices < 4)
2657 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2658 else
2659 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2660 BTRFS_BLOCK_GROUP_RAID10);
2661
2662 if (!profile_is_valid(bctl->data.target, 1) ||
2663 bctl->data.target & ~allowed) {
2664 printk(KERN_ERR "btrfs: unable to start balance with target "
2665 "data profile %llu\n",
2666 (unsigned long long)bctl->data.target);
2667 ret = -EINVAL;
2668 goto out;
2669 }
2670 if (!profile_is_valid(bctl->meta.target, 1) ||
2671 bctl->meta.target & ~allowed) {
2672 printk(KERN_ERR "btrfs: unable to start balance with target "
2673 "metadata profile %llu\n",
2674 (unsigned long long)bctl->meta.target);
2675 ret = -EINVAL;
2676 goto out;
2677 }
2678 if (!profile_is_valid(bctl->sys.target, 1) ||
2679 bctl->sys.target & ~allowed) {
2680 printk(KERN_ERR "btrfs: unable to start balance with target "
2681 "system profile %llu\n",
2682 (unsigned long long)bctl->sys.target);
2683 ret = -EINVAL;
2684 goto out;
2685 }
2686
2687 if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
2688 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2689 ret = -EINVAL;
2690 goto out;
2691 }
2692
2693 /* allow to reduce meta or sys integrity only if force set */
2694 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2695 BTRFS_BLOCK_GROUP_RAID10;
2696 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2697 (fs_info->avail_system_alloc_bits & allowed) &&
2698 !(bctl->sys.target & allowed)) ||
2699 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2700 (fs_info->avail_metadata_alloc_bits & allowed) &&
2701 !(bctl->meta.target & allowed))) {
2702 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2703 printk(KERN_INFO "btrfs: force reducing metadata "
2704 "integrity\n");
2705 } else {
2706 printk(KERN_ERR "btrfs: balance will reduce metadata "
2707 "integrity, use force if you want this\n");
2708 ret = -EINVAL;
2709 goto out;
2710 }
2711 }
2712
2713do_balance:
2714 ret = insert_balance_item(fs_info->tree_root, bctl);
2715 if (ret && ret != -EEXIST)
2716 goto out;
2717
2718 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2719 BUG_ON(ret == -EEXIST);
2720 set_balance_control(bctl);
2721 } else {
2722 BUG_ON(ret != -EEXIST);
2723 spin_lock(&fs_info->balance_lock);
2724 update_balance_args(bctl);
2725 spin_unlock(&fs_info->balance_lock);
2726 }
2727
2728 atomic_inc(&fs_info->balance_running);
2729 mutex_unlock(&fs_info->balance_mutex);
2730
2731 ret = __btrfs_balance(fs_info);
2732
2733 mutex_lock(&fs_info->balance_mutex);
2734 atomic_dec(&fs_info->balance_running);
2735
2736 if (bargs) {
2737 memset(bargs, 0, sizeof(*bargs));
2738 update_ioctl_balance_args(fs_info, 0, bargs);
2739 }
2740
2741 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2742 balance_need_close(fs_info)) {
2743 __cancel_balance(fs_info);
2744 }
2745
2746 wake_up(&fs_info->balance_wait_q);
2747
2748 return ret;
2749out:
2750 if (bctl->flags & BTRFS_BALANCE_RESUME)
2751 __cancel_balance(fs_info);
2752 else
2753 kfree(bctl);
2754 return ret;
2755}
2756
2757static int balance_kthread(void *data)
2758{
2759 struct btrfs_balance_control *bctl =
2760 (struct btrfs_balance_control *)data;
2761 struct btrfs_fs_info *fs_info = bctl->fs_info;
2762 int ret = 0;
2763
2764 mutex_lock(&fs_info->volume_mutex);
2765 mutex_lock(&fs_info->balance_mutex);
2766
2767 set_balance_control(bctl);
2768
2769 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2770 printk(KERN_INFO "btrfs: force skipping balance\n");
2771 } else {
2772 printk(KERN_INFO "btrfs: continuing balance\n");
2773 ret = btrfs_balance(bctl, NULL);
2774 }
2775
2776 mutex_unlock(&fs_info->balance_mutex);
2777 mutex_unlock(&fs_info->volume_mutex);
2778 return ret;
2779}
2780
2781int btrfs_recover_balance(struct btrfs_root *tree_root)
2782{
2783 struct task_struct *tsk;
2784 struct btrfs_balance_control *bctl;
2785 struct btrfs_balance_item *item;
2786 struct btrfs_disk_balance_args disk_bargs;
2787 struct btrfs_path *path;
2788 struct extent_buffer *leaf;
2789 struct btrfs_key key;
2790 int ret;
2791
2792 path = btrfs_alloc_path();
2793 if (!path)
2794 return -ENOMEM;
2795
2796 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2797 if (!bctl) {
2798 ret = -ENOMEM;
2799 goto out;
2800 }
2801
2802 key.objectid = BTRFS_BALANCE_OBJECTID;
2803 key.type = BTRFS_BALANCE_ITEM_KEY;
2804 key.offset = 0;
2805
2806 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2807 if (ret < 0)
2808 goto out_bctl;
2809 if (ret > 0) { /* ret = -ENOENT; */
2810 ret = 0;
2811 goto out_bctl;
2812 }
2813
2814 leaf = path->nodes[0];
2815 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2816
2817 bctl->fs_info = tree_root->fs_info;
2818 bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
2819
2820 btrfs_balance_data(leaf, item, &disk_bargs);
2821 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2822 btrfs_balance_meta(leaf, item, &disk_bargs);
2823 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2824 btrfs_balance_sys(leaf, item, &disk_bargs);
2825 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2826
2827 tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
2828 if (IS_ERR(tsk))
2829 ret = PTR_ERR(tsk);
2830 else
2831 goto out;
2832
2833out_bctl:
2834 kfree(bctl);
2835out:
2836 btrfs_free_path(path);
2183 return ret; 2837 return ret;
2184} 2838}
2185 2839
2840int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2841{
2842 int ret = 0;
2843
2844 mutex_lock(&fs_info->balance_mutex);
2845 if (!fs_info->balance_ctl) {
2846 mutex_unlock(&fs_info->balance_mutex);
2847 return -ENOTCONN;
2848 }
2849
2850 if (atomic_read(&fs_info->balance_running)) {
2851 atomic_inc(&fs_info->balance_pause_req);
2852 mutex_unlock(&fs_info->balance_mutex);
2853
2854 wait_event(fs_info->balance_wait_q,
2855 atomic_read(&fs_info->balance_running) == 0);
2856
2857 mutex_lock(&fs_info->balance_mutex);
2858 /* we are good with balance_ctl ripped off from under us */
2859 BUG_ON(atomic_read(&fs_info->balance_running));
2860 atomic_dec(&fs_info->balance_pause_req);
2861 } else {
2862 ret = -ENOTCONN;
2863 }
2864
2865 mutex_unlock(&fs_info->balance_mutex);
2866 return ret;
2867}
2868
2869int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2870{
2871 mutex_lock(&fs_info->balance_mutex);
2872 if (!fs_info->balance_ctl) {
2873 mutex_unlock(&fs_info->balance_mutex);
2874 return -ENOTCONN;
2875 }
2876
2877 atomic_inc(&fs_info->balance_cancel_req);
2878 /*
 2879 * if we are running, just wait and return; the balance item is
 2880 * deleted in btrfs_balance in this case
2881 */
2882 if (atomic_read(&fs_info->balance_running)) {
2883 mutex_unlock(&fs_info->balance_mutex);
2884 wait_event(fs_info->balance_wait_q,
2885 atomic_read(&fs_info->balance_running) == 0);
2886 mutex_lock(&fs_info->balance_mutex);
2887 } else {
2888 /* __cancel_balance needs volume_mutex */
2889 mutex_unlock(&fs_info->balance_mutex);
2890 mutex_lock(&fs_info->volume_mutex);
2891 mutex_lock(&fs_info->balance_mutex);
2892
2893 if (fs_info->balance_ctl)
2894 __cancel_balance(fs_info);
2895
2896 mutex_unlock(&fs_info->volume_mutex);
2897 }
2898
2899 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
2900 atomic_dec(&fs_info->balance_cancel_req);
2901 mutex_unlock(&fs_info->balance_mutex);
2902 return 0;
2903}
2904
2186/* 2905/*
2187 * shrinking a device means finding all of the device extents past 2906 * shrinking a device means finding all of the device extents past
2188 * the new size, and then following the back refs to the chunks. 2907 * the new size, and then following the back refs to the chunks.
@@ -2323,8 +3042,7 @@ done:
2323 return ret; 3042 return ret;
2324} 3043}
2325 3044
2326static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 3045static int btrfs_add_system_chunk(struct btrfs_root *root,
2327 struct btrfs_root *root,
2328 struct btrfs_key *key, 3046 struct btrfs_key *key,
2329 struct btrfs_chunk *chunk, int item_size) 3047 struct btrfs_chunk *chunk, int item_size)
2330{ 3048{
@@ -2441,10 +3159,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2441 max_stripe_size = 1024 * 1024 * 1024; 3159 max_stripe_size = 1024 * 1024 * 1024;
2442 max_chunk_size = 10 * max_stripe_size; 3160 max_chunk_size = 10 * max_stripe_size;
2443 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 3161 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2444 max_stripe_size = 256 * 1024 * 1024; 3162 /* for larger filesystems, use larger metadata chunks */
3163 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3164 max_stripe_size = 1024 * 1024 * 1024;
3165 else
3166 max_stripe_size = 256 * 1024 * 1024;
2445 max_chunk_size = max_stripe_size; 3167 max_chunk_size = max_stripe_size;
2446 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 3168 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2447 max_stripe_size = 8 * 1024 * 1024; 3169 max_stripe_size = 32 * 1024 * 1024;
2448 max_chunk_size = 2 * max_stripe_size; 3170 max_chunk_size = 2 * max_stripe_size;
2449 } else { 3171 } else {
2450 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 3172 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
@@ -2496,7 +3218,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2496 if (total_avail == 0) 3218 if (total_avail == 0)
2497 continue; 3219 continue;
2498 3220
2499 ret = find_free_dev_extent(trans, device, 3221 ret = find_free_dev_extent(device,
2500 max_stripe_size * dev_stripes, 3222 max_stripe_size * dev_stripes,
2501 &dev_offset, &max_avail); 3223 &dev_offset, &max_avail);
2502 if (ret && ret != -ENOSPC) 3224 if (ret && ret != -ENOSPC)
@@ -2687,7 +3409,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2687 BUG_ON(ret); 3409 BUG_ON(ret);
2688 3410
2689 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3411 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2690 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, 3412 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
2691 item_size); 3413 item_size);
2692 BUG_ON(ret); 3414 BUG_ON(ret);
2693 } 3415 }
@@ -2752,8 +3474,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2752 return ret; 3474 return ret;
2753 3475
2754 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 3476 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2755 (fs_info->metadata_alloc_profile & 3477 fs_info->avail_metadata_alloc_bits;
2756 fs_info->avail_metadata_alloc_bits);
2757 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3478 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2758 3479
2759 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 3480 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
@@ -2763,8 +3484,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2763 sys_chunk_offset = chunk_offset + chunk_size; 3484 sys_chunk_offset = chunk_offset + chunk_size;
2764 3485
2765 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 3486 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2766 (fs_info->system_alloc_profile & 3487 fs_info->avail_system_alloc_bits;
2767 fs_info->avail_system_alloc_bits);
2768 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3488 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2769 3489
2770 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3490 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@ -2901,26 +3621,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2901 u64 stripe_nr; 3621 u64 stripe_nr;
2902 u64 stripe_nr_orig; 3622 u64 stripe_nr_orig;
2903 u64 stripe_nr_end; 3623 u64 stripe_nr_end;
2904 int stripes_allocated = 8;
2905 int stripes_required = 1;
2906 int stripe_index; 3624 int stripe_index;
2907 int i; 3625 int i;
3626 int ret = 0;
2908 int num_stripes; 3627 int num_stripes;
2909 int max_errors = 0; 3628 int max_errors = 0;
2910 struct btrfs_bio *bbio = NULL; 3629 struct btrfs_bio *bbio = NULL;
2911 3630
2912 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2913 stripes_allocated = 1;
2914again:
2915 if (bbio_ret) {
2916 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2917 GFP_NOFS);
2918 if (!bbio)
2919 return -ENOMEM;
2920
2921 atomic_set(&bbio->error, 0);
2922 }
2923
2924 read_lock(&em_tree->lock); 3631 read_lock(&em_tree->lock);
2925 em = lookup_extent_mapping(em_tree, logical, *length); 3632 em = lookup_extent_mapping(em_tree, logical, *length);
2926 read_unlock(&em_tree->lock); 3633 read_unlock(&em_tree->lock);
@@ -2939,32 +3646,6 @@ again:
2939 if (mirror_num > map->num_stripes) 3646 if (mirror_num > map->num_stripes)
2940 mirror_num = 0; 3647 mirror_num = 0;
2941 3648
2942 /* if our btrfs_bio struct is too small, back off and try again */
2943 if (rw & REQ_WRITE) {
2944 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2945 BTRFS_BLOCK_GROUP_DUP)) {
2946 stripes_required = map->num_stripes;
2947 max_errors = 1;
2948 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2949 stripes_required = map->sub_stripes;
2950 max_errors = 1;
2951 }
2952 }
2953 if (rw & REQ_DISCARD) {
2954 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2955 BTRFS_BLOCK_GROUP_RAID1 |
2956 BTRFS_BLOCK_GROUP_DUP |
2957 BTRFS_BLOCK_GROUP_RAID10)) {
2958 stripes_required = map->num_stripes;
2959 }
2960 }
2961 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2962 stripes_allocated < stripes_required) {
2963 stripes_allocated = map->num_stripes;
2964 free_extent_map(em);
2965 kfree(bbio);
2966 goto again;
2967 }
2968 stripe_nr = offset; 3649 stripe_nr = offset;
2969 /* 3650 /*
2970 * stripe_nr counts the total number of stripes we have to stride 3651 * stripe_nr counts the total number of stripes we have to stride
@@ -2980,10 +3661,7 @@ again:
2980 3661
2981 if (rw & REQ_DISCARD) 3662 if (rw & REQ_DISCARD)
2982 *length = min_t(u64, em->len - offset, *length); 3663 *length = min_t(u64, em->len - offset, *length);
2983 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3664 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
2984 BTRFS_BLOCK_GROUP_RAID1 |
2985 BTRFS_BLOCK_GROUP_RAID10 |
2986 BTRFS_BLOCK_GROUP_DUP)) {
2987 /* we limit the length of each bio to what fits in a stripe */ 3665 /* we limit the length of each bio to what fits in a stripe */
2988 *length = min_t(u64, em->len - offset, 3666 *length = min_t(u64, em->len - offset,
2989 map->stripe_len - stripe_offset); 3667 map->stripe_len - stripe_offset);
@@ -3059,81 +3737,55 @@ again:
3059 } 3737 }
3060 BUG_ON(stripe_index >= map->num_stripes); 3738 BUG_ON(stripe_index >= map->num_stripes);
3061 3739
3740 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
3741 if (!bbio) {
3742 ret = -ENOMEM;
3743 goto out;
3744 }
3745 atomic_set(&bbio->error, 0);
3746
3062 if (rw & REQ_DISCARD) { 3747 if (rw & REQ_DISCARD) {
3748 int factor = 0;
3749 int sub_stripes = 0;
3750 u64 stripes_per_dev = 0;
3751 u32 remaining_stripes = 0;
3752
3753 if (map->type &
3754 (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3755 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3756 sub_stripes = 1;
3757 else
3758 sub_stripes = map->sub_stripes;
3759
3760 factor = map->num_stripes / sub_stripes;
3761 stripes_per_dev = div_u64_rem(stripe_nr_end -
3762 stripe_nr_orig,
3763 factor,
3764 &remaining_stripes);
3765 }
3766
3063 for (i = 0; i < num_stripes; i++) { 3767 for (i = 0; i < num_stripes; i++) {
3064 bbio->stripes[i].physical = 3768 bbio->stripes[i].physical =
3065 map->stripes[stripe_index].physical + 3769 map->stripes[stripe_index].physical +
3066 stripe_offset + stripe_nr * map->stripe_len; 3770 stripe_offset + stripe_nr * map->stripe_len;
3067 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 3771 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3068 3772
3069 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3773 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3070 u64 stripes; 3774 BTRFS_BLOCK_GROUP_RAID10)) {
3071 u32 last_stripe = 0; 3775 bbio->stripes[i].length = stripes_per_dev *
3072 int j; 3776 map->stripe_len;
3073 3777 if (i / sub_stripes < remaining_stripes)
3074 div_u64_rem(stripe_nr_end - 1, 3778 bbio->stripes[i].length +=
3075 map->num_stripes, 3779 map->stripe_len;
3076 &last_stripe); 3780 if (i < sub_stripes)
3077
3078 for (j = 0; j < map->num_stripes; j++) {
3079 u32 test;
3080
3081 div_u64_rem(stripe_nr_end - 1 - j,
3082 map->num_stripes, &test);
3083 if (test == stripe_index)
3084 break;
3085 }
3086 stripes = stripe_nr_end - 1 - j;
3087 do_div(stripes, map->num_stripes);
3088 bbio->stripes[i].length = map->stripe_len *
3089 (stripes - stripe_nr + 1);
3090
3091 if (i == 0) {
3092 bbio->stripes[i].length -=
3093 stripe_offset;
3094 stripe_offset = 0;
3095 }
3096 if (stripe_index == last_stripe)
3097 bbio->stripes[i].length -=
3098 stripe_end_offset;
3099 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3100 u64 stripes;
3101 int j;
3102 int factor = map->num_stripes /
3103 map->sub_stripes;
3104 u32 last_stripe = 0;
3105
3106 div_u64_rem(stripe_nr_end - 1,
3107 factor, &last_stripe);
3108 last_stripe *= map->sub_stripes;
3109
3110 for (j = 0; j < factor; j++) {
3111 u32 test;
3112
3113 div_u64_rem(stripe_nr_end - 1 - j,
3114 factor, &test);
3115
3116 if (test ==
3117 stripe_index / map->sub_stripes)
3118 break;
3119 }
3120 stripes = stripe_nr_end - 1 - j;
3121 do_div(stripes, factor);
3122 bbio->stripes[i].length = map->stripe_len *
3123 (stripes - stripe_nr + 1);
3124
3125 if (i < map->sub_stripes) {
3126 bbio->stripes[i].length -= 3781 bbio->stripes[i].length -=
3127 stripe_offset; 3782 stripe_offset;
3128 if (i == map->sub_stripes - 1) 3783 if ((i / sub_stripes + 1) %
3129 stripe_offset = 0; 3784 sub_stripes == remaining_stripes)
3130 }
3131 if (stripe_index >= last_stripe &&
3132 stripe_index <= (last_stripe +
3133 map->sub_stripes - 1)) {
3134 bbio->stripes[i].length -= 3785 bbio->stripes[i].length -=
3135 stripe_end_offset; 3786 stripe_end_offset;
3136 } 3787 if (i == sub_stripes - 1)
3788 stripe_offset = 0;
3137 } else 3789 } else
3138 bbio->stripes[i].length = *length; 3790 bbio->stripes[i].length = *length;
3139 3791
@@ -3155,15 +3807,22 @@ again:
3155 stripe_index++; 3807 stripe_index++;
3156 } 3808 }
3157 } 3809 }
3158 if (bbio_ret) { 3810
3159 *bbio_ret = bbio; 3811 if (rw & REQ_WRITE) {
3160 bbio->num_stripes = num_stripes; 3812 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3161 bbio->max_errors = max_errors; 3813 BTRFS_BLOCK_GROUP_RAID10 |
3162 bbio->mirror_num = mirror_num; 3814 BTRFS_BLOCK_GROUP_DUP)) {
3815 max_errors = 1;
3816 }
3163 } 3817 }
3818
3819 *bbio_ret = bbio;
3820 bbio->num_stripes = num_stripes;
3821 bbio->max_errors = max_errors;
3822 bbio->mirror_num = mirror_num;
3164out: 3823out:
3165 free_extent_map(em); 3824 free_extent_map(em);
3166 return 0; 3825 return ret;
3167} 3826}
3168 3827
3169int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3828int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
@@ -3304,7 +3963,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
3304 /* don't bother with additional async steps for reads, right now */ 3963 /* don't bother with additional async steps for reads, right now */
3305 if (!(rw & REQ_WRITE)) { 3964 if (!(rw & REQ_WRITE)) {
3306 bio_get(bio); 3965 bio_get(bio);
3307 submit_bio(rw, bio); 3966 btrfsic_submit_bio(rw, bio);
3308 bio_put(bio); 3967 bio_put(bio);
3309 return 0; 3968 return 0;
3310 } 3969 }
@@ -3399,7 +4058,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3399 if (async_submit) 4058 if (async_submit)
3400 schedule_bio(root, dev, rw, bio); 4059 schedule_bio(root, dev, rw, bio);
3401 else 4060 else
3402 submit_bio(rw, bio); 4061 btrfsic_submit_bio(rw, bio);
3403 } else { 4062 } else {
3404 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 4063 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
3405 bio->bi_sector = logical >> 9; 4064 bio->bi_sector = logical >> 9;
@@ -3568,7 +4227,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3568 struct btrfs_fs_devices *fs_devices; 4227 struct btrfs_fs_devices *fs_devices;
3569 int ret; 4228 int ret;
3570 4229
3571 mutex_lock(&uuid_mutex); 4230 BUG_ON(!mutex_is_locked(&uuid_mutex));
3572 4231
3573 fs_devices = root->fs_info->fs_devices->seed; 4232 fs_devices = root->fs_info->fs_devices->seed;
3574 while (fs_devices) { 4233 while (fs_devices) {
@@ -3606,7 +4265,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3606 fs_devices->seed = root->fs_info->fs_devices->seed; 4265 fs_devices->seed = root->fs_info->fs_devices->seed;
3607 root->fs_info->fs_devices->seed = fs_devices; 4266 root->fs_info->fs_devices->seed = fs_devices;
3608out: 4267out:
3609 mutex_unlock(&uuid_mutex);
3610 return ret; 4268 return ret;
3611} 4269}
3612 4270
@@ -3749,6 +4407,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
3749 if (!path) 4407 if (!path)
3750 return -ENOMEM; 4408 return -ENOMEM;
3751 4409
4410 mutex_lock(&uuid_mutex);
4411 lock_chunks(root);
4412
3752 /* first we search for all of the device items, and then we 4413 /* first we search for all of the device items, and then we
3753 * read in all of the chunk items. This way we can create chunk 4414 * read in all of the chunk items. This way we can create chunk
 3754 * mappings that reference all of the devices that are found 4415
@@ -3799,6 +4460,9 @@ again:
3799 } 4460 }
3800 ret = 0; 4461 ret = 0;
3801error: 4462error:
4463 unlock_chunks(root);
4464 mutex_unlock(&uuid_mutex);
4465
3802 btrfs_free_path(path); 4466 btrfs_free_path(path);
3803 return ret; 4467 return ret;
3804} 4468}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 78f2d4d4f37f..19ac95048b88 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -186,6 +186,51 @@ struct map_lookup {
186#define map_lookup_size(n) (sizeof(struct map_lookup) + \ 186#define map_lookup_size(n) (sizeof(struct map_lookup) + \
187 (sizeof(struct btrfs_bio_stripe) * (n))) 187 (sizeof(struct btrfs_bio_stripe) * (n)))
188 188
189/*
190 * Restriper's general type filter
191 */
192#define BTRFS_BALANCE_DATA (1ULL << 0)
193#define BTRFS_BALANCE_SYSTEM (1ULL << 1)
194#define BTRFS_BALANCE_METADATA (1ULL << 2)
195
196#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \
197 BTRFS_BALANCE_SYSTEM | \
198 BTRFS_BALANCE_METADATA)
199
200#define BTRFS_BALANCE_FORCE (1ULL << 3)
201#define BTRFS_BALANCE_RESUME (1ULL << 4)
202
203/*
204 * Balance filters
205 */
206#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
207#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1)
208#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
209#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
210#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
211
212/*
213 * Profile changing flags. When SOFT is set we won't relocate chunk if
214 * it already has the target profile (even though it may be
215 * half-filled).
216 */
217#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
218#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
219
220struct btrfs_balance_args;
221struct btrfs_balance_progress;
222struct btrfs_balance_control {
223 struct btrfs_fs_info *fs_info;
224
225 struct btrfs_balance_args data;
226 struct btrfs_balance_args meta;
227 struct btrfs_balance_args sys;
228
229 u64 flags;
230
231 struct btrfs_balance_progress stat;
232};
233
189int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 234int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
190 u64 end, u64 *length); 235 u64 end, u64 *length);
191 236
@@ -228,9 +273,12 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
228 u8 *uuid, u8 *fsid); 273 u8 *uuid, u8 *fsid);
229int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 274int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
230int btrfs_init_new_device(struct btrfs_root *root, char *path); 275int btrfs_init_new_device(struct btrfs_root *root, char *path);
231int btrfs_balance(struct btrfs_root *dev_root); 276int btrfs_balance(struct btrfs_balance_control *bctl,
277 struct btrfs_ioctl_balance_args *bargs);
278int btrfs_recover_balance(struct btrfs_root *tree_root);
279int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
280int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
232int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 281int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
233int find_free_dev_extent(struct btrfs_trans_handle *trans, 282int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
234 struct btrfs_device *device, u64 num_bytes,
235 u64 *start, u64 *max_avail); 283 u64 *start, u64 *max_avail);
236#endif 284#endif
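
To illustrate how these flags combine, a sketch of what a caller (for example the balance ioctl path) might do to soft-convert metadata and system chunks to RAID1 on a non-mixed-groups filesystem; start_soft_raid1_convert() is a hypothetical helper and error handling is trimmed:

static int start_soft_raid1_convert(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl;
	int ret;

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl)
		return -ENOMEM;

	bctl->fs_info = fs_info;
	bctl->flags = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA |
		      BTRFS_BALANCE_SYSTEM;

	/* CONVERT sets the target profile; SOFT skips chunks that
	 * already have it, even if they are half-filled */
	bctl->meta.target = BTRFS_BLOCK_GROUP_RAID1;
	bctl->meta.flags = BTRFS_BALANCE_ARGS_CONVERT | BTRFS_BALANCE_ARGS_SOFT;
	bctl->sys.target = BTRFS_BLOCK_GROUP_RAID1;
	bctl->sys.flags = BTRFS_BALANCE_ARGS_CONVERT | BTRFS_BALANCE_ARGS_SOFT;

	/* btrfs_balance() expects both mutexes held and takes ownership
	 * of bctl (freed via __cancel_balance or on the error path) */
	mutex_lock(&fs_info->volume_mutex);
	mutex_lock(&fs_info->balance_mutex);
	ret = btrfs_balance(bctl, NULL);
	mutex_unlock(&fs_info->balance_mutex);
	mutex_unlock(&fs_info->volume_mutex);

	return ret;
}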
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3848b04e310e..e7a5659087e6 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
200 ret = btrfs_update_inode(trans, root, inode); 200 ret = btrfs_update_inode(trans, root, inode);
201 BUG_ON(ret); 201 BUG_ON(ret);
202out: 202out:
203 btrfs_end_transaction_throttle(trans, root); 203 btrfs_end_transaction(trans, root);
204 return ret; 204 return ret;
205} 205}
206 206
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index b31702ac15be..84f3001a568d 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -16,6 +16,8 @@ struct btrfs_delayed_ref_node;
16struct btrfs_delayed_tree_ref; 16struct btrfs_delayed_tree_ref;
17struct btrfs_delayed_data_ref; 17struct btrfs_delayed_data_ref;
18struct btrfs_delayed_ref_head; 18struct btrfs_delayed_ref_head;
19struct btrfs_block_group_cache;
20struct btrfs_free_cluster;
19struct map_lookup; 21struct map_lookup;
20struct extent_buffer; 22struct extent_buffer;
21 23
@@ -44,6 +46,17 @@ struct extent_buffer;
44 obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \ 46 obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) || \
45 (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-" 47 (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-"
46 48
49#define BTRFS_GROUP_FLAGS \
50 { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \
51 { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \
52 { BTRFS_BLOCK_GROUP_METADATA, "METADATA"}, \
53 { BTRFS_BLOCK_GROUP_RAID0, "RAID0"}, \
54 { BTRFS_BLOCK_GROUP_RAID1, "RAID1"}, \
55 { BTRFS_BLOCK_GROUP_DUP, "DUP"}, \
56 { BTRFS_BLOCK_GROUP_RAID10, "RAID10"}
57
58#define BTRFS_UUID_SIZE 16
59
47TRACE_EVENT(btrfs_transaction_commit, 60TRACE_EVENT(btrfs_transaction_commit,
48 61
49 TP_PROTO(struct btrfs_root *root), 62 TP_PROTO(struct btrfs_root *root),
@@ -621,6 +634,34 @@ TRACE_EVENT(btrfs_cow_block,
621 __entry->cow_level) 634 __entry->cow_level)
622); 635);
623 636
637TRACE_EVENT(btrfs_space_reservation,
638
639 TP_PROTO(struct btrfs_fs_info *fs_info, char *type, u64 val,
640 u64 bytes, int reserve),
641
642 TP_ARGS(fs_info, type, val, bytes, reserve),
643
644 TP_STRUCT__entry(
645 __array( u8, fsid, BTRFS_UUID_SIZE )
646 __string( type, type )
647 __field( u64, val )
648 __field( u64, bytes )
649 __field( int, reserve )
650 ),
651
652 TP_fast_assign(
653 memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE);
654 __assign_str(type, type);
655 __entry->val = val;
656 __entry->bytes = bytes;
657 __entry->reserve = reserve;
658 ),
659
660 TP_printk("%pU: %s: %Lu %s %Lu", __entry->fsid, __get_str(type),
661 __entry->val, __entry->reserve ? "reserve" : "release",
662 __entry->bytes)
663);
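
As with any TRACE_EVENT, the definition above generates a trace_btrfs_space_reservation() helper for the call sites; a sketch of an emit, where the space_info pointer used as the identifying value is illustrative:

	trace_btrfs_space_reservation(fs_info, "space_info",
				      (u64)(unsigned long)space_info,
				      num_bytes, 1 /* reserve; 0 = release */);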
664
624DECLARE_EVENT_CLASS(btrfs__reserved_extent, 665DECLARE_EVENT_CLASS(btrfs__reserved_extent,
625 666
626 TP_PROTO(struct btrfs_root *root, u64 start, u64 len), 667 TP_PROTO(struct btrfs_root *root, u64 start, u64 len),
@@ -659,6 +700,168 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free,
 	TP_ARGS(root, start, len)
 );
 
+TRACE_EVENT(find_free_extent,
+
+	TP_PROTO(struct btrfs_root *root, u64 num_bytes, u64 empty_size,
+		 u64 data),
+
+	TP_ARGS(root, num_bytes, empty_size, data),
+
+	TP_STRUCT__entry(
+		__field( u64, root_objectid )
+		__field( u64, num_bytes )
+		__field( u64, empty_size )
+		__field( u64, data )
+	),
+
+	TP_fast_assign(
+		__entry->root_objectid = root->root_key.objectid;
+		__entry->num_bytes = num_bytes;
+		__entry->empty_size = empty_size;
+		__entry->data = data;
+	),
+
+	TP_printk("root = %Lu(%s), len = %Lu, empty_size = %Lu, "
+		  "flags = %Lu(%s)", show_root_type(__entry->root_objectid),
+		  __entry->num_bytes, __entry->empty_size, __entry->data,
+		  __print_flags((unsigned long)__entry->data, "|",
+				BTRFS_GROUP_FLAGS))
+);
+
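The allocator fires this event once per allocation attempt. With the TP_printk format above, a 4 KiB data allocation from the FS tree would come out roughly as follows — the call is the generated wrapper, and the sample line is worked out from the format string rather than captured:

	trace_find_free_extent(root, num_bytes, empty_size, data);

	/* e.g. in the trace buffer:
	 * find_free_extent: root = 5(FS_TREE), len = 4096, empty_size = 0,
	 * flags = 1(DATA)
	 */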
+DECLARE_EVENT_CLASS(btrfs__reserve_extent,
+
+	TP_PROTO(struct btrfs_root *root,
+		 struct btrfs_block_group_cache *block_group, u64 start,
+		 u64 len),
+
+	TP_ARGS(root, block_group, start, len),
+
+	TP_STRUCT__entry(
+		__field( u64, root_objectid )
+		__field( u64, bg_objectid )
+		__field( u64, flags )
+		__field( u64, start )
+		__field( u64, len )
+	),
+
+	TP_fast_assign(
+		__entry->root_objectid = root->root_key.objectid;
+		__entry->bg_objectid = block_group->key.objectid;
+		__entry->flags = block_group->flags;
+		__entry->start = start;
+		__entry->len = len;
+	),
+
+	TP_printk("root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), "
+		  "start = %Lu, len = %Lu",
+		  show_root_type(__entry->root_objectid), __entry->bg_objectid,
+		  __entry->flags, __print_flags((unsigned long)__entry->flags,
+						"|", BTRFS_GROUP_FLAGS),
+		  __entry->start, __entry->len)
+);
+
+DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent,
+
+	TP_PROTO(struct btrfs_root *root,
+		 struct btrfs_block_group_cache *block_group, u64 start,
+		 u64 len),
+
+	TP_ARGS(root, block_group, start, len)
+);
+
+DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster,
+
+	TP_PROTO(struct btrfs_root *root,
+		 struct btrfs_block_group_cache *block_group, u64 start,
+		 u64 len),
+
+	TP_ARGS(root, block_group, start, len)
+);
+
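DECLARE_EVENT_CLASS() factors the record layout and format string out once; each DEFINE_EVENT() then adds only a thin wrapper sharing them. The two events above distinguish plain allocations from cluster-backed ones — sketched call sites, with the surrounding variable names illustrative:

	/* ordinary allocation path */
	trace_btrfs_reserve_extent(root, block_group, search_start, num_bytes);

	/* allocation satisfied from a free-space cluster */
	trace_btrfs_reserve_extent_cluster(root, block_group,
					   search_start, num_bytes);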
+TRACE_EVENT(btrfs_find_cluster,
+
+	TP_PROTO(struct btrfs_block_group_cache *block_group, u64 start,
+		 u64 bytes, u64 empty_size, u64 min_bytes),
+
+	TP_ARGS(block_group, start, bytes, empty_size, min_bytes),
+
+	TP_STRUCT__entry(
+		__field( u64, bg_objectid )
+		__field( u64, flags )
+		__field( u64, start )
+		__field( u64, bytes )
+		__field( u64, empty_size )
+		__field( u64, min_bytes )
+	),
+
+	TP_fast_assign(
+		__entry->bg_objectid = block_group->key.objectid;
+		__entry->flags = block_group->flags;
+		__entry->start = start;
+		__entry->bytes = bytes;
+		__entry->empty_size = empty_size;
+		__entry->min_bytes = min_bytes;
+	),
+
+	TP_printk("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu,"
+		  " empty_size = %Lu, min_bytes = %Lu", __entry->bg_objectid,
+		  __entry->flags,
+		  __print_flags((unsigned long)__entry->flags, "|",
+				BTRFS_GROUP_FLAGS), __entry->start,
+		  __entry->bytes, __entry->empty_size, __entry->min_bytes)
+);
+
+TRACE_EVENT(btrfs_failed_cluster_setup,
+
+	TP_PROTO(struct btrfs_block_group_cache *block_group),
+
+	TP_ARGS(block_group),
+
+	TP_STRUCT__entry(
+		__field( u64, bg_objectid )
+	),
+
+	TP_fast_assign(
+		__entry->bg_objectid = block_group->key.objectid;
+	),
+
+	TP_printk("block_group = %Lu", __entry->bg_objectid)
+);
+
+TRACE_EVENT(btrfs_setup_cluster,
+
+	TP_PROTO(struct btrfs_block_group_cache *block_group,
+		 struct btrfs_free_cluster *cluster, u64 size, int bitmap),
+
+	TP_ARGS(block_group, cluster, size, bitmap),
+
+	TP_STRUCT__entry(
+		__field( u64, bg_objectid )
+		__field( u64, flags )
+		__field( u64, start )
+		__field( u64, max_size )
+		__field( u64, size )
+		__field( int, bitmap )
+	),
+
+	TP_fast_assign(
+		__entry->bg_objectid = block_group->key.objectid;
+		__entry->flags = block_group->flags;
+		__entry->start = cluster->window_start;
+		__entry->max_size = cluster->max_size;
+		__entry->size = size;
+		__entry->bitmap = bitmap;
+	),
+
+	TP_printk("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, "
+		  "size = %Lu, max_size = %Lu, bitmap = %d",
+		  __entry->bg_objectid,
+		  __entry->flags,
+		  __print_flags((unsigned long)__entry->flags, "|",
+				BTRFS_GROUP_FLAGS), __entry->start,
+		  __entry->size, __entry->max_size, __entry->bitmap)
+);
+
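Together, the three cluster events bracket one cluster-setup attempt. The control flow being instrumented looks roughly like this sketch of the free-space-cache logic — setup_failed, window_size and bitmap are illustrative locals, not a verbatim excerpt:

	trace_btrfs_find_cluster(block_group, offset, bytes,
				 empty_size, min_bytes);

	if (setup_failed) {
		/* no window large enough could be assembled */
		trace_btrfs_failed_cluster_setup(block_group);
	} else {
		/* bitmap = 1 if the window came from a bitmap entry */
		trace_btrfs_setup_cluster(block_group, cluster,
					  window_size, bitmap);
	}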
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
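To watch any of the events defined in this file at run time, enable them through the tracing debugfs interface and read the pipe. A small standalone program, assuming debugfs is mounted at /sys/kernel/debug and the kernel was built with tracing:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Enable every btrfs event; individual events also have their
	 * own events/btrfs/<name>/enable file. */
	int fd = open("/sys/kernel/debug/tracing/events/btrfs/enable",
		      O_WRONLY);
	if (fd < 0) {
		perror("open enable");
		return 1;
	}
	if (write(fd, "1", 1) != 1)
		perror("write enable");
	close(fd);

	/* Stream formatted events; fgets() blocks until records arrive. */
	FILE *pipe = fopen("/sys/kernel/debug/tracing/trace_pipe", "r");
	if (!pipe) {
		perror("open trace_pipe");
		return 1;
	}
	char line[512];
	while (fgets(line, sizeof(line), pipe))
		fputs(line, stdout);
	fclose(pipe);
	return 0;
}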