Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Kconfig               19
-rw-r--r--  fs/btrfs/Makefile               3
-rw-r--r--  fs/btrfs/async-thread.c         2
-rw-r--r--  fs/btrfs/backref.c           1131
-rw-r--r--  fs/btrfs/backref.h              5
-rw-r--r--  fs/btrfs/btrfs_inode.h          3
-rw-r--r--  fs/btrfs/check-integrity.c   3068
-rw-r--r--  fs/btrfs/check-integrity.h     36
-rw-r--r--  fs/btrfs/ctree.c               42
-rw-r--r--  fs/btrfs/ctree.h              239
-rw-r--r--  fs/btrfs/delayed-inode.c       45
-rw-r--r--  fs/btrfs/delayed-ref.c        153
-rw-r--r--  fs/btrfs/delayed-ref.h        104
-rw-r--r--  fs/btrfs/disk-io.c            132
-rw-r--r--  fs/btrfs/disk-io.h              6
-rw-r--r--  fs/btrfs/export.c               2
-rw-r--r--  fs/btrfs/extent-tree.c        465
-rw-r--r--  fs/btrfs/extent_io.c            6
-rw-r--r--  fs/btrfs/extent_io.h            2
-rw-r--r--  fs/btrfs/file.c                16
-rw-r--r--  fs/btrfs/free-space-cache.c   419
-rw-r--r--  fs/btrfs/inode-map.c            4
-rw-r--r--  fs/btrfs/inode.c               79
-rw-r--r--  fs/btrfs/ioctl.c              290
-rw-r--r--  fs/btrfs/ioctl.h               54
-rw-r--r--  fs/btrfs/locking.c             53
-rw-r--r--  fs/btrfs/relocation.c          20
-rw-r--r--  fs/btrfs/scrub.c               12
-rw-r--r--  fs/btrfs/super.c              193
-rw-r--r--  fs/btrfs/transaction.c         20
-rw-r--r--  fs/btrfs/tree-log.c             2
-rw-r--r--  fs/btrfs/ulist.c              220
-rw-r--r--  fs/btrfs/ulist.h               68
-rw-r--r--  fs/btrfs/volumes.c            993
-rw-r--r--  fs/btrfs/volumes.h             54
-rw-r--r--  fs/btrfs/xattr.c                2
36 files changed, 6884 insertions, 1078 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ecb9fd3be143..d33f01c08b60 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL
 	  Linux website <http://acl.bestbits.at/>.
 
 	  If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+	bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+	depends on BTRFS_FS
+	help
+	  Adds code that examines all block write requests (including
+	  writes of the super block). The goal is to verify that the
+	  state of the filesystem on disk is always consistent, i.e.,
+	  after a power-loss or kernel panic event the filesystem is
+	  in a consistent state.
+
+	  If the integrity check tool is included and activated in
+	  the mount options, plenty of kernel memory is used, and
+	  plenty of additional CPU cycles are spent. Enabling this
+	  functionality is not intended for normal use.
+
+	  In most cases, unless you are a btrfs developer who needs
+	  to verify the integrity of (super)-block write requests
+	  during the run of a regression test, say N
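
The new config option only compiles the checker in; it is activated per mount via the check_int, check_int_data and check_int_print_mask options handled by check-integrity.c later in this diff. As a hedged illustration (the program below is not part of this commit), the print-mask value 263 used in that file's own mount example is simply an OR of the BTRFSIC_PRINT_MASK_* bits it defines:

	/* Illustrative only -- not part of this commit. Shows how the value
	 * 263 from the "check_int_print_mask=263" mount example decomposes
	 * into the BTRFSIC_PRINT_MASK_* bits defined in check-integrity.c. */
	#include <stdio.h>

	#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE			0x00000001
	#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION		0x00000002
	#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE			0x00000004
	#define BTRFSIC_PRINT_MASK_INITIAL_TREE				0x00000100

	int main(void)
	{
		unsigned int mask = BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE |
				    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION |
				    BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE |
				    BTRFSIC_PRINT_MASK_INITIAL_TREE;

		printf("check_int_print_mask=%u\n", mask);	/* prints 263 */
		return 0;
	}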
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index c0ddfd29c5e5..0c4fa2befae7 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o
+	   reada.o backref.o ulist.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 0b394580d860..0cc20b35c1c4 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -334,7 +334,7 @@ again:
 		if (freezing(current)) {
 			worker->working = 0;
 			spin_unlock_irq(&worker->lock);
-			refrigerator();
+			try_to_freeze();
 		} else {
 			spin_unlock_irq(&worker->lock);
 			if (!kthread_should_stop()) {
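
The hunk above replaces the deprecated refrigerator() call with try_to_freeze(), which checks the freezing condition and enters the refrigerator itself. A minimal sketch of the resulting idiom in a freezable kthread (the worker name and loop body are illustrative, not from this commit):

	#include <linux/freezer.h>
	#include <linux/kthread.h>

	static int example_worker(void *arg)
	{
		set_freezable();		/* mark this kthread as freezable */
		while (!kthread_should_stop()) {
			try_to_freeze();	/* freezes here if a freeze is pending */
			/* ... perform one unit of work, then sleep ... */
			schedule_timeout_interruptible(HZ);
		}
		return 0;
	}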
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 22c64fff1bd5..b9a843226de8 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -19,18 +19,789 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
 
-struct __data_ref {
+/*
+ * this structure records all encountered refs on the way up to the root
+ */
+struct __prelim_ref {
 	struct list_head list;
-	u64 inum;
-	u64 root;
-	u64 extent_data_item_offset;
+	u64 root_id;
+	struct btrfs_key key;
+	int level;
+	int count;
+	u64 parent;
+	u64 wanted_disk_byte;
 };
 
-struct __shared_ref {
-	struct list_head list;
+static int __add_prelim_ref(struct list_head *head, u64 root_id,
+			    struct btrfs_key *key, int level, u64 parent,
+			    u64 wanted_disk_byte, int count)
+{
+	struct __prelim_ref *ref;
+
+	/* in case we're adding delayed refs, we're holding the refs spinlock */
+	ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
+	if (!ref)
+		return -ENOMEM;
+
+	ref->root_id = root_id;
+	if (key)
+		ref->key = *key;
+	else
+		memset(&ref->key, 0, sizeof(ref->key));
+
+	ref->level = level;
+	ref->count = count;
+	ref->parent = parent;
+	ref->wanted_disk_byte = wanted_disk_byte;
+	list_add_tail(&ref->list, head);
+
+	return 0;
+}
+
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+			   struct ulist *parents,
+			   struct extent_buffer *eb, int level,
+			   u64 wanted_objectid, u64 wanted_disk_byte)
+{
+	int ret;
+	int slot;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
 	u64 disk_byte;
-};
+
+add_parent:
+	ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+	if (ret < 0)
+		return ret;
+
+	if (level != 0)
+		return 0;
+
+	/*
+	 * if the current leaf is full with EXTENT_DATA items, we must
+	 * check the next one if that holds a reference as well.
+	 * ref->count cannot be used to skip this check.
+	 * repeat this until we don't find any additional EXTENT_DATA items.
+	 */
+	while (1) {
+		ret = btrfs_next_leaf(root, path);
+		if (ret < 0)
+			return ret;
+		if (ret)
+			return 0;
+
+		eb = path->nodes[0];
+		for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
+			btrfs_item_key_to_cpu(eb, &key, slot);
+			if (key.objectid != wanted_objectid ||
+			    key.type != BTRFS_EXTENT_DATA_KEY)
+				return 0;
+			fi = btrfs_item_ptr(eb, slot,
+					    struct btrfs_file_extent_item);
+			disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+			if (disk_byte == wanted_disk_byte)
+				goto add_parent;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ */
+static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+				  struct __prelim_ref *ref,
+				  struct ulist *parents)
+{
+	struct btrfs_path *path;
+	struct btrfs_root *root;
+	struct btrfs_key root_key;
+	struct btrfs_key key = {0};
+	struct extent_buffer *eb;
+	int ret = 0;
+	int root_level;
+	int level = ref->level;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	root_key.objectid = ref->root_id;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = (u64)-1;
+	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		goto out;
+	}
+
+	rcu_read_lock();
+	root_level = btrfs_header_level(root->node);
+	rcu_read_unlock();
+
+	if (root_level + 1 == level)
+		goto out;
+
+	path->lowest_level = level;
+	ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+	pr_debug("search slot in root %llu (level %d, ref count %d) returned "
+		 "%d for key (%llu %u %llu)\n",
+		 (unsigned long long)ref->root_id, level, ref->count, ret,
+		 (unsigned long long)ref->key.objectid, ref->key.type,
+		 (unsigned long long)ref->key.offset);
+	if (ret < 0)
+		goto out;
+
+	eb = path->nodes[level];
+	if (!eb) {
+		WARN_ON(1);
+		ret = 1;
+		goto out;
+	}
+
+	if (level == 0) {
+		if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret)
+				goto out;
+			eb = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+	}
+
+	/* the last two parameters will only be used for level == 0 */
+	ret = add_all_parents(root, path, parents, eb, level, key.objectid,
+			      ref->wanted_disk_byte);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * resolve all indirect backrefs from the list
+ */
+static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+				   struct list_head *head)
+{
+	int err;
+	int ret = 0;
+	struct __prelim_ref *ref;
+	struct __prelim_ref *ref_safe;
+	struct __prelim_ref *new_ref;
+	struct ulist *parents;
+	struct ulist_node *node;
+
+	parents = ulist_alloc(GFP_NOFS);
+	if (!parents)
+		return -ENOMEM;
+
+	/*
+	 * _safe allows us to insert directly after the current item without
+	 * iterating over the newly inserted items.
+	 * we're also allowed to re-assign ref during iteration.
+	 */
+	list_for_each_entry_safe(ref, ref_safe, head, list) {
+		if (ref->parent)	/* already direct */
+			continue;
+		if (ref->count == 0)
+			continue;
+		err = __resolve_indirect_ref(fs_info, ref, parents);
+		if (err) {
+			if (ret == 0)
+				ret = err;
+			continue;
+		}
+
+		/* we put the first parent into the ref at hand */
+		node = ulist_next(parents, NULL);
+		ref->parent = node ? node->val : 0;
+
+		/* additional parents require new refs being added here */
+		while ((node = ulist_next(parents, node))) {
+			new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
+			if (!new_ref) {
+				ret = -ENOMEM;
+				break;
+			}
+			memcpy(new_ref, ref, sizeof(*ref));
+			new_ref->parent = node->val;
+			list_add(&new_ref->list, &ref->list);
+		}
+		ulist_reinit(parents);
+	}
+
+	ulist_free(parents);
+	return ret;
+}
+
+/*
+ * merge two lists of backrefs and adjust counts accordingly
+ *
+ * mode = 1: merge identical keys, if key is set
+ * mode = 2: merge identical parents
+ */
+static int __merge_refs(struct list_head *head, int mode)
+{
+	struct list_head *pos1;
+
+	list_for_each(pos1, head) {
+		struct list_head *n2;
+		struct list_head *pos2;
+		struct __prelim_ref *ref1;
+
+		ref1 = list_entry(pos1, struct __prelim_ref, list);
+
+		if (mode == 1 && ref1->key.type == 0)
+			continue;
+		for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
+		     pos2 = n2, n2 = pos2->next) {
+			struct __prelim_ref *ref2;
+
+			ref2 = list_entry(pos2, struct __prelim_ref, list);
+
+			if (mode == 1) {
+				if (memcmp(&ref1->key, &ref2->key,
+					   sizeof(ref1->key)) ||
+				    ref1->level != ref2->level ||
+				    ref1->root_id != ref2->root_id)
+					continue;
+				ref1->count += ref2->count;
+			} else {
+				if (ref1->parent != ref2->parent)
+					continue;
+				ref1->count += ref2->count;
+			}
+			list_del(&ref2->list);
+			kfree(ref2);
+		}
+
+	}
+	return 0;
+}
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller or equal that seq to the list
+ */
+static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+			      struct btrfs_key *info_key,
+			      struct list_head *prefs)
+{
+	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+	struct rb_node *n = &head->node.rb_node;
+	int sgn;
+	int ret;
+
+	if (extent_op && extent_op->update_key)
+		btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+
+	while ((n = rb_prev(n))) {
+		struct btrfs_delayed_ref_node *node;
+		node = rb_entry(n, struct btrfs_delayed_ref_node,
+				rb_node);
+		if (node->bytenr != head->node.bytenr)
+			break;
+		WARN_ON(node->is_head);
+
+		if (node->seq > seq)
+			continue;
+
+		switch (node->action) {
+		case BTRFS_ADD_DELAYED_EXTENT:
+		case BTRFS_UPDATE_DELAYED_HEAD:
+			WARN_ON(1);
+			continue;
+		case BTRFS_ADD_DELAYED_REF:
+			sgn = 1;
+			break;
+		case BTRFS_DROP_DELAYED_REF:
+			sgn = -1;
+			break;
+		default:
+			BUG_ON(1);
+		}
+		switch (node->type) {
+		case BTRFS_TREE_BLOCK_REF_KEY: {
+			struct btrfs_delayed_tree_ref *ref;
+
+			ref = btrfs_delayed_node_to_tree_ref(node);
+			ret = __add_prelim_ref(prefs, ref->root, info_key,
+					       ref->level + 1, 0, node->bytenr,
+					       node->ref_mod * sgn);
+			break;
+		}
+		case BTRFS_SHARED_BLOCK_REF_KEY: {
+			struct btrfs_delayed_tree_ref *ref;
+
+			ref = btrfs_delayed_node_to_tree_ref(node);
+			ret = __add_prelim_ref(prefs, ref->root, info_key,
+					       ref->level + 1, ref->parent,
+					       node->bytenr,
+					       node->ref_mod * sgn);
+			break;
+		}
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_delayed_data_ref *ref;
+			struct btrfs_key key;
+
+			ref = btrfs_delayed_node_to_data_ref(node);
+
+			key.objectid = ref->objectid;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = ref->offset;
+			ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
+					       node->bytenr,
+					       node->ref_mod * sgn);
+			break;
+		}
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_delayed_data_ref *ref;
+			struct btrfs_key key;
+
+			ref = btrfs_delayed_node_to_data_ref(node);
+
+			key.objectid = ref->objectid;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = ref->offset;
+			ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+					       ref->parent, node->bytenr,
+					       node->ref_mod * sgn);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		BUG_ON(ret);
+	}
+
+	return 0;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ */
+static int __add_inline_refs(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path, u64 bytenr,
+			     struct btrfs_key *info_key, int *info_level,
+			     struct list_head *prefs)
+{
+	int ret;
+	int slot;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	unsigned long ptr;
+	unsigned long end;
+	struct btrfs_extent_item *ei;
+	u64 flags;
+	u64 item_size;
+
+	/*
+	 * enumerate all inline refs
+	 */
+	leaf = path->nodes[0];
+	slot = path->slots[0] - 1;
+
+	item_size = btrfs_item_size_nr(leaf, slot);
+	BUG_ON(item_size < sizeof(*ei));
+
+	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+	flags = btrfs_extent_flags(leaf, ei);
+
+	ptr = (unsigned long)(ei + 1);
+	end = (unsigned long)ei + item_size;
+
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		struct btrfs_tree_block_info *info;
+		struct btrfs_disk_key disk_key;
+
+		info = (struct btrfs_tree_block_info *)ptr;
+		*info_level = btrfs_tree_block_level(leaf, info);
+		btrfs_tree_block_key(leaf, info, &disk_key);
+		btrfs_disk_key_to_cpu(info_key, &disk_key);
+		ptr += sizeof(struct btrfs_tree_block_info);
+		BUG_ON(ptr > end);
+	} else {
+		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+	}
+
+	while (ptr < end) {
+		struct btrfs_extent_inline_ref *iref;
+		u64 offset;
+		int type;
+
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(leaf, iref);
+		offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+		switch (type) {
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, 0, info_key,
+					       *info_level + 1, offset,
+					       bytenr, 1);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_shared_data_ref *sdref;
+			int count;
+
+			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+			count = btrfs_shared_data_ref_count(leaf, sdref);
+			ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
+					       bytenr, count);
+			break;
+		}
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, offset, info_key,
+					       *info_level + 1, 0, bytenr, 1);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_extent_data_ref *dref;
+			int count;
+			u64 root;
+
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			count = btrfs_extent_data_ref_count(leaf, dref);
+			key.objectid = btrfs_extent_data_ref_objectid(leaf,
+								      dref);
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+			root = btrfs_extent_data_ref_root(leaf, dref);
+			ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
+					       count);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		BUG_ON(ret);
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+
+	return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ */
+static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
+			    struct btrfs_path *path, u64 bytenr,
+			    struct btrfs_key *info_key, int info_level,
+			    struct list_head *prefs)
+{
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	int ret;
+	int slot;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	while (1) {
+		ret = btrfs_next_item(extent_root, path);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = 0;
+			break;
+		}
+
+		slot = path->slots[0];
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.objectid != bytenr)
+			break;
+		if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+			continue;
+		if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+			break;
+
+		switch (key.type) {
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, 0, info_key,
+					       info_level + 1, key.offset,
+					       bytenr, 1);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_shared_data_ref *sdref;
+			int count;
+
+			sdref = btrfs_item_ptr(leaf, slot,
+					       struct btrfs_shared_data_ref);
+			count = btrfs_shared_data_ref_count(leaf, sdref);
+			ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+					       bytenr, count);
+			break;
+		}
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, key.offset, info_key,
+					       info_level + 1, 0, bytenr, 1);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_extent_data_ref *dref;
+			int count;
+			u64 root;
+
+			dref = btrfs_item_ptr(leaf, slot,
+					      struct btrfs_extent_data_ref);
+			count = btrfs_extent_data_ref_count(leaf, dref);
+			key.objectid = btrfs_extent_data_ref_objectid(leaf,
+								      dref);
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+			root = btrfs_extent_data_ref_root(leaf, dref);
+			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+					       bytenr, count);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		BUG_ON(ret);
+	}
+
+	return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, backrefs and delayed
+ * refs) for the given bytenr to the refs list, merges duplicates and resolves
+ * indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info, u64 bytenr,
+			     u64 seq, struct ulist *refs, struct ulist *roots)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct btrfs_key info_key = { 0 };
+	struct btrfs_delayed_ref_root *delayed_refs = NULL;
+	struct btrfs_delayed_ref_head *head = NULL;
+	int info_level = 0;
+	int ret;
+	struct list_head prefs_delayed;
+	struct list_head prefs;
+	struct __prelim_ref *ref;
+
+	INIT_LIST_HEAD(&prefs);
+	INIT_LIST_HEAD(&prefs_delayed);
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * grab both a lock on the path and a lock on the delayed ref head.
+	 * We need both to get a consistent picture of how the refs look
+	 * at a specified point in time
+	 */
+again:
+	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret == 0);
+
+	/*
+	 * look if there are updates for this ref queued and lock the head
+	 */
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (head) {
+		if (!mutex_trylock(&head->mutex)) {
+			atomic_inc(&head->node.refs);
+			spin_unlock(&delayed_refs->lock);
+
+			btrfs_release_path(path);
+
+			/*
+			 * Mutex was contended, block until it's
+			 * released and try again
+			 */
+			mutex_lock(&head->mutex);
+			mutex_unlock(&head->mutex);
+			btrfs_put_delayed_ref(&head->node);
+			goto again;
+		}
+		ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed);
+		if (ret)
+			goto out;
+	}
+	spin_unlock(&delayed_refs->lock);
+
+	if (path->slots[0]) {
+		struct extent_buffer *leaf;
+		int slot;
+
+		leaf = path->nodes[0];
+		slot = path->slots[0] - 1;
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid == bytenr &&
+		    key.type == BTRFS_EXTENT_ITEM_KEY) {
+			ret = __add_inline_refs(fs_info, path, bytenr,
+						&info_key, &info_level, &prefs);
+			if (ret)
+				goto out;
+			ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
+					       info_level, &prefs);
+			if (ret)
+				goto out;
+		}
+	}
+	btrfs_release_path(path);
+
+	/*
+	 * when adding the delayed refs above, the info_key might not have
+	 * been known yet. Go over the list and replace the missing keys
+	 */
+	list_for_each_entry(ref, &prefs_delayed, list) {
+		if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
+			memcpy(&ref->key, &info_key, sizeof(ref->key));
+	}
+	list_splice_init(&prefs_delayed, &prefs);
+
+	ret = __merge_refs(&prefs, 1);
+	if (ret)
+		goto out;
+
+	ret = __resolve_indirect_refs(fs_info, &prefs);
+	if (ret)
+		goto out;
+
+	ret = __merge_refs(&prefs, 2);
+	if (ret)
+		goto out;
+
+	while (!list_empty(&prefs)) {
+		ref = list_first_entry(&prefs, struct __prelim_ref, list);
+		list_del(&ref->list);
+		if (ref->count < 0)
+			WARN_ON(1);
+		if (ref->count && ref->root_id && ref->parent == 0) {
+			/* no parent == root of tree */
+			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+			BUG_ON(ret < 0);
+		}
+		if (ref->count && ref->parent) {
+			ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+			BUG_ON(ret < 0);
+		}
+		kfree(ref);
+	}
+
+out:
+	if (head)
+		mutex_unlock(&head->mutex);
+	btrfs_free_path(path);
+	while (!list_empty(&prefs)) {
+		ref = list_first_entry(&prefs, struct __prelim_ref, list);
+		list_del(&ref->list);
+		kfree(ref);
+	}
+	while (!list_empty(&prefs_delayed)) {
+		ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+				       list);
+		list_del(&ref->list);
+		kfree(ref);
+	}
+
+	return ret;
+}
+
+/*
+ * Finds all leafs with a reference to the specified combination of bytenr and
+ * offset. key_list_head will point to a list of corresponding keys (caller must
+ * free each list element). The leafs will be stored in the leafs ulist, which
+ * must be freed with ulist_free.
+ *
+ * returns 0 on success, <0 on error
+ */
+static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+				struct btrfs_fs_info *fs_info, u64 bytenr,
+				u64 num_bytes, u64 seq, struct ulist **leafs)
+{
+	struct ulist *tmp;
+	int ret;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+	*leafs = ulist_alloc(GFP_NOFS);
+	if (!*leafs) {
+		ulist_free(tmp);
+		return -ENOMEM;
+	}
+
+	ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+	ulist_free(tmp);
+
+	if (ret < 0 && ret != -ENOENT) {
+		ulist_free(*leafs);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and in turn walk the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 bytenr,
+			 u64 num_bytes, u64 seq, struct ulist **roots)
+{
+	struct ulist *tmp;
+	struct ulist_node *node = NULL;
+	int ret;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+	*roots = ulist_alloc(GFP_NOFS);
+	if (!*roots) {
+		ulist_free(tmp);
+		return -ENOMEM;
+	}
+
+	while (1) {
+		ret = find_parent_nodes(trans, fs_info, bytenr, seq,
+					tmp, *roots);
+		if (ret < 0 && ret != -ENOENT) {
+			ulist_free(tmp);
+			ulist_free(*roots);
+			return ret;
+		}
+		node = ulist_next(tmp, node);
+		if (!node)
+			break;
+		bytenr = node->val;
+	}
+
+	ulist_free(tmp);
+	return 0;
+}
+
 
 static int __inode_info(u64 inum, u64 ioff, u8 key_type,
 			struct btrfs_root *fs_root, struct btrfs_path *path,
@@ -181,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
 	if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
 	    found_key->objectid > logical ||
-	    found_key->objectid + found_key->offset <= logical)
+	    found_key->objectid + found_key->offset <= logical) {
+		pr_debug("logical %llu is not within any extent\n",
+			 (unsigned long long)logical);
 		return -ENOENT;
+	}
 
 	eb = path->nodes[0];
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -191,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 	flags = btrfs_extent_flags(eb, ei);
 
+	pr_debug("logical %llu is at position %llu within the extent (%llu "
+		 "EXTENT_ITEM %llu) flags %#llx size %u\n",
+		 (unsigned long long)logical,
+		 (unsigned long long)(logical - found_key->objectid),
+		 (unsigned long long)found_key->objectid,
+		 (unsigned long long)found_key->offset,
+		 (unsigned long long)flags, item_size);
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		return BTRFS_EXTENT_FLAG_TREE_BLOCK;
 	if (flags & BTRFS_EXTENT_FLAG_DATA)
@@ -287,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 	return 0;
 }
 
-static int __data_list_add(struct list_head *head, u64 inum,
-			   u64 extent_data_item_offset, u64 root)
-{
-	struct __data_ref *ref;
-
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
-	if (!ref)
-		return -ENOMEM;
-
-	ref->inum = inum;
-	ref->extent_data_item_offset = extent_data_item_offset;
-	ref->root = root;
-	list_add_tail(&ref->list, head);
-
-	return 0;
-}
-
-static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
-			      struct btrfs_extent_data_ref *dref)
-{
-	return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
-			       btrfs_extent_data_ref_offset(eb, dref),
-			       btrfs_extent_data_ref_root(eb, dref));
-}
-
-static int __shared_list_add(struct list_head *head, u64 disk_byte)
-{
-	struct __shared_ref *ref;
-
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
-	if (!ref)
-		return -ENOMEM;
-
-	ref->disk_byte = disk_byte;
-	list_add_tail(&ref->list, head);
-
-	return 0;
-}
-
-static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
-					   u64 logical, u64 inum,
-					   u64 extent_data_item_offset,
-					   u64 extent_offset,
-					   struct btrfs_path *path,
-					   struct list_head *data_refs,
-					   iterate_extent_inodes_t *iterate,
-					   void *ctx)
-{
-	u64 ref_root;
-	u32 item_size;
-	struct btrfs_key key;
-	struct extent_buffer *eb;
-	struct btrfs_extent_item *ei;
-	struct btrfs_extent_inline_ref *eiref;
-	struct __data_ref *ref;
-	int ret;
-	int type;
-	int last;
-	unsigned long ptr = 0;
-
-	WARN_ON(!list_empty(data_refs));
-	ret = extent_from_logical(fs_info, logical, path, &key);
-	if (ret & BTRFS_EXTENT_FLAG_DATA)
-		ret = -EIO;
-	if (ret < 0)
-		goto out;
-
-	eb = path->nodes[0];
-	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-	item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-	ret = 0;
-	ref_root = 0;
-	/*
-	 * as done in iterate_extent_inodes, we first build a list of refs to
-	 * iterate, then free the path and then iterate them to avoid deadlocks.
-	 */
-	do {
-		last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-					       &eiref, &type);
-		if (last < 0) {
-			ret = last;
-			goto out;
-		}
-		if (type == BTRFS_TREE_BLOCK_REF_KEY ||
-		    type == BTRFS_SHARED_BLOCK_REF_KEY) {
-			ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
-			ret = __data_list_add(data_refs, inum,
-					      extent_data_item_offset,
-					      ref_root);
-		}
-	} while (!ret && !last);
-
-	btrfs_release_path(path);
-
-	if (ref_root == 0) {
-		printk(KERN_ERR "btrfs: failed to find tree block ref "
-		       "for shared data backref %llu\n", logical);
-		WARN_ON(1);
-		ret = -EIO;
-	}
-
-out:
-	while (!list_empty(data_refs)) {
-		ref = list_first_entry(data_refs, struct __data_ref, list);
-		list_del(&ref->list);
-		if (!ret)
-			ret = iterate(ref->inum, extent_offset +
-				      ref->extent_data_item_offset,
-				      ref->root, ctx);
-		kfree(ref);
-	}
-
-	return ret;
-}
-
-static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
-				    u64 logical, u64 orig_extent_item_objectid,
-				    u64 extent_offset, struct btrfs_path *path,
-				    struct list_head *data_refs,
-				    iterate_extent_inodes_t *iterate,
-				    void *ctx)
+static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path, u64 logical,
+			     u64 orig_extent_item_objectid,
+			     u64 extent_item_pos, u64 root,
+			     iterate_extent_inodes_t *iterate, void *ctx)
 {
 	u64 disk_byte;
 	struct btrfs_key key;
@@ -416,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
 	struct extent_buffer *eb;
 	int slot;
 	int nritems;
-	int ret;
-	int found = 0;
+	int ret = 0;
+	int extent_type;
+	u64 data_offset;
+	u64 data_len;
 
 	eb = read_tree_block(fs_info->tree_root, logical,
 			     fs_info->tree_root->leafsize, 0);
@@ -435,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
 		if (key.type != BTRFS_EXTENT_DATA_KEY)
 			continue;
 		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
-		if (!fi) {
-			free_extent_buffer(eb);
-			return -EIO;
-		}
+		extent_type = btrfs_file_extent_type(eb, fi);
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
 		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
-		if (disk_byte != orig_extent_item_objectid) {
-			if (found)
-				break;
-			else
-				continue;
-		}
-		++found;
-		ret = __iter_shared_inline_ref_inodes(fs_info, logical,
-						      key.objectid,
-						      key.offset,
-						      extent_offset, path,
-						      data_refs,
-						      iterate, ctx);
-		if (ret)
-			break;
-	}
+		if (disk_byte != orig_extent_item_objectid)
+			continue;
 
-	if (!found) {
-		printk(KERN_ERR "btrfs: failed to follow shared data backref "
-		       "to parent %llu\n", logical);
-		WARN_ON(1);
-		ret = -EIO;
+		data_offset = btrfs_file_extent_offset(eb, fi);
+		data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+		if (extent_item_pos < data_offset ||
+		    extent_item_pos >= data_offset + data_len)
+			continue;
+
+		pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
+			 "root %llu\n", orig_extent_item_objectid,
+			 key.objectid, key.offset, root);
+		ret = iterate(key.objectid,
+			      key.offset + (extent_item_pos - data_offset),
+			      root, ctx);
+		if (ret) {
+			pr_debug("stopping iteration because ret=%d\n", ret);
+			break;
+		}
 	}
 
 	free_extent_buffer(eb);
+
 	return ret;
 }
 
 /*
  * calls iterate() for every inode that references the extent identified by
- * the given parameters. will use the path given as a parameter and return it
- * released.
+ * the given parameters.
  * when the iterator function returns a non-zero value, iteration stops.
+ * path is guaranteed to be in released state when iterate() is called.
  */
 int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 			  struct btrfs_path *path,
-			  u64 extent_item_objectid,
-			  u64 extent_offset,
+			  u64 extent_item_objectid, u64 extent_item_pos,
 			  iterate_extent_inodes_t *iterate, void *ctx)
 {
-	unsigned long ptr = 0;
-	int last;
 	int ret;
-	int type;
-	u64 logical;
-	u32 item_size;
-	struct btrfs_extent_inline_ref *eiref;
-	struct btrfs_extent_data_ref *dref;
-	struct extent_buffer *eb;
-	struct btrfs_extent_item *ei;
-	struct btrfs_key key;
 	struct list_head data_refs = LIST_HEAD_INIT(data_refs);
 	struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
-	struct __data_ref *ref_d;
-	struct __shared_ref *ref_s;
-
-	eb = path->nodes[0];
-	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-	item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-	/* first we iterate the inline refs, ... */
-	do {
-		last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-					       &eiref, &type);
-		if (last == -ENOENT) {
-			ret = 0;
-			break;
-		}
-		if (last < 0) {
-			ret = last;
-			break;
-		}
-
-		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
-			dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
-			ret = __data_list_add_eb(&data_refs, eb, dref);
-		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
-			logical = btrfs_extent_inline_ref_offset(eb, eiref);
-			ret = __shared_list_add(&shared_refs, logical);
-		}
-	} while (!ret && !last);
+	struct btrfs_trans_handle *trans;
+	struct ulist *refs;
+	struct ulist *roots;
+	struct ulist_node *ref_node = NULL;
+	struct ulist_node *root_node = NULL;
+	struct seq_list seq_elem;
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	trans = btrfs_join_transaction(fs_info->extent_root);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	pr_debug("resolving all inodes for extent %llu\n",
+		 extent_item_objectid);
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	btrfs_get_delayed_seq(delayed_refs, &seq_elem);
+	spin_unlock(&delayed_refs->lock);
+
+	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+				   extent_item_pos, seq_elem.seq,
+				   &refs);
 
-	/* ... then we proceed to in-tree references and ... */
-	while (!ret) {
-		++path->slots[0];
-		if (path->slots[0] > btrfs_header_nritems(eb)) {
-			ret = btrfs_next_leaf(fs_info->extent_root, path);
-			if (ret) {
-				if (ret == 1)
-					ret = 0; /* we're done */
-				break;
-			}
-			eb = path->nodes[0];
-		}
-		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
-		if (key.objectid != extent_item_objectid)
+	if (ret)
+		goto out;
+
+	while (!ret && (ref_node = ulist_next(refs, ref_node))) {
+		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
+					   seq_elem.seq, &roots);
+		if (ret)
 			break;
-		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
-			dref = btrfs_item_ptr(eb, path->slots[0],
-					      struct btrfs_extent_data_ref);
-			ret = __data_list_add_eb(&data_refs, eb, dref);
-		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
-			ret = __shared_list_add(&shared_refs, key.offset);
+		while (!ret && (root_node = ulist_next(roots, root_node))) {
+			pr_debug("root %llu references leaf %llu\n",
+				 root_node->val, ref_node->val);
+			ret = iterate_leaf_refs(fs_info, path, ref_node->val,
+						extent_item_objectid,
+						extent_item_pos, root_node->val,
+						iterate, ctx);
 		}
 	}
 
-	btrfs_release_path(path);
-
-	/*
-	 * ... only at the very end we can process the refs we found. this is
-	 * because the iterator function we call is allowed to make tree lookups
-	 * and we have to avoid deadlocks. additionally, we need more tree
-	 * lookups ourselves for shared data refs.
-	 */
-	while (!list_empty(&data_refs)) {
-		ref_d = list_first_entry(&data_refs, struct __data_ref, list);
-		list_del(&ref_d->list);
-		if (!ret)
-			ret = iterate(ref_d->inum, extent_offset +
-				      ref_d->extent_data_item_offset,
-				      ref_d->root, ctx);
-		kfree(ref_d);
-	}
-
-	while (!list_empty(&shared_refs)) {
-		ref_s = list_first_entry(&shared_refs, struct __shared_ref,
-					 list);
-		list_del(&ref_s->list);
-		if (!ret)
-			ret = __iter_shared_inline_ref(fs_info,
-						       ref_s->disk_byte,
-						       extent_item_objectid,
-						       extent_offset, path,
-						       &data_refs,
-						       iterate, ctx);
-		kfree(ref_s);
-	}
-
+	ulist_free(refs);
+	ulist_free(roots);
+out:
+	btrfs_put_delayed_seq(delayed_refs, &seq_elem);
+	btrfs_end_transaction(trans, fs_info->extent_root);
 	return ret;
 }
 
@@ -586,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
 	int ret;
-	u64 offset;
+	u64 extent_item_pos;
 	struct btrfs_key found_key;
 
 	ret = extent_from_logical(fs_info, logical, path,
 				  &found_key);
+	btrfs_release_path(path);
 	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		ret = -EINVAL;
 	if (ret < 0)
 		return ret;
 
-	offset = logical - found_key.objectid;
+	extent_item_pos = logical - found_key.objectid;
 	ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
-				    offset, iterate, ctx);
+				    extent_item_pos, iterate, ctx);
 
 	return ret;
 }
@@ -643,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
 		name_len = btrfs_inode_ref_name_len(eb, iref);
 		/* path must be released before calling iterate()! */
+		pr_debug("following ref at offset %u for inode %llu in "
+			 "tree %llu\n", cur,
+			 (unsigned long long)found_key.objectid,
+			 (unsigned long long)fs_root->objectid);
 		ret = iterate(parent, iref, eb, ctx);
 		if (ret) {
 			free_extent_buffer(eb);
@@ -683,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 		return PTR_ERR(fspath);
 
 	if (fspath > fspath_min) {
+		pr_debug("path resolved: %s\n", fspath);
 		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
 		++ipath->fspath->elem_cnt;
 		ipath->fspath->bytes_left = fspath - fspath_min;
 	} else {
+		pr_debug("missed path, not enough space. missing bytes: %lu, "
+			 "constructed so far: %s\n",
+			 (unsigned long)(fspath_min - fspath), fspath_min);
 		++ipath->fspath->elem_missed;
 		ipath->fspath->bytes_missing += fspath_min - fspath;
 		ipath->fspath->bytes_left = 0;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 92618837cb8f..d00dfa9ca934 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -20,6 +20,7 @@
 #define __BTRFS_BACKREF__
 
 #include "ioctl.h"
+#include "ulist.h"
 
 struct inode_fs_paths {
 	struct btrfs_path *btrfs_path;
@@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 bytenr,
+			 u64 num_bytes, u64 seq, struct ulist **roots);
+
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 				  struct btrfs_path *path);
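
The iterate_extent_inodes_t typedef itself is not shown in these hunks, but the call sites in backref.c above pass (inode number, file offset, root id, opaque context) and stop iterating on a non-zero return. A sketch of a matching callback (illustrative, not from this commit):

	/* Sketch of an iterate_extent_inodes_t callback; the parameter order
	 * matches the iterate() call sites in backref.c above. */
	static int example_inode_cb(u64 inum, u64 offset, u64 root, void *ctx)
	{
		pr_debug("extent maps to inode %llu, offset %llu, root %llu\n",
			 (unsigned long long)inum, (unsigned long long)offset,
			 (unsigned long long)root);
		return 0;	/* return non-zero to stop the iteration */
	}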
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 634608d2a6d0..9b9b15fd5204 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -51,6 +51,9 @@ struct btrfs_inode {
 	/* held while logging the inode in tree-log.c */
 	struct mutex log_mutex;
 
+	/* held while doing delalloc reservations */
+	struct mutex delalloc_mutex;
+
 	/* used to order data wrt metadata */
 	struct btrfs_ordered_inode_tree ordered_tree;
 
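
The comment documents intent only; the actual call sites live elsewhere in this series. As a minimal sketch (assumed usage, not shown in this hunk), the new mutex serializes delalloc reservations per inode roughly like:

	/* Assumed usage -- illustrative only. */
	struct btrfs_inode *bi = BTRFS_I(inode);

	mutex_lock(&bi->delalloc_mutex);
	/* ... reserve or release delalloc metadata for this inode ... */
	mutex_unlock(&bi->delalloc_mutex);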
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644
index 000000000000..ad0b3ba735b7
--- /dev/null
+++ b/fs/btrfs/check-integrity.c
@@ -0,0 +1,3068 @@
1/*
2 * Copyright (C) STRATO AG 2011. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19/*
20 * This module can be used to catch cases when the btrfs kernel
21 * code executes write requests to the disk that bring the file
22 * system in an inconsistent state. In such a state, a power-loss
23 * or kernel panic event would cause that the data on disk is
24 * lost or at least damaged.
25 *
26 * Code is added that examines all block write requests during
27 * runtime (including writes of the super block). Three rules
28 * are verified and an error is printed on violation of the
29 * rules:
30 * 1. It is not allowed to write a disk block which is
31 * currently referenced by the super block (either directly
32 * or indirectly).
33 * 2. When a super block is written, it is verified that all
34 * referenced (directly or indirectly) blocks fulfill the
35 * following requirements:
36 * 2a. All referenced blocks have either been present when
37 * the file system was mounted, (i.e., they have been
38 * referenced by the super block) or they have been
39 * written since then and the write completion callback
40 * was called and a FLUSH request to the device where
41 * these blocks are located was received and completed.
42 * 2b. All referenced blocks need to have a generation
43 * number which is equal to the parent's number.
44 *
45 * One issue that was found using this module was that the log
46 * tree on disk became temporarily corrupted because disk blocks
47 * that had been in use for the log tree had been freed and
48 * reused too early, while being referenced by the written super
49 * block.
50 *
51 * The search term in the kernel log that can be used to filter
52 * on the existence of detected integrity issues is
53 * "btrfs: attempt".
54 *
55 * The integrity check is enabled via mount options. These
56 * mount options are only supported if the integrity check
57 * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
58 *
59 * Example #1, apply integrity checks to all metadata:
60 * mount /dev/sdb1 /mnt -o check_int
61 *
62 * Example #2, apply integrity checks to all metadata and
63 * to data extents:
64 * mount /dev/sdb1 /mnt -o check_int_data
65 *
66 * Example #3, apply integrity checks to all metadata and dump
67 * the tree that the super block references to kernel messages
68 * each time after a super block was written:
69 * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
70 *
71 * If the integrity check tool is included and activated in
72 * the mount options, plenty of kernel memory is used, and
73 * plenty of additional CPU cycles are spent. Enabling this
74 * functionality is not intended for normal use. In most
75 * cases, unless you are a btrfs developer who needs to verify
76 * the integrity of (super)-block write requests, do not
77 * enable the config option BTRFS_FS_CHECK_INTEGRITY to
78 * include and compile the integrity check tool.
79 */
80
81#include <linux/sched.h>
82#include <linux/slab.h>
83#include <linux/buffer_head.h>
84#include <linux/mutex.h>
85#include <linux/crc32c.h>
86#include <linux/genhd.h>
87#include <linux/blkdev.h>
88#include "ctree.h"
89#include "disk-io.h"
90#include "transaction.h"
91#include "extent_io.h"
92#include "disk-io.h"
93#include "volumes.h"
94#include "print-tree.h"
95#include "locking.h"
96#include "check-integrity.h"
97
98#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
99#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
100#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
101#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
102#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
103#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
104#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
105#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
106 * excluding " [...]" */
107#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
108
109#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
110
111/*
112 * The definition of the bitmask fields for the print_mask.
113 * They are specified with the mount option check_integrity_print_mask.
114 */
115#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
116#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
117#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
118#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
119#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
120#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
121#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
122#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
123#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
124#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
125#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
126#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
127#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
128
129struct btrfsic_dev_state;
130struct btrfsic_state;
131
132struct btrfsic_block {
133 u32 magic_num; /* only used for debug purposes */
134 unsigned int is_metadata:1; /* if it is meta-data, not data-data */
135 unsigned int is_superblock:1; /* if it is one of the superblocks */
136 unsigned int is_iodone:1; /* if is done by lower subsystem */
137 unsigned int iodone_w_error:1; /* error was indicated to endio */
138 unsigned int never_written:1; /* block was added because it was
139 * referenced, not because it was
140 * written */
141 unsigned int mirror_num:2; /* large enough to hold
142 * BTRFS_SUPER_MIRROR_MAX */
143 struct btrfsic_dev_state *dev_state;
144 u64 dev_bytenr; /* key, physical byte num on disk */
145 u64 logical_bytenr; /* logical byte num on disk */
146 u64 generation;
147 struct btrfs_disk_key disk_key; /* extra info to print in case of
148 * issues, will not always be correct */
149 struct list_head collision_resolving_node; /* list node */
150 struct list_head all_blocks_node; /* list node */
151
152 /* the following two lists contain block_link items */
153 struct list_head ref_to_list; /* list */
154 struct list_head ref_from_list; /* list */
155 struct btrfsic_block *next_in_same_bio;
156 void *orig_bio_bh_private;
157 union {
158 bio_end_io_t *bio;
159 bh_end_io_t *bh;
160 } orig_bio_bh_end_io;
161 int submit_bio_bh_rw;
162 u64 flush_gen; /* only valid if !never_written */
163};
164
165/*
166 * Elements of this type are allocated dynamically and required because
167 * each block object can refer to and can be ref from multiple blocks.
168 * The key to lookup them in the hashtable is the dev_bytenr of
169 * the block ref to plus the one from the block refered from.
170 * The fact that they are searchable via a hashtable and that a
171 * ref_cnt is maintained is not required for the btrfs integrity
172 * check algorithm itself, it is only used to make the output more
173 * beautiful in case that an error is detected (an error is defined
174 * as a write operation to a block while that block is still referenced).
175 */
176struct btrfsic_block_link {
177 u32 magic_num; /* only used for debug purposes */
178 u32 ref_cnt;
179 struct list_head node_ref_to; /* list node */
180 struct list_head node_ref_from; /* list node */
181 struct list_head collision_resolving_node; /* list node */
182 struct btrfsic_block *block_ref_to;
183 struct btrfsic_block *block_ref_from;
184 u64 parent_generation;
185};
186
187struct btrfsic_dev_state {
188 u32 magic_num; /* only used for debug purposes */
189 struct block_device *bdev;
190 struct btrfsic_state *state;
191 struct list_head collision_resolving_node; /* list node */
192 struct btrfsic_block dummy_block_for_bio_bh_flush;
193 u64 last_flush_gen;
194 char name[BDEVNAME_SIZE];
195};
196
197struct btrfsic_block_hashtable {
198 struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
199};
200
201struct btrfsic_block_link_hashtable {
202 struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
203};
204
205struct btrfsic_dev_state_hashtable {
206 struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
207};
208
209struct btrfsic_block_data_ctx {
210 u64 start; /* virtual bytenr */
211 u64 dev_bytenr; /* physical bytenr on device */
212 u32 len;
213 struct btrfsic_dev_state *dev;
214 char *data;
215 struct buffer_head *bh; /* do not use if set to NULL */
216};
217
218/* This structure is used to implement recursion without occupying
219 * any stack space, refer to btrfsic_process_metablock() */
220struct btrfsic_stack_frame {
221 u32 magic;
222 u32 nr;
223 int error;
224 int i;
225 int limit_nesting;
226 int num_copies;
227 int mirror_num;
228 struct btrfsic_block *block;
229 struct btrfsic_block_data_ctx *block_ctx;
230 struct btrfsic_block *next_block;
231 struct btrfsic_block_data_ctx next_block_ctx;
232 struct btrfs_header *hdr;
233 struct btrfsic_stack_frame *prev;
234};
235
236/* Some state per mounted filesystem */
237struct btrfsic_state {
238 u32 print_mask;
239 int include_extent_data;
240 int csum_size;
241 struct list_head all_blocks_list;
242 struct btrfsic_block_hashtable block_hashtable;
243 struct btrfsic_block_link_hashtable block_link_hashtable;
244 struct btrfs_root *root;
245 u64 max_superblock_generation;
246 struct btrfsic_block *latest_superblock;
247};
248
249static void btrfsic_block_init(struct btrfsic_block *b);
250static struct btrfsic_block *btrfsic_block_alloc(void);
251static void btrfsic_block_free(struct btrfsic_block *b);
252static void btrfsic_block_link_init(struct btrfsic_block_link *n);
253static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
254static void btrfsic_block_link_free(struct btrfsic_block_link *n);
255static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
256static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
257static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
258static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
259static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
260 struct btrfsic_block_hashtable *h);
261static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
262static struct btrfsic_block *btrfsic_block_hashtable_lookup(
263 struct block_device *bdev,
264 u64 dev_bytenr,
265 struct btrfsic_block_hashtable *h);
266static void btrfsic_block_link_hashtable_init(
267 struct btrfsic_block_link_hashtable *h);
268static void btrfsic_block_link_hashtable_add(
269 struct btrfsic_block_link *l,
270 struct btrfsic_block_link_hashtable *h);
271static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
272static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
273 struct block_device *bdev_ref_to,
274 u64 dev_bytenr_ref_to,
275 struct block_device *bdev_ref_from,
276 u64 dev_bytenr_ref_from,
277 struct btrfsic_block_link_hashtable *h);
278static void btrfsic_dev_state_hashtable_init(
279 struct btrfsic_dev_state_hashtable *h);
280static void btrfsic_dev_state_hashtable_add(
281 struct btrfsic_dev_state *ds,
282 struct btrfsic_dev_state_hashtable *h);
283static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
284static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
285 struct block_device *bdev,
286 struct btrfsic_dev_state_hashtable *h);
287static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
288static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
289static int btrfsic_process_superblock(struct btrfsic_state *state,
290 struct btrfs_fs_devices *fs_devices);
291static int btrfsic_process_metablock(struct btrfsic_state *state,
292 struct btrfsic_block *block,
293 struct btrfsic_block_data_ctx *block_ctx,
294 struct btrfs_header *hdr,
295 int limit_nesting, int force_iodone_flag);
296static int btrfsic_create_link_to_next_block(
297 struct btrfsic_state *state,
298 struct btrfsic_block *block,
299 struct btrfsic_block_data_ctx
300 *block_ctx, u64 next_bytenr,
301 int limit_nesting,
302 struct btrfsic_block_data_ctx *next_block_ctx,
303 struct btrfsic_block **next_blockp,
304 int force_iodone_flag,
305 int *num_copiesp, int *mirror_nump,
306 struct btrfs_disk_key *disk_key,
307 u64 parent_generation);
308static int btrfsic_handle_extent_data(struct btrfsic_state *state,
309 struct btrfsic_block *block,
310 struct btrfsic_block_data_ctx *block_ctx,
311 u32 item_offset, int force_iodone_flag);
312static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
313 struct btrfsic_block_data_ctx *block_ctx_out,
314 int mirror_num);
315static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
316 u32 len, struct block_device *bdev,
317 struct btrfsic_block_data_ctx *block_ctx_out);
318static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
319static int btrfsic_read_block(struct btrfsic_state *state,
320 struct btrfsic_block_data_ctx *block_ctx);
321static void btrfsic_dump_database(struct btrfsic_state *state);
322static int btrfsic_test_for_metadata(struct btrfsic_state *state,
323 const u8 *data, unsigned int size);
324static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
325 u64 dev_bytenr, u8 *mapped_data,
326 unsigned int len, struct bio *bio,
327 int *bio_is_patched,
328 struct buffer_head *bh,
329 int submit_bio_bh_rw);
330static int btrfsic_process_written_superblock(
331 struct btrfsic_state *state,
332 struct btrfsic_block *const block,
333 struct btrfs_super_block *const super_hdr);
334static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
335static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
336static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
337 const struct btrfsic_block *block,
338 int recursion_level);
339static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
340 struct btrfsic_block *const block,
341 int recursion_level);
342static void btrfsic_print_add_link(const struct btrfsic_state *state,
343 const struct btrfsic_block_link *l);
344static void btrfsic_print_rem_link(const struct btrfsic_state *state,
345 const struct btrfsic_block_link *l);
346static char btrfsic_get_block_type(const struct btrfsic_state *state,
347 const struct btrfsic_block *block);
348static void btrfsic_dump_tree(const struct btrfsic_state *state);
349static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
350 const struct btrfsic_block *block,
351 int indent_level);
352static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
353 struct btrfsic_state *state,
354 struct btrfsic_block_data_ctx *next_block_ctx,
355 struct btrfsic_block *next_block,
356 struct btrfsic_block *from_block,
357 u64 parent_generation);
358static struct btrfsic_block *btrfsic_block_lookup_or_add(
359 struct btrfsic_state *state,
360 struct btrfsic_block_data_ctx *block_ctx,
361 const char *additional_string,
362 int is_metadata,
363 int is_iodone,
364 int never_written,
365 int mirror_num,
366 int *was_created);
367static int btrfsic_process_superblock_dev_mirror(
368 struct btrfsic_state *state,
369 struct btrfsic_dev_state *dev_state,
370 struct btrfs_device *device,
371 int superblock_mirror_num,
372 struct btrfsic_dev_state **selected_dev_state,
373 struct btrfs_super_block *selected_super);
374static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
375 struct block_device *bdev);
376static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
377 u64 bytenr,
378 struct btrfsic_dev_state *dev_state,
379 u64 dev_bytenr, char *data);
380
381static struct mutex btrfsic_mutex;
382static int btrfsic_is_initialized;
383static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
384
385
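/*
 * Initialize a block node to a known-clean state: no device, zeroed
 * byte offsets, unknown generation, and all list heads pointing to
 * themselves.
 */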
386static void btrfsic_block_init(struct btrfsic_block *b)
387{
388 b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
389 b->dev_state = NULL;
390 b->dev_bytenr = 0;
391 b->logical_bytenr = 0;
392 b->generation = BTRFSIC_GENERATION_UNKNOWN;
393 b->disk_key.objectid = 0;
394 b->disk_key.type = 0;
395 b->disk_key.offset = 0;
396 b->is_metadata = 0;
397 b->is_superblock = 0;
398 b->is_iodone = 0;
399 b->iodone_w_error = 0;
400 b->never_written = 0;
401 b->mirror_num = 0;
402 b->next_in_same_bio = NULL;
403 b->orig_bio_bh_private = NULL;
404 b->orig_bio_bh_end_io.bio = NULL;
405 INIT_LIST_HEAD(&b->collision_resolving_node);
406 INIT_LIST_HEAD(&b->all_blocks_node);
407 INIT_LIST_HEAD(&b->ref_to_list);
408 INIT_LIST_HEAD(&b->ref_from_list);
409 b->submit_bio_bh_rw = 0;
410 b->flush_gen = 0;
411}
412
413static struct btrfsic_block *btrfsic_block_alloc(void)
414{
415 struct btrfsic_block *b;
416
417 b = kzalloc(sizeof(*b), GFP_NOFS);
418 if (NULL != b)
419 btrfsic_block_init(b);
420
421 return b;
422}
423
424static void btrfsic_block_free(struct btrfsic_block *b)
425{
426 BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
427 kfree(b);
428}
429
430static void btrfsic_block_link_init(struct btrfsic_block_link *l)
431{
432 l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
433 l->ref_cnt = 1;
434 INIT_LIST_HEAD(&l->node_ref_to);
435 INIT_LIST_HEAD(&l->node_ref_from);
436 INIT_LIST_HEAD(&l->collision_resolving_node);
437 l->block_ref_to = NULL;
438 l->block_ref_from = NULL;
439}
440
441static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
442{
443 struct btrfsic_block_link *l;
444
445 l = kzalloc(sizeof(*l), GFP_NOFS);
446 if (NULL != l)
447 btrfsic_block_link_init(l);
448
449 return l;
450}
451
452static void btrfsic_block_link_free(struct btrfsic_block_link *l)
453{
454 BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
455 kfree(l);
456}
457
458static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
459{
460 ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
461 ds->bdev = NULL;
462 ds->state = NULL;
463 ds->name[0] = '\0';
464 INIT_LIST_HEAD(&ds->collision_resolving_node);
465 ds->last_flush_gen = 0;
466 btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
467 ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
468 ds->dummy_block_for_bio_bh_flush.dev_state = ds;
469}
470
471static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
472{
473 struct btrfsic_dev_state *ds;
474
475 ds = kzalloc(sizeof(*ds), GFP_NOFS);
476 if (NULL != ds)
477 btrfsic_dev_state_init(ds);
478
479 return ds;
480}
481
482static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
483{
484 BUG_ON(!(NULL == ds ||
485 BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
486 kfree(ds);
487}
488
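/*
 * Three chained hash tables back the checker's state: blocks and block
 * links are keyed by (block_device, device byte offset), device states
 * by the block_device alone. Collisions are resolved by per-bucket
 * lists threaded through the collision_resolving_node members.
 */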
489static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
490{
491 int i;
492
493 for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
494 INIT_LIST_HEAD(h->table + i);
495}
496
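/*
 * The bucket index mixes the device byte offset (shifted to drop the
 * low bits) with the block_device pointer value, masked to the table
 * size, which is assumed to be a power of two.
 */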
497static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
498 struct btrfsic_block_hashtable *h)
499{
500 const unsigned int hashval =
501 (((unsigned int)(b->dev_bytenr >> 16)) ^
502 ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
503 (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
504
505 list_add(&b->collision_resolving_node, h->table + hashval);
506}
507
508static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
509{
510 list_del(&b->collision_resolving_node);
511}
512
513static struct btrfsic_block *btrfsic_block_hashtable_lookup(
514 struct block_device *bdev,
515 u64 dev_bytenr,
516 struct btrfsic_block_hashtable *h)
517{
518 const unsigned int hashval =
519 (((unsigned int)(dev_bytenr >> 16)) ^
520 ((unsigned int)((uintptr_t)bdev))) &
521 (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
522 struct list_head *elem;
523
524 list_for_each(elem, h->table + hashval) {
525 struct btrfsic_block *const b =
526 list_entry(elem, struct btrfsic_block,
527 collision_resolving_node);
528
529 if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
530 return b;
531 }
532
533 return NULL;
534}
535
536static void btrfsic_block_link_hashtable_init(
537 struct btrfsic_block_link_hashtable *h)
538{
539 int i;
540
541 for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
542 INIT_LIST_HEAD(h->table + i);
543}
544
545static void btrfsic_block_link_hashtable_add(
546 struct btrfsic_block_link *l,
547 struct btrfsic_block_link_hashtable *h)
548{
549 const unsigned int hashval =
550 (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
551 ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
552 ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
553 ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
554 & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
555
556 BUG_ON(NULL == l->block_ref_to);
557 BUG_ON(NULL == l->block_ref_from);
558 list_add(&l->collision_resolving_node, h->table + hashval);
559}
560
561static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
562{
563 list_del(&l->collision_resolving_node);
564}
565
566static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
567 struct block_device *bdev_ref_to,
568 u64 dev_bytenr_ref_to,
569 struct block_device *bdev_ref_from,
570 u64 dev_bytenr_ref_from,
571 struct btrfsic_block_link_hashtable *h)
572{
573 const unsigned int hashval =
574 (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
575 ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
576 ((unsigned int)((uintptr_t)bdev_ref_to)) ^
577 ((unsigned int)((uintptr_t)bdev_ref_from))) &
578 (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
579 struct list_head *elem;
580
581 list_for_each(elem, h->table + hashval) {
582 struct btrfsic_block_link *const l =
583 list_entry(elem, struct btrfsic_block_link,
584 collision_resolving_node);
585
586 BUG_ON(NULL == l->block_ref_to);
587 BUG_ON(NULL == l->block_ref_from);
588 if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
589 l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
590 l->block_ref_from->dev_state->bdev == bdev_ref_from &&
591 l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
592 return l;
593 }
594
595 return NULL;
596}
597
598static void btrfsic_dev_state_hashtable_init(
599 struct btrfsic_dev_state_hashtable *h)
600{
601 int i;
602
603 for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
604 INIT_LIST_HEAD(h->table + i);
605}
606
607static void btrfsic_dev_state_hashtable_add(
608 struct btrfsic_dev_state *ds,
609 struct btrfsic_dev_state_hashtable *h)
610{
611 const unsigned int hashval =
612 (((unsigned int)((uintptr_t)ds->bdev)) &
613 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
614
615 list_add(&ds->collision_resolving_node, h->table + hashval);
616}
617
618static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
619{
620 list_del(&ds->collision_resolving_node);
621}
622
623static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
624 struct block_device *bdev,
625 struct btrfsic_dev_state_hashtable *h)
626{
627 const unsigned int hashval =
628 (((unsigned int)((uintptr_t)bdev)) &
629 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
630 struct list_head *elem;
631
632 list_for_each(elem, h->table + hashval) {
633 struct btrfsic_dev_state *const ds =
634 list_entry(elem, struct btrfsic_dev_state,
635 collision_resolving_node);
636
637 if (ds->bdev == bdev)
638 return ds;
639 }
640
641 return NULL;
642}
643
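/*
 * Read the superblock of every device mirror, remember the one with the
 * highest generation, and then walk the root, chunk and log trees it
 * points to (one pass per tree, one iteration per mirror) so that the
 * in-memory block graph matches the on-disk state.
 */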
644static int btrfsic_process_superblock(struct btrfsic_state *state,
645 struct btrfs_fs_devices *fs_devices)
646{
647 int ret;
648 struct btrfs_super_block *selected_super;
649 struct list_head *dev_head = &fs_devices->devices;
650 struct btrfs_device *device;
651 struct btrfsic_dev_state *selected_dev_state = NULL;
652 int pass;
653
654 BUG_ON(NULL == state);
655 selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
656 if (NULL == selected_super) {
657 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
658 return -1;
659 }
660
661 list_for_each_entry(device, dev_head, dev_list) {
662 int i;
663 struct btrfsic_dev_state *dev_state;
664
665 if (!device->bdev || !device->name)
666 continue;
667
668 dev_state = btrfsic_dev_state_lookup(device->bdev);
669 BUG_ON(NULL == dev_state);
670 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
671 ret = btrfsic_process_superblock_dev_mirror(
672 state, dev_state, device, i,
673 &selected_dev_state, selected_super);
674 if (0 != ret && 0 == i) {
675 kfree(selected_super);
676 return ret;
677 }
678 }
679 }
680
681 if (NULL == state->latest_superblock) {
682 printk(KERN_INFO "btrfsic: no superblock found!\n");
683 kfree(selected_super);
684 return -1;
685 }
686
687 state->csum_size = btrfs_super_csum_size(selected_super);
688
689 for (pass = 0; pass < 3; pass++) {
690 int num_copies;
691 int mirror_num;
692 u64 next_bytenr;
693
694 switch (pass) {
695 case 0:
696 next_bytenr = btrfs_super_root(selected_super);
697 if (state->print_mask &
698 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
699 printk(KERN_INFO "root@%llu\n",
700 (unsigned long long)next_bytenr);
701 break;
702 case 1:
703 next_bytenr = btrfs_super_chunk_root(selected_super);
704 if (state->print_mask &
705 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
706 printk(KERN_INFO "chunk@%llu\n",
707 (unsigned long long)next_bytenr);
708 break;
709 case 2:
710 next_bytenr = btrfs_super_log_root(selected_super);
711 if (0 == next_bytenr)
712 continue;
713 if (state->print_mask &
714 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
715 printk(KERN_INFO "log@%llu\n",
716 (unsigned long long)next_bytenr);
717 break;
718 }
719
720 num_copies =
721 btrfs_num_copies(&state->root->fs_info->mapping_tree,
722 next_bytenr, PAGE_SIZE);
723 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
724 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
725 (unsigned long long)next_bytenr, num_copies);
726
727 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
728 struct btrfsic_block *next_block;
729 struct btrfsic_block_data_ctx tmp_next_block_ctx;
730 struct btrfsic_block_link *l;
731 struct btrfs_header *hdr;
732
733 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
734 &tmp_next_block_ctx,
735 mirror_num);
736 if (ret) {
737 printk(KERN_INFO "btrfsic:"
738 " btrfsic_map_block(root @%llu,"
739 " mirror %d) failed!\n",
740 (unsigned long long)next_bytenr,
741 mirror_num);
742 kfree(selected_super);
743 return -1;
744 }
745
746 next_block = btrfsic_block_hashtable_lookup(
747 tmp_next_block_ctx.dev->bdev,
748 tmp_next_block_ctx.dev_bytenr,
749 &state->block_hashtable);
750 BUG_ON(NULL == next_block);
751
752 l = btrfsic_block_link_hashtable_lookup(
753 tmp_next_block_ctx.dev->bdev,
754 tmp_next_block_ctx.dev_bytenr,
755 state->latest_superblock->dev_state->
756 bdev,
757 state->latest_superblock->dev_bytenr,
758 &state->block_link_hashtable);
759 BUG_ON(NULL == l);
760
761 ret = btrfsic_read_block(state, &tmp_next_block_ctx);
762 if (ret < (int)BTRFSIC_BLOCK_SIZE) {
763 printk(KERN_INFO
764 "btrfsic: read @logical %llu failed!\n",
765 (unsigned long long)
766 tmp_next_block_ctx.start);
767 btrfsic_release_block_ctx(&tmp_next_block_ctx);
768 kfree(selected_super);
769 return -1;
770 }
771
772 hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
773 ret = btrfsic_process_metablock(state,
774 next_block,
775 &tmp_next_block_ctx,
776 hdr,
777 BTRFS_MAX_LEVEL + 3, 1);
778 btrfsic_release_block_ctx(&tmp_next_block_ctx);
779 }
780 }
781
782 kfree(selected_super);
783 return ret;
784}
785
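/*
 * Read one superblock mirror from one device, validate its magic and
 * device UUID, register it as a block node, and create links from it to
 * the root, chunk and log tree roots it references. The copy with the
 * highest generation seen so far is reported back to the caller.
 */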
786static int btrfsic_process_superblock_dev_mirror(
787 struct btrfsic_state *state,
788 struct btrfsic_dev_state *dev_state,
789 struct btrfs_device *device,
790 int superblock_mirror_num,
791 struct btrfsic_dev_state **selected_dev_state,
792 struct btrfs_super_block *selected_super)
793{
794 struct btrfs_super_block *super_tmp;
795 u64 dev_bytenr;
796 struct buffer_head *bh;
797 struct btrfsic_block *superblock_tmp;
798 int pass;
799 struct block_device *const superblock_bdev = device->bdev;
800
801 /* super block bytenr is always the unmapped device bytenr */
802 dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
803 bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
804 if (NULL == bh)
805 return -1;
806 super_tmp = (struct btrfs_super_block *)
807 (bh->b_data + (dev_bytenr & 4095));
808
809 if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
810 strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
811 sizeof(super_tmp->magic)) ||
812 memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
813 brelse(bh);
814 return 0;
815 }
816
817 superblock_tmp =
818 btrfsic_block_hashtable_lookup(superblock_bdev,
819 dev_bytenr,
820 &state->block_hashtable);
821 if (NULL == superblock_tmp) {
822 superblock_tmp = btrfsic_block_alloc();
823 if (NULL == superblock_tmp) {
824 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
825 brelse(bh);
826 return -1;
827 }
828 /* for superblock, only the dev_bytenr makes sense */
829 superblock_tmp->dev_bytenr = dev_bytenr;
830 superblock_tmp->dev_state = dev_state;
831 superblock_tmp->logical_bytenr = dev_bytenr;
832 superblock_tmp->generation = btrfs_super_generation(super_tmp);
833 superblock_tmp->is_metadata = 1;
834 superblock_tmp->is_superblock = 1;
835 superblock_tmp->is_iodone = 1;
836 superblock_tmp->never_written = 0;
837 superblock_tmp->mirror_num = 1 + superblock_mirror_num;
838 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
839 printk(KERN_INFO "New initial S-block (bdev %p, %s)"
840 " @%llu (%s/%llu/%d)\n",
841 superblock_bdev, device->name,
842 (unsigned long long)dev_bytenr,
843 dev_state->name,
844 (unsigned long long)dev_bytenr,
845 superblock_mirror_num);
846 list_add(&superblock_tmp->all_blocks_node,
847 &state->all_blocks_list);
848 btrfsic_block_hashtable_add(superblock_tmp,
849 &state->block_hashtable);
850 }
851
852 /* select the one with the highest generation field */
853 if (btrfs_super_generation(super_tmp) >
854 state->max_superblock_generation ||
855 0 == state->max_superblock_generation) {
856 memcpy(selected_super, super_tmp, sizeof(*selected_super));
857 *selected_dev_state = dev_state;
858 state->max_superblock_generation =
859 btrfs_super_generation(super_tmp);
860 state->latest_superblock = superblock_tmp;
861 }
862
863 for (pass = 0; pass < 3; pass++) {
864 u64 next_bytenr;
865 int num_copies;
866 int mirror_num;
867 const char *additional_string = NULL;
868 struct btrfs_disk_key tmp_disk_key;
869
870 tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
871 tmp_disk_key.offset = 0;
872 switch (pass) {
873 case 0:
874 tmp_disk_key.objectid =
875 cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
876 additional_string = "initial root ";
877 next_bytenr = btrfs_super_root(super_tmp);
878 break;
879 case 1:
880 tmp_disk_key.objectid =
881 cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
882 additional_string = "initial chunk ";
883 next_bytenr = btrfs_super_chunk_root(super_tmp);
884 break;
885 case 2:
886 tmp_disk_key.objectid =
887 cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
888 additional_string = "initial log ";
889 next_bytenr = btrfs_super_log_root(super_tmp);
890 if (0 == next_bytenr)
891 continue;
892 break;
893 }
894
895 num_copies =
896 btrfs_num_copies(&state->root->fs_info->mapping_tree,
897 next_bytenr, PAGE_SIZE);
898 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
899 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
900 (unsigned long long)next_bytenr, num_copies);
901 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
902 struct btrfsic_block *next_block;
903 struct btrfsic_block_data_ctx tmp_next_block_ctx;
904 struct btrfsic_block_link *l;
905
906 if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
907 &tmp_next_block_ctx,
908 mirror_num)) {
909 printk(KERN_INFO "btrfsic: btrfsic_map_block("
910 "bytenr @%llu, mirror %d) failed!\n",
911 (unsigned long long)next_bytenr,
912 mirror_num);
913 brelse(bh);
914 return -1;
915 }
916
917 next_block = btrfsic_block_lookup_or_add(
918 state, &tmp_next_block_ctx,
919 additional_string, 1, 1, 0,
920 mirror_num, NULL);
921 if (NULL == next_block) {
922 btrfsic_release_block_ctx(&tmp_next_block_ctx);
923 brelse(bh);
924 return -1;
925 }
926
927 next_block->disk_key = tmp_disk_key;
928 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
929 l = btrfsic_block_link_lookup_or_add(
930 state, &tmp_next_block_ctx,
931 next_block, superblock_tmp,
932 BTRFSIC_GENERATION_UNKNOWN);
933 btrfsic_release_block_ctx(&tmp_next_block_ctx);
934 if (NULL == l) {
935 brelse(bh);
936 return -1;
937 }
938 }
939 }
940 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
941 btrfsic_dump_tree_sub(state, superblock_tmp, 0);
942
943 brelse(bh);
944 return 0;
945}
946
947static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
948{
949 struct btrfsic_stack_frame *sf;
950
951 sf = kzalloc(sizeof(*sf), GFP_NOFS);
952 if (NULL == sf)
953 printk(KERN_INFO "btrfsic: alloc memory failed!\n");
954 else
955 sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
956 return sf;
957}
958
959static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
960{
961 BUG_ON(!(NULL == sf ||
962 BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
963 kfree(sf);
964}
965
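/*
 * Depth-first walk of a metadata tree, written iteratively: instead of
 * recursing, explicit stack frames chained via ->prev are pushed for
 * every referenced child block. Leaves contribute ROOT_ITEM and
 * EXTENT_DATA items, nodes contribute one key pointer per slot.
 */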
966static int btrfsic_process_metablock(
967 struct btrfsic_state *state,
968 struct btrfsic_block *const first_block,
969 struct btrfsic_block_data_ctx *const first_block_ctx,
970 struct btrfs_header *const first_hdr,
971 int first_limit_nesting, int force_iodone_flag)
972{
973 struct btrfsic_stack_frame initial_stack_frame = { 0 };
974 struct btrfsic_stack_frame *sf;
975 struct btrfsic_stack_frame *next_stack;
976
977 sf = &initial_stack_frame;
978 sf->error = 0;
979 sf->i = -1;
980 sf->limit_nesting = first_limit_nesting;
981 sf->block = first_block;
982 sf->block_ctx = first_block_ctx;
983 sf->next_block = NULL;
984 sf->hdr = first_hdr;
985 sf->prev = NULL;
986
987continue_with_new_stack_frame:
988 sf->block->generation = le64_to_cpu(sf->hdr->generation);
989 if (0 == sf->hdr->level) {
990 struct btrfs_leaf *const leafhdr =
991 (struct btrfs_leaf *)sf->hdr;
992
993 if (-1 == sf->i) {
994 sf->nr = le32_to_cpu(leafhdr->header.nritems);
995
996 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
997 printk(KERN_INFO
998 "leaf %llu items %d generation %llu"
999 " owner %llu\n",
1000 (unsigned long long)
1001 sf->block_ctx->start,
1002 sf->nr,
1003 (unsigned long long)
1004 le64_to_cpu(leafhdr->header.generation),
1005 (unsigned long long)
1006 le64_to_cpu(leafhdr->header.owner));
1007 }
1008
1009continue_with_current_leaf_stack_frame:
1010 if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
1011 sf->i++;
1012 sf->num_copies = 0;
1013 }
1014
1015 if (sf->i < sf->nr) {
1016 struct btrfs_item *disk_item = leafhdr->items + sf->i;
1017 struct btrfs_disk_key *disk_key = &disk_item->key;
1018 u8 type;
1019 const u32 item_offset = le32_to_cpu(disk_item->offset);
1020
1021 type = disk_key->type;
1022
1023 if (BTRFS_ROOT_ITEM_KEY == type) {
1024 const struct btrfs_root_item *const root_item =
1025 (struct btrfs_root_item *)
1026 (sf->block_ctx->data +
1027 offsetof(struct btrfs_leaf, items) +
1028 item_offset);
1029 const u64 next_bytenr =
1030 le64_to_cpu(root_item->bytenr);
1031
1032 sf->error =
1033 btrfsic_create_link_to_next_block(
1034 state,
1035 sf->block,
1036 sf->block_ctx,
1037 next_bytenr,
1038 sf->limit_nesting,
1039 &sf->next_block_ctx,
1040 &sf->next_block,
1041 force_iodone_flag,
1042 &sf->num_copies,
1043 &sf->mirror_num,
1044 disk_key,
1045 le64_to_cpu(root_item->
1046 generation));
1047 if (sf->error)
1048 goto one_stack_frame_backwards;
1049
1050 if (NULL != sf->next_block) {
1051 struct btrfs_header *const next_hdr =
1052 (struct btrfs_header *)
1053 sf->next_block_ctx.data;
1054
1055 next_stack =
1056 btrfsic_stack_frame_alloc();
1057 if (NULL == next_stack) {
1058					btrfsic_release_block_ctx(
1059						&sf->next_block_ctx);
1061 goto one_stack_frame_backwards;
1062 }
1063
1064 next_stack->i = -1;
1065 next_stack->block = sf->next_block;
1066 next_stack->block_ctx =
1067 &sf->next_block_ctx;
1068 next_stack->next_block = NULL;
1069 next_stack->hdr = next_hdr;
1070 next_stack->limit_nesting =
1071 sf->limit_nesting - 1;
1072 next_stack->prev = sf;
1073 sf = next_stack;
1074 goto continue_with_new_stack_frame;
1075 }
1076 } else if (BTRFS_EXTENT_DATA_KEY == type &&
1077 state->include_extent_data) {
1078 sf->error = btrfsic_handle_extent_data(
1079 state,
1080 sf->block,
1081 sf->block_ctx,
1082 item_offset,
1083 force_iodone_flag);
1084 if (sf->error)
1085 goto one_stack_frame_backwards;
1086 }
1087
1088 goto continue_with_current_leaf_stack_frame;
1089 }
1090 } else {
1091 struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
1092
1093 if (-1 == sf->i) {
1094 sf->nr = le32_to_cpu(nodehdr->header.nritems);
1095
1096 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1097 printk(KERN_INFO "node %llu level %d items %d"
1098 " generation %llu owner %llu\n",
1099 (unsigned long long)
1100 sf->block_ctx->start,
1101 nodehdr->header.level, sf->nr,
1102 (unsigned long long)
1103 le64_to_cpu(nodehdr->header.generation),
1104 (unsigned long long)
1105 le64_to_cpu(nodehdr->header.owner));
1106 }
1107
1108continue_with_current_node_stack_frame:
1109 if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
1110 sf->i++;
1111 sf->num_copies = 0;
1112 }
1113
1114 if (sf->i < sf->nr) {
1115 struct btrfs_key_ptr *disk_key_ptr =
1116 nodehdr->ptrs + sf->i;
1117 const u64 next_bytenr =
1118 le64_to_cpu(disk_key_ptr->blockptr);
1119
1120 sf->error = btrfsic_create_link_to_next_block(
1121 state,
1122 sf->block,
1123 sf->block_ctx,
1124 next_bytenr,
1125 sf->limit_nesting,
1126 &sf->next_block_ctx,
1127 &sf->next_block,
1128 force_iodone_flag,
1129 &sf->num_copies,
1130 &sf->mirror_num,
1131 &disk_key_ptr->key,
1132 le64_to_cpu(disk_key_ptr->generation));
1133 if (sf->error)
1134 goto one_stack_frame_backwards;
1135
1136 if (NULL != sf->next_block) {
1137 struct btrfs_header *const next_hdr =
1138 (struct btrfs_header *)
1139 sf->next_block_ctx.data;
1140
1141 next_stack = btrfsic_stack_frame_alloc();
1142 if (NULL == next_stack)
1143 goto one_stack_frame_backwards;
1144
1145 next_stack->i = -1;
1146 next_stack->block = sf->next_block;
1147 next_stack->block_ctx = &sf->next_block_ctx;
1148 next_stack->next_block = NULL;
1149 next_stack->hdr = next_hdr;
1150 next_stack->limit_nesting =
1151 sf->limit_nesting - 1;
1152 next_stack->prev = sf;
1153 sf = next_stack;
1154 goto continue_with_new_stack_frame;
1155 }
1156
1157 goto continue_with_current_node_stack_frame;
1158 }
1159 }
1160
1161one_stack_frame_backwards:
1162 if (NULL != sf->prev) {
1163 struct btrfsic_stack_frame *const prev = sf->prev;
1164
1165 /* the one for the initial block is freed in the caller */
1166 btrfsic_release_block_ctx(sf->block_ctx);
1167
1168 if (sf->error) {
1169 prev->error = sf->error;
1170 btrfsic_stack_frame_free(sf);
1171 sf = prev;
1172 goto one_stack_frame_backwards;
1173 }
1174
1175 btrfsic_stack_frame_free(sf);
1176 sf = prev;
1177 goto continue_with_new_stack_frame;
1178 } else {
1179 BUG_ON(&initial_stack_frame != sf);
1180 }
1181
1182 return sf->error;
1183}
1184
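/*
 * Look up or create the block node for next_bytenr on mirror
 * *mirror_nump, link it to the referencing block, and read its contents
 * when the nesting limit still allows descending into it. On return,
 * *next_blockp is non-NULL only if the caller should process the child.
 */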
1185static int btrfsic_create_link_to_next_block(
1186 struct btrfsic_state *state,
1187 struct btrfsic_block *block,
1188 struct btrfsic_block_data_ctx *block_ctx,
1189 u64 next_bytenr,
1190 int limit_nesting,
1191 struct btrfsic_block_data_ctx *next_block_ctx,
1192 struct btrfsic_block **next_blockp,
1193 int force_iodone_flag,
1194 int *num_copiesp, int *mirror_nump,
1195 struct btrfs_disk_key *disk_key,
1196 u64 parent_generation)
1197{
1198 struct btrfsic_block *next_block = NULL;
1199 int ret;
1200 struct btrfsic_block_link *l;
1201 int did_alloc_block_link;
1202 int block_was_created;
1203
1204 *next_blockp = NULL;
1205 if (0 == *num_copiesp) {
1206 *num_copiesp =
1207 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1208 next_bytenr, PAGE_SIZE);
1209 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1210 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1211 (unsigned long long)next_bytenr, *num_copiesp);
1212 *mirror_nump = 1;
1213 }
1214
1215 if (*mirror_nump > *num_copiesp)
1216 return 0;
1217
1218 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1219 printk(KERN_INFO
1220 "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
1221 *mirror_nump);
1222 ret = btrfsic_map_block(state, next_bytenr,
1223 BTRFSIC_BLOCK_SIZE,
1224 next_block_ctx, *mirror_nump);
1225 if (ret) {
1226 printk(KERN_INFO
1227 "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
1228 (unsigned long long)next_bytenr, *mirror_nump);
1229 btrfsic_release_block_ctx(next_block_ctx);
1230 *next_blockp = NULL;
1231 return -1;
1232 }
1233
1234 next_block = btrfsic_block_lookup_or_add(state,
1235 next_block_ctx, "referenced ",
1236 1, force_iodone_flag,
1237 !force_iodone_flag,
1238 *mirror_nump,
1239 &block_was_created);
1240 if (NULL == next_block) {
1241 btrfsic_release_block_ctx(next_block_ctx);
1242 *next_blockp = NULL;
1243 return -1;
1244 }
1245 if (block_was_created) {
1246 l = NULL;
1247 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
1248 } else {
1249 if (next_block->logical_bytenr != next_bytenr &&
1250 !(!next_block->is_metadata &&
1251 0 == next_block->logical_bytenr)) {
1252 printk(KERN_INFO
1253 "Referenced block @%llu (%s/%llu/%d)"
1254 " found in hash table, %c,"
1255 " bytenr mismatch (!= stored %llu).\n",
1256 (unsigned long long)next_bytenr,
1257 next_block_ctx->dev->name,
1258 (unsigned long long)next_block_ctx->dev_bytenr,
1259 *mirror_nump,
1260 btrfsic_get_block_type(state, next_block),
1261 (unsigned long long)next_block->logical_bytenr);
1262 } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1263 printk(KERN_INFO
1264 "Referenced block @%llu (%s/%llu/%d)"
1265 " found in hash table, %c.\n",
1266 (unsigned long long)next_bytenr,
1267 next_block_ctx->dev->name,
1268 (unsigned long long)next_block_ctx->dev_bytenr,
1269 *mirror_nump,
1270 btrfsic_get_block_type(state, next_block));
1271 next_block->logical_bytenr = next_bytenr;
1272
1273 next_block->mirror_num = *mirror_nump;
1274 l = btrfsic_block_link_hashtable_lookup(
1275 next_block_ctx->dev->bdev,
1276 next_block_ctx->dev_bytenr,
1277 block_ctx->dev->bdev,
1278 block_ctx->dev_bytenr,
1279 &state->block_link_hashtable);
1280 }
1281
1282 next_block->disk_key = *disk_key;
1283 if (NULL == l) {
1284 l = btrfsic_block_link_alloc();
1285 if (NULL == l) {
1286 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1287 btrfsic_release_block_ctx(next_block_ctx);
1288 *next_blockp = NULL;
1289 return -1;
1290 }
1291
1292 did_alloc_block_link = 1;
1293 l->block_ref_to = next_block;
1294 l->block_ref_from = block;
1295 l->ref_cnt = 1;
1296 l->parent_generation = parent_generation;
1297
1298 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1299 btrfsic_print_add_link(state, l);
1300
1301 list_add(&l->node_ref_to, &block->ref_to_list);
1302 list_add(&l->node_ref_from, &next_block->ref_from_list);
1303
1304 btrfsic_block_link_hashtable_add(l,
1305 &state->block_link_hashtable);
1306 } else {
1307 did_alloc_block_link = 0;
1308 if (0 == limit_nesting) {
1309 l->ref_cnt++;
1310 l->parent_generation = parent_generation;
1311 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1312 btrfsic_print_add_link(state, l);
1313 }
1314 }
1315
1316 if (limit_nesting > 0 && did_alloc_block_link) {
1317 ret = btrfsic_read_block(state, next_block_ctx);
1318 if (ret < (int)BTRFSIC_BLOCK_SIZE) {
1319 printk(KERN_INFO
1320 "btrfsic: read block @logical %llu failed!\n",
1321 (unsigned long long)next_bytenr);
1322 btrfsic_release_block_ctx(next_block_ctx);
1323 *next_blockp = NULL;
1324 return -1;
1325 }
1326
1327 *next_blockp = next_block;
1328 } else {
1329 *next_blockp = NULL;
1330 }
1331 (*mirror_nump)++;
1332
1333 return 0;
1334}
1335
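/*
 * Register the data blocks of a regular file extent: the extent is cut
 * into chunks of at most BTRFSIC_BLOCK_SIZE bytes and a D-block node
 * plus a link is created for every mirror of every chunk.
 */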
1336static int btrfsic_handle_extent_data(
1337 struct btrfsic_state *state,
1338 struct btrfsic_block *block,
1339 struct btrfsic_block_data_ctx *block_ctx,
1340 u32 item_offset, int force_iodone_flag)
1341{
1342 int ret;
1343 struct btrfs_file_extent_item *file_extent_item =
1344 (struct btrfs_file_extent_item *)(block_ctx->data +
1345 offsetof(struct btrfs_leaf,
1346 items) + item_offset);
1347 u64 next_bytenr =
1348 le64_to_cpu(file_extent_item->disk_bytenr) +
1349 le64_to_cpu(file_extent_item->offset);
1350 u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
1351 u64 generation = le64_to_cpu(file_extent_item->generation);
1352 struct btrfsic_block_link *l;
1353
1354 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1355 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
1356 " offset = %llu, num_bytes = %llu\n",
1357 file_extent_item->type,
1358 (unsigned long long)
1359 le64_to_cpu(file_extent_item->disk_bytenr),
1360 (unsigned long long)
1361 le64_to_cpu(file_extent_item->offset),
1362 (unsigned long long)
1363 le64_to_cpu(file_extent_item->num_bytes));
1364 if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
1365 ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
1366 return 0;
1367 while (num_bytes > 0) {
1368 u32 chunk_len;
1369 int num_copies;
1370 int mirror_num;
1371
1372 if (num_bytes > BTRFSIC_BLOCK_SIZE)
1373 chunk_len = BTRFSIC_BLOCK_SIZE;
1374 else
1375 chunk_len = num_bytes;
1376
1377 num_copies =
1378 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1379 next_bytenr, PAGE_SIZE);
1380 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1381 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1382 (unsigned long long)next_bytenr, num_copies);
1383 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
1384 struct btrfsic_block_data_ctx next_block_ctx;
1385 struct btrfsic_block *next_block;
1386 int block_was_created;
1387
1388 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1389 printk(KERN_INFO "btrfsic_handle_extent_data("
1390 "mirror_num=%d)\n", mirror_num);
1391 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1392 printk(KERN_INFO
1393 "\tdisk_bytenr = %llu, num_bytes %u\n",
1394 (unsigned long long)next_bytenr,
1395 chunk_len);
1396 ret = btrfsic_map_block(state, next_bytenr,
1397 chunk_len, &next_block_ctx,
1398 mirror_num);
1399 if (ret) {
1400 printk(KERN_INFO
1401 "btrfsic: btrfsic_map_block(@%llu,"
1402 " mirror=%d) failed!\n",
1403 (unsigned long long)next_bytenr,
1404 mirror_num);
1405 return -1;
1406 }
1407
1408 next_block = btrfsic_block_lookup_or_add(
1409 state,
1410 &next_block_ctx,
1411 "referenced ",
1412 0,
1413 force_iodone_flag,
1414 !force_iodone_flag,
1415 mirror_num,
1416 &block_was_created);
1417 if (NULL == next_block) {
1418 printk(KERN_INFO
1419 "btrfsic: error, kmalloc failed!\n");
1420 btrfsic_release_block_ctx(&next_block_ctx);
1421 return -1;
1422 }
1423 if (!block_was_created) {
1424 if (next_block->logical_bytenr != next_bytenr &&
1425 !(!next_block->is_metadata &&
1426 0 == next_block->logical_bytenr)) {
1427 printk(KERN_INFO
1428 "Referenced block"
1429 " @%llu (%s/%llu/%d)"
1430 " found in hash table, D,"
1431 " bytenr mismatch"
1432 " (!= stored %llu).\n",
1433 (unsigned long long)next_bytenr,
1434 next_block_ctx.dev->name,
1435 (unsigned long long)
1436 next_block_ctx.dev_bytenr,
1437 mirror_num,
1438 (unsigned long long)
1439 next_block->logical_bytenr);
1440 }
1441 next_block->logical_bytenr = next_bytenr;
1442 next_block->mirror_num = mirror_num;
1443 }
1444
1445 l = btrfsic_block_link_lookup_or_add(state,
1446 &next_block_ctx,
1447 next_block, block,
1448 generation);
1449 btrfsic_release_block_ctx(&next_block_ctx);
1450 if (NULL == l)
1451 return -1;
1452 }
1453
1454 next_bytenr += chunk_len;
1455 num_bytes -= chunk_len;
1456 }
1457
1458 return 0;
1459}
1460
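/*
 * Translate a logical byte number into a (device state, physical byte
 * number) pair using the chunk mapping tree; only the first stripe of
 * the requested mirror is used.
 */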
1461static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1462 struct btrfsic_block_data_ctx *block_ctx_out,
1463 int mirror_num)
1464{
1465 int ret;
1466 u64 length;
1467 struct btrfs_bio *multi = NULL;
1468 struct btrfs_device *device;
1469
1470 length = len;
1471 ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
1472 bytenr, &length, &multi, mirror_num);
1473
	if (ret || NULL == multi) {
		/* bail out if the chunk mapping failed; initialize the
		 * ctx fields first so that callers which release the
		 * block ctx on error do not touch stale data */
		block_ctx_out->dev = NULL;
		block_ctx_out->dev_bytenr = 0;
		block_ctx_out->start = bytenr;
		block_ctx_out->len = len;
		block_ctx_out->data = NULL;
		block_ctx_out->bh = NULL;
		printk(KERN_INFO "btrfsic: btrfs_map_block() failed!\n");
		return ret ? ret : -ENXIO;
	}
1474	device = multi->stripes[0].dev;
1475 block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
1476 block_ctx_out->dev_bytenr = multi->stripes[0].physical;
1477 block_ctx_out->start = bytenr;
1478 block_ctx_out->len = len;
1479 block_ctx_out->data = NULL;
1480 block_ctx_out->bh = NULL;
1481
1482 if (0 == ret)
1483 kfree(multi);
1484 if (NULL == block_ctx_out->dev) {
1485 ret = -ENXIO;
1486 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
1487 }
1488
1489 return ret;
1490}
1491
1492static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1493 u32 len, struct block_device *bdev,
1494 struct btrfsic_block_data_ctx *block_ctx_out)
1495{
1496 block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
1497 block_ctx_out->dev_bytenr = bytenr;
1498 block_ctx_out->start = bytenr;
1499 block_ctx_out->len = len;
1500 block_ctx_out->data = NULL;
1501 block_ctx_out->bh = NULL;
1502 if (NULL != block_ctx_out->dev) {
1503 return 0;
1504 } else {
1505 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
1506 return -ENXIO;
1507 }
1508}
1509
1510static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
1511{
1512 if (NULL != block_ctx->bh) {
1513 brelse(block_ctx->bh);
1514 block_ctx->bh = NULL;
1515 }
1516}
1517
1518static int btrfsic_read_block(struct btrfsic_state *state,
1519 struct btrfsic_block_data_ctx *block_ctx)
1520{
1521 block_ctx->bh = NULL;
1522 if (block_ctx->dev_bytenr & 4095) {
1523 printk(KERN_INFO
1524 "btrfsic: read_block() with unaligned bytenr %llu\n",
1525 (unsigned long long)block_ctx->dev_bytenr);
1526 return -1;
1527 }
1528 if (block_ctx->len > 4096) {
1529 printk(KERN_INFO
1530		       "btrfsic: read_block() with too large size %d\n",
1531 block_ctx->len);
1532 return -1;
1533 }
1534
1535 block_ctx->bh = __bread(block_ctx->dev->bdev,
1536 block_ctx->dev_bytenr >> 12, 4096);
1537 if (NULL == block_ctx->bh)
1538 return -1;
1539 block_ctx->data = block_ctx->bh->b_data;
1540
1541 return block_ctx->len;
1542}
1543
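/* Print every tracked block together with its ref-to and ref-from links. */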
1544static void btrfsic_dump_database(struct btrfsic_state *state)
1545{
1546 struct list_head *elem_all;
1547
1548 BUG_ON(NULL == state);
1549
1550 printk(KERN_INFO "all_blocks_list:\n");
1551 list_for_each(elem_all, &state->all_blocks_list) {
1552 const struct btrfsic_block *const b_all =
1553 list_entry(elem_all, struct btrfsic_block,
1554 all_blocks_node);
1555 struct list_head *elem_ref_to;
1556 struct list_head *elem_ref_from;
1557
1558 printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
1559 btrfsic_get_block_type(state, b_all),
1560 (unsigned long long)b_all->logical_bytenr,
1561 b_all->dev_state->name,
1562 (unsigned long long)b_all->dev_bytenr,
1563 b_all->mirror_num);
1564
1565 list_for_each(elem_ref_to, &b_all->ref_to_list) {
1566 const struct btrfsic_block_link *const l =
1567 list_entry(elem_ref_to,
1568 struct btrfsic_block_link,
1569 node_ref_to);
1570
1571 printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
1572 " refers %u* to"
1573 " %c @%llu (%s/%llu/%d)\n",
1574 btrfsic_get_block_type(state, b_all),
1575 (unsigned long long)b_all->logical_bytenr,
1576 b_all->dev_state->name,
1577 (unsigned long long)b_all->dev_bytenr,
1578 b_all->mirror_num,
1579 l->ref_cnt,
1580 btrfsic_get_block_type(state, l->block_ref_to),
1581 (unsigned long long)
1582 l->block_ref_to->logical_bytenr,
1583 l->block_ref_to->dev_state->name,
1584 (unsigned long long)l->block_ref_to->dev_bytenr,
1585 l->block_ref_to->mirror_num);
1586 }
1587
1588 list_for_each(elem_ref_from, &b_all->ref_from_list) {
1589 const struct btrfsic_block_link *const l =
1590 list_entry(elem_ref_from,
1591 struct btrfsic_block_link,
1592 node_ref_from);
1593
1594 printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
1595 " is ref %u* from"
1596 " %c @%llu (%s/%llu/%d)\n",
1597 btrfsic_get_block_type(state, b_all),
1598 (unsigned long long)b_all->logical_bytenr,
1599 b_all->dev_state->name,
1600 (unsigned long long)b_all->dev_bytenr,
1601 b_all->mirror_num,
1602 l->ref_cnt,
1603 btrfsic_get_block_type(state, l->block_ref_from),
1604 (unsigned long long)
1605 l->block_ref_from->logical_bytenr,
1606 l->block_ref_from->dev_state->name,
1607 (unsigned long long)
1608 l->block_ref_from->dev_bytenr,
1609 l->block_ref_from->mirror_num);
1610 }
1611
1612 printk(KERN_INFO "\n");
1613 }
1614}
1615
1616/*
1617 * Test whether the disk block contains a tree block (leaf or node)
1618 * (note that this test fails for the super block)
1619 */
1620static int btrfsic_test_for_metadata(struct btrfsic_state *state,
1621 const u8 *data, unsigned int size)
1622{
1623 struct btrfs_header *h;
1624 u8 csum[BTRFS_CSUM_SIZE];
1625 u32 crc = ~(u32)0;
1626 int fail = 0;
1627 int crc_fail = 0;
1628
1629 h = (struct btrfs_header *)data;
1630
1631 if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
1632 fail++;
1633
1634 crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
1635 btrfs_csum_final(crc, csum);
1636 if (memcmp(csum, h->csum, state->csum_size))
1637 crc_fail++;
1638
1639 return fail || crc_fail;
1640}
1641
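/*
 * Central write hook: classify a block that is about to be written as
 * metadata or data, complain if it overwrites a block that is still
 * referenced by the most recent superblock or whose previous write has
 * not completed, rebuild its outgoing links from the new contents, and
 * patch the bio/bh completion callback so the checker sees the I/O end.
 */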
1642static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1643 u64 dev_bytenr,
1644 u8 *mapped_data, unsigned int len,
1645 struct bio *bio,
1646 int *bio_is_patched,
1647 struct buffer_head *bh,
1648 int submit_bio_bh_rw)
1649{
1650 int is_metadata;
1651 struct btrfsic_block *block;
1652 struct btrfsic_block_data_ctx block_ctx;
1653 int ret;
1654 struct btrfsic_state *state = dev_state->state;
1655 struct block_device *bdev = dev_state->bdev;
1656
1657 WARN_ON(len > PAGE_SIZE);
1658 is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
1659 if (NULL != bio_is_patched)
1660 *bio_is_patched = 0;
1661
1662 block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
1663 &state->block_hashtable);
1664 if (NULL != block) {
1665 u64 bytenr;
1666 struct list_head *elem_ref_to;
1667 struct list_head *tmp_ref_to;
1668
1669 if (block->is_superblock) {
1670 bytenr = le64_to_cpu(((struct btrfs_super_block *)
1671 mapped_data)->bytenr);
1672 is_metadata = 1;
1673 if (state->print_mask &
1674 BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
1675 printk(KERN_INFO
1676 "[before new superblock is written]:\n");
1677 btrfsic_dump_tree_sub(state, block, 0);
1678 }
1679 }
1680 if (is_metadata) {
1681 if (!block->is_superblock) {
1682 bytenr = le64_to_cpu(((struct btrfs_header *)
1683 mapped_data)->bytenr);
1684 btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
1685 dev_state,
1686 dev_bytenr,
1687 mapped_data);
1688 }
1689 if (block->logical_bytenr != bytenr) {
1690 printk(KERN_INFO
1691 "Written block @%llu (%s/%llu/%d)"
1692 " found in hash table, %c,"
1693 " bytenr mismatch"
1694 " (!= stored %llu).\n",
1695 (unsigned long long)bytenr,
1696 dev_state->name,
1697 (unsigned long long)dev_bytenr,
1698 block->mirror_num,
1699 btrfsic_get_block_type(state, block),
1700 (unsigned long long)
1701 block->logical_bytenr);
1702 block->logical_bytenr = bytenr;
1703 } else if (state->print_mask &
1704 BTRFSIC_PRINT_MASK_VERBOSE)
1705 printk(KERN_INFO
1706 "Written block @%llu (%s/%llu/%d)"
1707 " found in hash table, %c.\n",
1708 (unsigned long long)bytenr,
1709 dev_state->name,
1710 (unsigned long long)dev_bytenr,
1711 block->mirror_num,
1712 btrfsic_get_block_type(state, block));
1713 } else {
1714 bytenr = block->logical_bytenr;
1715 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1716 printk(KERN_INFO
1717 "Written block @%llu (%s/%llu/%d)"
1718 " found in hash table, %c.\n",
1719 (unsigned long long)bytenr,
1720 dev_state->name,
1721 (unsigned long long)dev_bytenr,
1722 block->mirror_num,
1723 btrfsic_get_block_type(state, block));
1724 }
1725
1726 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1727 printk(KERN_INFO
1728 "ref_to_list: %cE, ref_from_list: %cE\n",
1729 list_empty(&block->ref_to_list) ? ' ' : '!',
1730 list_empty(&block->ref_from_list) ? ' ' : '!');
1731 if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
1732 printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
1733 " @%llu (%s/%llu/%d), old(gen=%llu,"
1734 " objectid=%llu, type=%d, offset=%llu),"
1735 " new(gen=%llu),"
1736 " which is referenced by most recent superblock"
1737 " (superblockgen=%llu)!\n",
1738 btrfsic_get_block_type(state, block),
1739 (unsigned long long)bytenr,
1740 dev_state->name,
1741 (unsigned long long)dev_bytenr,
1742 block->mirror_num,
1743 (unsigned long long)block->generation,
1744 (unsigned long long)
1745 le64_to_cpu(block->disk_key.objectid),
1746 block->disk_key.type,
1747 (unsigned long long)
1748 le64_to_cpu(block->disk_key.offset),
1749 (unsigned long long)
1750 le64_to_cpu(((struct btrfs_header *)
1751 mapped_data)->generation),
1752 (unsigned long long)
1753 state->max_superblock_generation);
1754 btrfsic_dump_tree(state);
1755 }
1756
1757 if (!block->is_iodone && !block->never_written) {
1758 printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
1759 " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
1760 " which is not yet iodone!\n",
1761 btrfsic_get_block_type(state, block),
1762 (unsigned long long)bytenr,
1763 dev_state->name,
1764 (unsigned long long)dev_bytenr,
1765 block->mirror_num,
1766 (unsigned long long)block->generation,
1767 (unsigned long long)
1768 le64_to_cpu(((struct btrfs_header *)
1769 mapped_data)->generation));
1770 /* it would not be safe to go on */
1771 btrfsic_dump_tree(state);
1772 return;
1773 }
1774
1775 /*
1776 * Clear all references of this block. Do not free
1777		 * the block itself even if it is not referenced anymore
1778		 * because it still carries valuable information
1779		 * like whether it was ever written and whether its IO completed.
1780 */
1781 list_for_each_safe(elem_ref_to, tmp_ref_to,
1782 &block->ref_to_list) {
1783 struct btrfsic_block_link *const l =
1784 list_entry(elem_ref_to,
1785 struct btrfsic_block_link,
1786 node_ref_to);
1787
1788 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1789 btrfsic_print_rem_link(state, l);
1790 l->ref_cnt--;
1791 if (0 == l->ref_cnt) {
1792 list_del(&l->node_ref_to);
1793 list_del(&l->node_ref_from);
1794 btrfsic_block_link_hashtable_remove(l);
1795 btrfsic_block_link_free(l);
1796 }
1797 }
1798
1799 if (block->is_superblock)
1800 ret = btrfsic_map_superblock(state, bytenr, len,
1801 bdev, &block_ctx);
1802 else
1803 ret = btrfsic_map_block(state, bytenr, len,
1804 &block_ctx, 0);
1805 if (ret) {
1806 printk(KERN_INFO
1807 "btrfsic: btrfsic_map_block(root @%llu)"
1808 " failed!\n", (unsigned long long)bytenr);
1809 return;
1810 }
1811 block_ctx.data = mapped_data;
1812		/* the following is required in case of writes to mirrors,
1813		 * use the same device state that was used for the lookup */
1814 block_ctx.dev = dev_state;
1815 block_ctx.dev_bytenr = dev_bytenr;
1816
1817 if (is_metadata || state->include_extent_data) {
1818 block->never_written = 0;
1819 block->iodone_w_error = 0;
1820 if (NULL != bio) {
1821 block->is_iodone = 0;
1822 BUG_ON(NULL == bio_is_patched);
1823 if (!*bio_is_patched) {
1824 block->orig_bio_bh_private =
1825 bio->bi_private;
1826 block->orig_bio_bh_end_io.bio =
1827 bio->bi_end_io;
1828 block->next_in_same_bio = NULL;
1829 bio->bi_private = block;
1830 bio->bi_end_io = btrfsic_bio_end_io;
1831 *bio_is_patched = 1;
1832 } else {
1833 struct btrfsic_block *chained_block =
1834 (struct btrfsic_block *)
1835 bio->bi_private;
1836
1837 BUG_ON(NULL == chained_block);
1838 block->orig_bio_bh_private =
1839 chained_block->orig_bio_bh_private;
1840				block->orig_bio_bh_end_io.bio =
1841					chained_block->orig_bio_bh_end_io.bio;
1843 block->next_in_same_bio = chained_block;
1844 bio->bi_private = block;
1845 }
1846 } else if (NULL != bh) {
1847 block->is_iodone = 0;
1848 block->orig_bio_bh_private = bh->b_private;
1849 block->orig_bio_bh_end_io.bh = bh->b_end_io;
1850 block->next_in_same_bio = NULL;
1851 bh->b_private = block;
1852 bh->b_end_io = btrfsic_bh_end_io;
1853 } else {
1854 block->is_iodone = 1;
1855 block->orig_bio_bh_private = NULL;
1856 block->orig_bio_bh_end_io.bio = NULL;
1857 block->next_in_same_bio = NULL;
1858 }
1859 }
1860
1861 block->flush_gen = dev_state->last_flush_gen + 1;
1862 block->submit_bio_bh_rw = submit_bio_bh_rw;
1863 if (is_metadata) {
1864 block->logical_bytenr = bytenr;
1865 block->is_metadata = 1;
1866 if (block->is_superblock) {
1867 ret = btrfsic_process_written_superblock(
1868 state,
1869 block,
1870 (struct btrfs_super_block *)
1871 mapped_data);
1872 if (state->print_mask &
1873 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
1874 printk(KERN_INFO
1875 "[after new superblock is written]:\n");
1876 btrfsic_dump_tree_sub(state, block, 0);
1877 }
1878 } else {
1879 block->mirror_num = 0; /* unknown */
1880 ret = btrfsic_process_metablock(
1881 state,
1882 block,
1883 &block_ctx,
1884 (struct btrfs_header *)
1885 block_ctx.data,
1886 0, 0);
1887 }
1888 if (ret)
1889 printk(KERN_INFO
1890 "btrfsic: btrfsic_process_metablock"
1891 "(root @%llu) failed!\n",
1892 (unsigned long long)dev_bytenr);
1893 } else {
1894 block->is_metadata = 0;
1895 block->mirror_num = 0; /* unknown */
1896 block->generation = BTRFSIC_GENERATION_UNKNOWN;
1897 if (!state->include_extent_data
1898 && list_empty(&block->ref_from_list)) {
1899 /*
1900 * disk block is overwritten with extent
1901 * data (not meta data) and we are configured
1902 * to not include extent data: take the
1903 * chance and free the block's memory
1904 */
1905 btrfsic_block_hashtable_remove(block);
1906 list_del(&block->all_blocks_node);
1907 btrfsic_block_free(block);
1908 }
1909 }
1910 btrfsic_release_block_ctx(&block_ctx);
1911 } else {
1912 /* block has not been found in hash table */
1913 u64 bytenr;
1914
1915 if (!is_metadata) {
1916 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1917 printk(KERN_INFO "Written block (%s/%llu/?)"
1918 " !found in hash table, D.\n",
1919 dev_state->name,
1920 (unsigned long long)dev_bytenr);
1921 if (!state->include_extent_data)
1922 return; /* ignore that written D block */
1923
1924 /* this is getting ugly for the
1925 * include_extent_data case... */
1926 bytenr = 0; /* unknown */
1927 block_ctx.start = bytenr;
1928 block_ctx.len = len;
1929 block_ctx.bh = NULL;
1930 } else {
1931 bytenr = le64_to_cpu(((struct btrfs_header *)
1932 mapped_data)->bytenr);
1933 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
1934 dev_bytenr,
1935 mapped_data);
1936 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1937 printk(KERN_INFO
1938 "Written block @%llu (%s/%llu/?)"
1939 " !found in hash table, M.\n",
1940 (unsigned long long)bytenr,
1941 dev_state->name,
1942 (unsigned long long)dev_bytenr);
1943
1944 ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
1945 0);
1946 if (ret) {
1947 printk(KERN_INFO
1948 "btrfsic: btrfsic_map_block(root @%llu)"
1949 " failed!\n",
1950 (unsigned long long)dev_bytenr);
1951 return;
1952 }
1953 }
1954 block_ctx.data = mapped_data;
1955		/* the following is required in case of writes to mirrors,
1956		 * use the same device state that was used for the lookup */
1957 block_ctx.dev = dev_state;
1958 block_ctx.dev_bytenr = dev_bytenr;
1959
1960 block = btrfsic_block_alloc();
1961 if (NULL == block) {
1962 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1963 btrfsic_release_block_ctx(&block_ctx);
1964 return;
1965 }
1966 block->dev_state = dev_state;
1967 block->dev_bytenr = dev_bytenr;
1968 block->logical_bytenr = bytenr;
1969 block->is_metadata = is_metadata;
1970 block->never_written = 0;
1971 block->iodone_w_error = 0;
1972 block->mirror_num = 0; /* unknown */
1973 block->flush_gen = dev_state->last_flush_gen + 1;
1974 block->submit_bio_bh_rw = submit_bio_bh_rw;
1975 if (NULL != bio) {
1976 block->is_iodone = 0;
1977 BUG_ON(NULL == bio_is_patched);
1978 if (!*bio_is_patched) {
1979 block->orig_bio_bh_private = bio->bi_private;
1980 block->orig_bio_bh_end_io.bio = bio->bi_end_io;
1981 block->next_in_same_bio = NULL;
1982 bio->bi_private = block;
1983 bio->bi_end_io = btrfsic_bio_end_io;
1984 *bio_is_patched = 1;
1985 } else {
1986 struct btrfsic_block *chained_block =
1987 (struct btrfsic_block *)
1988 bio->bi_private;
1989
1990 BUG_ON(NULL == chained_block);
1991 block->orig_bio_bh_private =
1992 chained_block->orig_bio_bh_private;
1993 block->orig_bio_bh_end_io.bio =
1994 chained_block->orig_bio_bh_end_io.bio;
1995 block->next_in_same_bio = chained_block;
1996 bio->bi_private = block;
1997 }
1998 } else if (NULL != bh) {
1999 block->is_iodone = 0;
2000 block->orig_bio_bh_private = bh->b_private;
2001 block->orig_bio_bh_end_io.bh = bh->b_end_io;
2002 block->next_in_same_bio = NULL;
2003 bh->b_private = block;
2004 bh->b_end_io = btrfsic_bh_end_io;
2005 } else {
2006 block->is_iodone = 1;
2007 block->orig_bio_bh_private = NULL;
2008 block->orig_bio_bh_end_io.bio = NULL;
2009 block->next_in_same_bio = NULL;
2010 }
2011 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2012 printk(KERN_INFO
2013 "New written %c-block @%llu (%s/%llu/%d)\n",
2014 is_metadata ? 'M' : 'D',
2015 (unsigned long long)block->logical_bytenr,
2016 block->dev_state->name,
2017 (unsigned long long)block->dev_bytenr,
2018 block->mirror_num);
2019 list_add(&block->all_blocks_node, &state->all_blocks_list);
2020 btrfsic_block_hashtable_add(block, &state->block_hashtable);
2021
2022 if (is_metadata) {
2023 ret = btrfsic_process_metablock(state, block,
2024 &block_ctx,
2025 (struct btrfs_header *)
2026 block_ctx.data, 0, 0);
2027 if (ret)
2028 printk(KERN_INFO
2029 "btrfsic: process_metablock(root @%llu)"
2030 " failed!\n",
2031 (unsigned long long)dev_bytenr);
2032 }
2033 btrfsic_release_block_ctx(&block_ctx);
2034 }
2035}
2036
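/*
 * Replacement bi_end_io: walk the chain of blocks that share this bio,
 * record the completion status and flush/FUA effects for each, then
 * restore and invoke the original completion callback.
 */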
2037static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
2038{
2039 struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
2040 int iodone_w_error;
2041
2042	/* mutex is not held! This is not safe if IO has not yet completed
2043	 * on umount */
2044 iodone_w_error = 0;
2045 if (bio_error_status)
2046 iodone_w_error = 1;
2047
2048 BUG_ON(NULL == block);
2049 bp->bi_private = block->orig_bio_bh_private;
2050 bp->bi_end_io = block->orig_bio_bh_end_io.bio;
2051
2052 do {
2053 struct btrfsic_block *next_block;
2054 struct btrfsic_dev_state *const dev_state = block->dev_state;
2055
2056 if ((dev_state->state->print_mask &
2057 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2058 printk(KERN_INFO
2059 "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
2060 bio_error_status,
2061 btrfsic_get_block_type(dev_state->state, block),
2062 (unsigned long long)block->logical_bytenr,
2063 dev_state->name,
2064 (unsigned long long)block->dev_bytenr,
2065 block->mirror_num);
2066 next_block = block->next_in_same_bio;
2067 block->iodone_w_error = iodone_w_error;
2068 if (block->submit_bio_bh_rw & REQ_FLUSH) {
2069 dev_state->last_flush_gen++;
2070 if ((dev_state->state->print_mask &
2071 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2072 printk(KERN_INFO
2073 "bio_end_io() new %s flush_gen=%llu\n",
2074 dev_state->name,
2075 (unsigned long long)
2076 dev_state->last_flush_gen);
2077 }
2078 if (block->submit_bio_bh_rw & REQ_FUA)
2079 block->flush_gen = 0; /* FUA completed means block is
2080 * on disk */
2081 block->is_iodone = 1; /* for FLUSH, this releases the block */
2082 block = next_block;
2083 } while (NULL != block);
2084
2085 bp->bi_end_io(bp, bio_error_status);
2086}
2087
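/*
 * buffer_head counterpart of btrfsic_bio_end_io(): same bookkeeping,
 * but a bh carries exactly one block, so no chain walk is needed.
 */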
2088static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
2089{
2090 struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
2091 int iodone_w_error = !uptodate;
2092 struct btrfsic_dev_state *dev_state;
2093
2094 BUG_ON(NULL == block);
2095 dev_state = block->dev_state;
2096 if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2097 printk(KERN_INFO
2098 "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
2099 iodone_w_error,
2100 btrfsic_get_block_type(dev_state->state, block),
2101 (unsigned long long)block->logical_bytenr,
2102 block->dev_state->name,
2103 (unsigned long long)block->dev_bytenr,
2104 block->mirror_num);
2105
2106 block->iodone_w_error = iodone_w_error;
2107 if (block->submit_bio_bh_rw & REQ_FLUSH) {
2108 dev_state->last_flush_gen++;
2109 if ((dev_state->state->print_mask &
2110 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2111 printk(KERN_INFO
2112 "bh_end_io() new %s flush_gen=%llu\n",
2113 dev_state->name,
2114 (unsigned long long)dev_state->last_flush_gen);
2115 }
2116 if (block->submit_bio_bh_rw & REQ_FUA)
2117 block->flush_gen = 0; /* FUA completed means block is on disk */
2118
2119 bh->b_private = block->orig_bio_bh_private;
2120 bh->b_end_io = block->orig_bio_bh_end_io.bh;
2121 block->is_iodone = 1; /* for FLUSH, this releases the block */
2122 bh->b_end_io(bh, uptodate);
2123}
2124
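/*
 * A freshly written superblock adopts the role of the latest superblock
 * if its generation is the highest seen; in any case, links to the
 * root, chunk and log trees it advertises are (re-)created and the
 * reachable blocks are checked via btrfsic_check_all_ref_blocks().
 */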
2125static int btrfsic_process_written_superblock(
2126 struct btrfsic_state *state,
2127 struct btrfsic_block *const superblock,
2128 struct btrfs_super_block *const super_hdr)
2129{
2130 int pass;
2131
2132 superblock->generation = btrfs_super_generation(super_hdr);
2133 if (!(superblock->generation > state->max_superblock_generation ||
2134 0 == state->max_superblock_generation)) {
2135 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
2136 printk(KERN_INFO
2137 "btrfsic: superblock @%llu (%s/%llu/%d)"
2138 " with old gen %llu <= %llu\n",
2139 (unsigned long long)superblock->logical_bytenr,
2140 superblock->dev_state->name,
2141 (unsigned long long)superblock->dev_bytenr,
2142 superblock->mirror_num,
2143 (unsigned long long)
2144 btrfs_super_generation(super_hdr),
2145 (unsigned long long)
2146 state->max_superblock_generation);
2147 } else {
2148 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
2149 printk(KERN_INFO
2150 "btrfsic: got new superblock @%llu (%s/%llu/%d)"
2151 " with new gen %llu > %llu\n",
2152 (unsigned long long)superblock->logical_bytenr,
2153 superblock->dev_state->name,
2154 (unsigned long long)superblock->dev_bytenr,
2155 superblock->mirror_num,
2156 (unsigned long long)
2157 btrfs_super_generation(super_hdr),
2158 (unsigned long long)
2159 state->max_superblock_generation);
2160
2161 state->max_superblock_generation =
2162 btrfs_super_generation(super_hdr);
2163 state->latest_superblock = superblock;
2164 }
2165
2166 for (pass = 0; pass < 3; pass++) {
2167 int ret;
2168 u64 next_bytenr;
2169 struct btrfsic_block *next_block;
2170 struct btrfsic_block_data_ctx tmp_next_block_ctx;
2171 struct btrfsic_block_link *l;
2172 int num_copies;
2173 int mirror_num;
2174 const char *additional_string = NULL;
2175 struct btrfs_disk_key tmp_disk_key;
2176
2177 tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
2178 tmp_disk_key.offset = 0;
2179
2180 switch (pass) {
2181 case 0:
2182 tmp_disk_key.objectid =
2183 cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
2184 additional_string = "root ";
2185 next_bytenr = btrfs_super_root(super_hdr);
2186 if (state->print_mask &
2187 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2188 printk(KERN_INFO "root@%llu\n",
2189 (unsigned long long)next_bytenr);
2190 break;
2191 case 1:
2192 tmp_disk_key.objectid =
2193 cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
2194 additional_string = "chunk ";
2195 next_bytenr = btrfs_super_chunk_root(super_hdr);
2196 if (state->print_mask &
2197 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2198 printk(KERN_INFO "chunk@%llu\n",
2199 (unsigned long long)next_bytenr);
2200 break;
2201 case 2:
2202 tmp_disk_key.objectid =
2203 cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
2204 additional_string = "log ";
2205 next_bytenr = btrfs_super_log_root(super_hdr);
2206 if (0 == next_bytenr)
2207 continue;
2208 if (state->print_mask &
2209 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2210 printk(KERN_INFO "log@%llu\n",
2211 (unsigned long long)next_bytenr);
2212 break;
2213 }
2214
2215 num_copies =
2216 btrfs_num_copies(&state->root->fs_info->mapping_tree,
2217 next_bytenr, PAGE_SIZE);
2218 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
2219 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
2220 (unsigned long long)next_bytenr, num_copies);
2221 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2222 int was_created;
2223
2224 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2225 printk(KERN_INFO
2226 "btrfsic_process_written_superblock("
2227 "mirror_num=%d)\n", mirror_num);
2228 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
2229 &tmp_next_block_ctx,
2230 mirror_num);
2231 if (ret) {
2232 printk(KERN_INFO
2233 "btrfsic: btrfsic_map_block(@%llu,"
2234 " mirror=%d) failed!\n",
2235 (unsigned long long)next_bytenr,
2236 mirror_num);
2237 return -1;
2238 }
2239
2240 next_block = btrfsic_block_lookup_or_add(
2241 state,
2242 &tmp_next_block_ctx,
2243 additional_string,
2244 1, 0, 1,
2245 mirror_num,
2246 &was_created);
2247 if (NULL == next_block) {
2248 printk(KERN_INFO
2249 "btrfsic: error, kmalloc failed!\n");
2250 btrfsic_release_block_ctx(&tmp_next_block_ctx);
2251 return -1;
2252 }
2253
2254 next_block->disk_key = tmp_disk_key;
2255 if (was_created)
2256 next_block->generation =
2257 BTRFSIC_GENERATION_UNKNOWN;
2258 l = btrfsic_block_link_lookup_or_add(
2259 state,
2260 &tmp_next_block_ctx,
2261 next_block,
2262 superblock,
2263 BTRFSIC_GENERATION_UNKNOWN);
2264 btrfsic_release_block_ctx(&tmp_next_block_ctx);
2265 if (NULL == l)
2266 return -1;
2267 }
2268 }
2269
2270 if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
2271 WARN_ON(1);
2272 btrfsic_dump_tree(state);
2273 }
2274
2275 return 0;
2276}
2277
2278static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
2279 struct btrfsic_block *const block,
2280 int recursion_level)
2281{
2282 struct list_head *elem_ref_to;
2283 int ret = 0;
2284
2285 if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
2286 /*
2287 * Note that this situation can happen and does not
2288 * indicate an error in regular cases. It happens
2289 * when disk blocks are freed and later reused.
2290 * The check-integrity module is not aware of any
2291 * block free operations, it just recognizes block
2292 * write operations. Therefore it keeps the linkage
2293 * information for a block until a block is
2294 * rewritten. This can temporarily cause incorrect
 2295	 * and even circular linkage information. This
2296 * causes no harm unless such blocks are referenced
2297 * by the most recent super block.
2298 */
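	/*
	 * Example: block A links to block B, then B's disk space is
	 * freed and reused for a block that links back into A's
	 * subtree; until A is rewritten, the recorded linkage still
	 * contains the stale edge and thus a cycle.
	 */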
2299 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2300 printk(KERN_INFO
2301 "btrfsic: abort cyclic linkage (case 1).\n");
2302
2303 return ret;
2304 }
2305
2306 /*
 2307	 * Recursion is acceptable here because the amount of stack space
 2308	 * used per level is very small and the max recursion depth is limited.
2309 */
2310 list_for_each(elem_ref_to, &block->ref_to_list) {
2311 const struct btrfsic_block_link *const l =
2312 list_entry(elem_ref_to, struct btrfsic_block_link,
2313 node_ref_to);
2314
2315 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2316 printk(KERN_INFO
2317 "rl=%d, %c @%llu (%s/%llu/%d)"
2318 " %u* refers to %c @%llu (%s/%llu/%d)\n",
2319 recursion_level,
2320 btrfsic_get_block_type(state, block),
2321 (unsigned long long)block->logical_bytenr,
2322 block->dev_state->name,
2323 (unsigned long long)block->dev_bytenr,
2324 block->mirror_num,
2325 l->ref_cnt,
2326 btrfsic_get_block_type(state, l->block_ref_to),
2327 (unsigned long long)
2328 l->block_ref_to->logical_bytenr,
2329 l->block_ref_to->dev_state->name,
2330 (unsigned long long)l->block_ref_to->dev_bytenr,
2331 l->block_ref_to->mirror_num);
2332 if (l->block_ref_to->never_written) {
2333 printk(KERN_INFO "btrfs: attempt to write superblock"
2334 " which references block %c @%llu (%s/%llu/%d)"
2335 " which is never written!\n",
2336 btrfsic_get_block_type(state, l->block_ref_to),
2337 (unsigned long long)
2338 l->block_ref_to->logical_bytenr,
2339 l->block_ref_to->dev_state->name,
2340 (unsigned long long)l->block_ref_to->dev_bytenr,
2341 l->block_ref_to->mirror_num);
2342 ret = -1;
2343 } else if (!l->block_ref_to->is_iodone) {
2344 printk(KERN_INFO "btrfs: attempt to write superblock"
2345 " which references block %c @%llu (%s/%llu/%d)"
2346 " which is not yet iodone!\n",
2347 btrfsic_get_block_type(state, l->block_ref_to),
2348 (unsigned long long)
2349 l->block_ref_to->logical_bytenr,
2350 l->block_ref_to->dev_state->name,
2351 (unsigned long long)l->block_ref_to->dev_bytenr,
2352 l->block_ref_to->mirror_num);
2353 ret = -1;
2354 } else if (l->parent_generation !=
2355 l->block_ref_to->generation &&
2356 BTRFSIC_GENERATION_UNKNOWN !=
2357 l->parent_generation &&
2358 BTRFSIC_GENERATION_UNKNOWN !=
2359 l->block_ref_to->generation) {
2360 printk(KERN_INFO "btrfs: attempt to write superblock"
2361 " which references block %c @%llu (%s/%llu/%d)"
2362 " with generation %llu !="
2363 " parent generation %llu!\n",
2364 btrfsic_get_block_type(state, l->block_ref_to),
2365 (unsigned long long)
2366 l->block_ref_to->logical_bytenr,
2367 l->block_ref_to->dev_state->name,
2368 (unsigned long long)l->block_ref_to->dev_bytenr,
2369 l->block_ref_to->mirror_num,
2370 (unsigned long long)l->block_ref_to->generation,
2371 (unsigned long long)l->parent_generation);
2372 ret = -1;
2373 } else if (l->block_ref_to->flush_gen >
2374 l->block_ref_to->dev_state->last_flush_gen) {
2375 printk(KERN_INFO "btrfs: attempt to write superblock"
2376 " which references block %c @%llu (%s/%llu/%d)"
2377 " which is not flushed out of disk's write cache"
2378 " (block flush_gen=%llu,"
2379 " dev->flush_gen=%llu)!\n",
2380 btrfsic_get_block_type(state, l->block_ref_to),
2381 (unsigned long long)
2382 l->block_ref_to->logical_bytenr,
2383 l->block_ref_to->dev_state->name,
2384 (unsigned long long)l->block_ref_to->dev_bytenr,
2385 l->block_ref_to->mirror_num,
 2386			       (unsigned long long)l->block_ref_to->flush_gen,
2387 (unsigned long long)
2388 l->block_ref_to->dev_state->last_flush_gen);
2389 ret = -1;
2390 } else if (-1 == btrfsic_check_all_ref_blocks(state,
2391 l->block_ref_to,
2392 recursion_level +
2393 1)) {
2394 ret = -1;
2395 }
2396 }
2397
2398 return ret;
2399}
2400
2401static int btrfsic_is_block_ref_by_superblock(
2402 const struct btrfsic_state *state,
2403 const struct btrfsic_block *block,
2404 int recursion_level)
2405{
2406 struct list_head *elem_ref_from;
2407
2408 if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
2409 /* refer to comment at "abort cyclic linkage (case 1)" */
2410 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2411 printk(KERN_INFO
2412 "btrfsic: abort cyclic linkage (case 2).\n");
2413
2414 return 0;
2415 }
2416
2417 /*
 2418	 * Recursion is acceptable here because the amount of stack space
 2419	 * used per level is very small and the max recursion depth is limited.
2420 */
2421 list_for_each(elem_ref_from, &block->ref_from_list) {
2422 const struct btrfsic_block_link *const l =
2423 list_entry(elem_ref_from, struct btrfsic_block_link,
2424 node_ref_from);
2425
2426 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2427 printk(KERN_INFO
2428 "rl=%d, %c @%llu (%s/%llu/%d)"
2429 " is ref %u* from %c @%llu (%s/%llu/%d)\n",
2430 recursion_level,
2431 btrfsic_get_block_type(state, block),
2432 (unsigned long long)block->logical_bytenr,
2433 block->dev_state->name,
2434 (unsigned long long)block->dev_bytenr,
2435 block->mirror_num,
2436 l->ref_cnt,
2437 btrfsic_get_block_type(state, l->block_ref_from),
2438 (unsigned long long)
2439 l->block_ref_from->logical_bytenr,
2440 l->block_ref_from->dev_state->name,
2441 (unsigned long long)
2442 l->block_ref_from->dev_bytenr,
2443 l->block_ref_from->mirror_num);
2444 if (l->block_ref_from->is_superblock &&
2445 state->latest_superblock->dev_bytenr ==
2446 l->block_ref_from->dev_bytenr &&
2447 state->latest_superblock->dev_state->bdev ==
2448 l->block_ref_from->dev_state->bdev)
2449 return 1;
2450 else if (btrfsic_is_block_ref_by_superblock(state,
2451 l->block_ref_from,
2452 recursion_level +
2453 1))
2454 return 1;
2455 }
2456
2457 return 0;
2458}
2459
2460static void btrfsic_print_add_link(const struct btrfsic_state *state,
2461 const struct btrfsic_block_link *l)
2462{
2463 printk(KERN_INFO
2464 "Add %u* link from %c @%llu (%s/%llu/%d)"
2465 " to %c @%llu (%s/%llu/%d).\n",
2466 l->ref_cnt,
2467 btrfsic_get_block_type(state, l->block_ref_from),
2468 (unsigned long long)l->block_ref_from->logical_bytenr,
2469 l->block_ref_from->dev_state->name,
2470 (unsigned long long)l->block_ref_from->dev_bytenr,
2471 l->block_ref_from->mirror_num,
2472 btrfsic_get_block_type(state, l->block_ref_to),
2473 (unsigned long long)l->block_ref_to->logical_bytenr,
2474 l->block_ref_to->dev_state->name,
2475 (unsigned long long)l->block_ref_to->dev_bytenr,
2476 l->block_ref_to->mirror_num);
2477}
2478
2479static void btrfsic_print_rem_link(const struct btrfsic_state *state,
2480 const struct btrfsic_block_link *l)
2481{
2482 printk(KERN_INFO
2483 "Rem %u* link from %c @%llu (%s/%llu/%d)"
2484 " to %c @%llu (%s/%llu/%d).\n",
2485 l->ref_cnt,
2486 btrfsic_get_block_type(state, l->block_ref_from),
2487 (unsigned long long)l->block_ref_from->logical_bytenr,
2488 l->block_ref_from->dev_state->name,
2489 (unsigned long long)l->block_ref_from->dev_bytenr,
2490 l->block_ref_from->mirror_num,
2491 btrfsic_get_block_type(state, l->block_ref_to),
2492 (unsigned long long)l->block_ref_to->logical_bytenr,
2493 l->block_ref_to->dev_state->name,
2494 (unsigned long long)l->block_ref_to->dev_bytenr,
2495 l->block_ref_to->mirror_num);
2496}
2497
2498static char btrfsic_get_block_type(const struct btrfsic_state *state,
2499 const struct btrfsic_block *block)
2500{
2501 if (block->is_superblock &&
2502 state->latest_superblock->dev_bytenr == block->dev_bytenr &&
2503 state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
2504 return 'S';
2505 else if (block->is_superblock)
2506 return 's';
2507 else if (block->is_metadata)
2508 return 'M';
2509 else
2510 return 'D';
2511}
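/*
 * Legend for the block type characters used in the messages above:
 * 'S' is the latest superblock, 's' any other superblock (older
 * generation or a different mirror), 'M' a metadata block and
 * 'D' a data block.
 */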
2512
2513static void btrfsic_dump_tree(const struct btrfsic_state *state)
2514{
2515 btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
2516}
2517
2518static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
2519 const struct btrfsic_block *block,
2520 int indent_level)
2521{
2522 struct list_head *elem_ref_to;
2523 int indent_add;
2524 static char buf[80];
2525 int cursor_position;
2526
2527 /*
 2528	 * It would be better to fill an on-stack buffer with a complete line
 2529	 * and dump it at once when it is time to print a newline character.
2530 */
2531
2532 /*
 2533	 * Recursion is acceptable here because the amount of stack space
 2534	 * used per level is very small and the max recursion depth is limited.
2535 */
2536 indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
2537 btrfsic_get_block_type(state, block),
2538 (unsigned long long)block->logical_bytenr,
2539 block->dev_state->name,
2540 (unsigned long long)block->dev_bytenr,
2541 block->mirror_num);
2542 if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
2543 printk("[...]\n");
2544 return;
2545 }
2546 printk(buf);
2547 indent_level += indent_add;
2548 if (list_empty(&block->ref_to_list)) {
2549 printk("\n");
2550 return;
2551 }
2552 if (block->mirror_num > 1 &&
2553 !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
2554 printk(" [...]\n");
2555 return;
2556 }
2557
2558 cursor_position = indent_level;
2559 list_for_each(elem_ref_to, &block->ref_to_list) {
2560 const struct btrfsic_block_link *const l =
2561 list_entry(elem_ref_to, struct btrfsic_block_link,
2562 node_ref_to);
2563
2564 while (cursor_position < indent_level) {
2565 printk(" ");
2566 cursor_position++;
2567 }
2568 if (l->ref_cnt > 1)
2569 indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
2570 else
2571 indent_add = sprintf(buf, " --> ");
2572 if (indent_level + indent_add >
2573 BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
2574 printk("[...]\n");
2575 cursor_position = 0;
2576 continue;
2577 }
2578
2579 printk(buf);
2580
2581 btrfsic_dump_tree_sub(state, l->block_ref_to,
2582 indent_level + indent_add);
2583 cursor_position = 0;
2584 }
2585}
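/*
 * The dump produced above is an indented tree built from the
 * "%c-%llu(%s/%llu/%d)" and " --> " pieces, e.g. (made-up numbers):
 *
 *	S-65536(sdb/65536/1) --> M-29360128(sdb/29884416/1)
 *
 * i.e. type-logical_bytenr(device/dev_bytenr/mirror_num).
 */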
2586
2587static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
2588 struct btrfsic_state *state,
2589 struct btrfsic_block_data_ctx *next_block_ctx,
2590 struct btrfsic_block *next_block,
2591 struct btrfsic_block *from_block,
2592 u64 parent_generation)
2593{
2594 struct btrfsic_block_link *l;
2595
2596 l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
2597 next_block_ctx->dev_bytenr,
2598 from_block->dev_state->bdev,
2599 from_block->dev_bytenr,
2600 &state->block_link_hashtable);
2601 if (NULL == l) {
2602 l = btrfsic_block_link_alloc();
2603 if (NULL == l) {
2604 printk(KERN_INFO
 2605			       "btrfsic: error, kmalloc failed!\n");
2606 return NULL;
2607 }
2608
2609 l->block_ref_to = next_block;
2610 l->block_ref_from = from_block;
2611 l->ref_cnt = 1;
2612 l->parent_generation = parent_generation;
2613
2614 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2615 btrfsic_print_add_link(state, l);
2616
2617 list_add(&l->node_ref_to, &from_block->ref_to_list);
2618 list_add(&l->node_ref_from, &next_block->ref_from_list);
2619
2620 btrfsic_block_link_hashtable_add(l,
2621 &state->block_link_hashtable);
2622 } else {
2623 l->ref_cnt++;
2624 l->parent_generation = parent_generation;
2625 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2626 btrfsic_print_add_link(state, l);
2627 }
2628
2629 return l;
2630}
2631
2632static struct btrfsic_block *btrfsic_block_lookup_or_add(
2633 struct btrfsic_state *state,
2634 struct btrfsic_block_data_ctx *block_ctx,
2635 const char *additional_string,
2636 int is_metadata,
2637 int is_iodone,
2638 int never_written,
2639 int mirror_num,
2640 int *was_created)
2641{
2642 struct btrfsic_block *block;
2643
2644 block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
2645 block_ctx->dev_bytenr,
2646 &state->block_hashtable);
2647 if (NULL == block) {
2648 struct btrfsic_dev_state *dev_state;
2649
2650 block = btrfsic_block_alloc();
2651 if (NULL == block) {
2652 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
2653 return NULL;
2654 }
2655 dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
2656 if (NULL == dev_state) {
2657 printk(KERN_INFO
2658 "btrfsic: error, lookup dev_state failed!\n");
2659 btrfsic_block_free(block);
2660 return NULL;
2661 }
2662 block->dev_state = dev_state;
2663 block->dev_bytenr = block_ctx->dev_bytenr;
2664 block->logical_bytenr = block_ctx->start;
2665 block->is_metadata = is_metadata;
2666 block->is_iodone = is_iodone;
2667 block->never_written = never_written;
2668 block->mirror_num = mirror_num;
2669 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2670 printk(KERN_INFO
2671 "New %s%c-block @%llu (%s/%llu/%d)\n",
2672 additional_string,
2673 btrfsic_get_block_type(state, block),
2674 (unsigned long long)block->logical_bytenr,
2675 dev_state->name,
2676 (unsigned long long)block->dev_bytenr,
2677 mirror_num);
2678 list_add(&block->all_blocks_node, &state->all_blocks_list);
2679 btrfsic_block_hashtable_add(block, &state->block_hashtable);
2680 if (NULL != was_created)
2681 *was_created = 1;
2682 } else {
2683 if (NULL != was_created)
2684 *was_created = 0;
2685 }
2686
2687 return block;
2688}
2689
2690static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2691 u64 bytenr,
2692 struct btrfsic_dev_state *dev_state,
2693 u64 dev_bytenr, char *data)
2694{
2695 int num_copies;
2696 int mirror_num;
2697 int ret;
2698 struct btrfsic_block_data_ctx block_ctx;
2699 int match = 0;
2700
2701 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
2702 bytenr, PAGE_SIZE);
2703
2704 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2705 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
2706 &block_ctx, mirror_num);
2707 if (ret) {
2708 printk(KERN_INFO "btrfsic:"
2709 " btrfsic_map_block(logical @%llu,"
2710 " mirror %d) failed!\n",
2711 (unsigned long long)bytenr, mirror_num);
2712 continue;
2713 }
2714
2715 if (dev_state->bdev == block_ctx.dev->bdev &&
2716 dev_bytenr == block_ctx.dev_bytenr) {
2717 match++;
2718 btrfsic_release_block_ctx(&block_ctx);
2719 break;
2720 }
2721 btrfsic_release_block_ctx(&block_ctx);
2722 }
2723
2724 if (!match) {
2725 printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
2726 " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
2727 " phys_bytenr=%llu)!\n",
2728 (unsigned long long)bytenr, dev_state->name,
2729 (unsigned long long)dev_bytenr);
2730 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2731 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
2732 &block_ctx, mirror_num);
2733 if (ret)
2734 continue;
2735
2736 printk(KERN_INFO "Read logical bytenr @%llu maps to"
2737 " (%s/%llu/%d)\n",
2738 (unsigned long long)bytenr,
2739 block_ctx.dev->name,
2740 (unsigned long long)block_ctx.dev_bytenr,
2741 mirror_num);
2742 }
2743 WARN_ON(1);
2744 }
2745}
2746
2747static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
2748 struct block_device *bdev)
2749{
2750 struct btrfsic_dev_state *ds;
2751
2752 ds = btrfsic_dev_state_hashtable_lookup(bdev,
2753 &btrfsic_dev_state_hashtable);
2754 return ds;
2755}
2756
2757int btrfsic_submit_bh(int rw, struct buffer_head *bh)
2758{
2759 struct btrfsic_dev_state *dev_state;
2760
2761 if (!btrfsic_is_initialized)
2762 return submit_bh(rw, bh);
2763
2764 mutex_lock(&btrfsic_mutex);
2765 /* since btrfsic_submit_bh() might also be called before
2766 * btrfsic_mount(), this might return NULL */
2767 dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
2768
2769 /* Only called to write the superblock (incl. FLUSH/FUA) */
2770 if (NULL != dev_state &&
2771 (rw & WRITE) && bh->b_size > 0) {
2772 u64 dev_bytenr;
2773
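		/* superblock buffer heads use a 4096-byte block size,
		 * so b_blocknr counts 4096-byte units here */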
2774 dev_bytenr = 4096 * bh->b_blocknr;
2775 if (dev_state->state->print_mask &
2776 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2777 printk(KERN_INFO
2778 "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
2779 " size=%lu, data=%p, bdev=%p)\n",
2780 rw, bh->b_blocknr,
2781 (unsigned long long)dev_bytenr, bh->b_size,
2782 bh->b_data, bh->b_bdev);
2783 btrfsic_process_written_block(dev_state, dev_bytenr,
2784 bh->b_data, bh->b_size, NULL,
2785 NULL, bh, rw);
2786 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2787 if (dev_state->state->print_mask &
2788 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2789 printk(KERN_INFO
 2790			       "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
2791 rw, bh->b_bdev);
2792 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2793 if ((dev_state->state->print_mask &
2794 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2795 BTRFSIC_PRINT_MASK_VERBOSE)))
2796 printk(KERN_INFO
2797 "btrfsic_submit_bh(%s) with FLUSH"
2798 " but dummy block already in use"
2799 " (ignored)!\n",
2800 dev_state->name);
2801 } else {
2802 struct btrfsic_block *const block =
2803 &dev_state->dummy_block_for_bio_bh_flush;
2804
2805 block->is_iodone = 0;
2806 block->never_written = 0;
2807 block->iodone_w_error = 0;
2808 block->flush_gen = dev_state->last_flush_gen + 1;
2809 block->submit_bio_bh_rw = rw;
2810 block->orig_bio_bh_private = bh->b_private;
2811 block->orig_bio_bh_end_io.bh = bh->b_end_io;
2812 block->next_in_same_bio = NULL;
2813 bh->b_private = block;
2814 bh->b_end_io = btrfsic_bh_end_io;
2815 }
2816 }
2817 mutex_unlock(&btrfsic_mutex);
2818 return submit_bh(rw, bh);
2819}
2820
2821void btrfsic_submit_bio(int rw, struct bio *bio)
2822{
2823 struct btrfsic_dev_state *dev_state;
2824
2825 if (!btrfsic_is_initialized) {
2826 submit_bio(rw, bio);
2827 return;
2828 }
2829
2830 mutex_lock(&btrfsic_mutex);
2831 /* since btrfsic_submit_bio() is also called before
2832 * btrfsic_mount(), this might return NULL */
2833 dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
2834 if (NULL != dev_state &&
2835 (rw & WRITE) && NULL != bio->bi_io_vec) {
2836 unsigned int i;
2837 u64 dev_bytenr;
2838 int bio_is_patched;
2839
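		/* bi_sector is in 512-byte units regardless of the
		 * device's logical block size */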
2840 dev_bytenr = 512 * bio->bi_sector;
2841 bio_is_patched = 0;
2842 if (dev_state->state->print_mask &
2843 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2844 printk(KERN_INFO
2845 "submit_bio(rw=0x%x, bi_vcnt=%u,"
2846 " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
2847 rw, bio->bi_vcnt, bio->bi_sector,
2848 (unsigned long long)dev_bytenr,
2849 bio->bi_bdev);
2850
2851 for (i = 0; i < bio->bi_vcnt; i++) {
2852 u8 *mapped_data;
2853
2854 mapped_data = kmap(bio->bi_io_vec[i].bv_page);
2855 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2856 BTRFSIC_PRINT_MASK_VERBOSE) ==
2857 (dev_state->state->print_mask &
2858 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2859 BTRFSIC_PRINT_MASK_VERBOSE)))
2860 printk(KERN_INFO
2861 "#%u: page=%p, mapped=%p, len=%u,"
2862 " offset=%u\n",
2863 i, bio->bi_io_vec[i].bv_page,
2864 mapped_data,
2865 bio->bi_io_vec[i].bv_len,
2866 bio->bi_io_vec[i].bv_offset);
2867 btrfsic_process_written_block(dev_state, dev_bytenr,
2868 mapped_data,
2869 bio->bi_io_vec[i].bv_len,
2870 bio, &bio_is_patched,
2871 NULL, rw);
2872 kunmap(bio->bi_io_vec[i].bv_page);
2873 dev_bytenr += bio->bi_io_vec[i].bv_len;
2874 }
2875 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2876 if (dev_state->state->print_mask &
2877 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2878 printk(KERN_INFO
 2879			       "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
2880 rw, bio->bi_bdev);
2881 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2882 if ((dev_state->state->print_mask &
2883 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2884 BTRFSIC_PRINT_MASK_VERBOSE)))
2885 printk(KERN_INFO
2886 "btrfsic_submit_bio(%s) with FLUSH"
2887 " but dummy block already in use"
2888 " (ignored)!\n",
2889 dev_state->name);
2890 } else {
2891 struct btrfsic_block *const block =
2892 &dev_state->dummy_block_for_bio_bh_flush;
2893
2894 block->is_iodone = 0;
2895 block->never_written = 0;
2896 block->iodone_w_error = 0;
2897 block->flush_gen = dev_state->last_flush_gen + 1;
2898 block->submit_bio_bh_rw = rw;
2899 block->orig_bio_bh_private = bio->bi_private;
2900 block->orig_bio_bh_end_io.bio = bio->bi_end_io;
2901 block->next_in_same_bio = NULL;
2902 bio->bi_private = block;
2903 bio->bi_end_io = btrfsic_bio_end_io;
2904 }
2905 }
2906 mutex_unlock(&btrfsic_mutex);
2907
2908 submit_bio(rw, bio);
2909}
2910
2911int btrfsic_mount(struct btrfs_root *root,
2912 struct btrfs_fs_devices *fs_devices,
2913 int including_extent_data, u32 print_mask)
2914{
2915 int ret;
2916 struct btrfsic_state *state;
2917 struct list_head *dev_head = &fs_devices->devices;
2918 struct btrfs_device *device;
2919
2920 state = kzalloc(sizeof(*state), GFP_NOFS);
2921 if (NULL == state) {
2922 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
2923 return -1;
2924 }
2925
2926 if (!btrfsic_is_initialized) {
2927 mutex_init(&btrfsic_mutex);
2928 btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
2929 btrfsic_is_initialized = 1;
2930 }
2931 mutex_lock(&btrfsic_mutex);
2932 state->root = root;
2933 state->print_mask = print_mask;
2934 state->include_extent_data = including_extent_data;
2935 state->csum_size = 0;
2936 INIT_LIST_HEAD(&state->all_blocks_list);
2937 btrfsic_block_hashtable_init(&state->block_hashtable);
2938 btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
2939 state->max_superblock_generation = 0;
2940 state->latest_superblock = NULL;
2941
2942 list_for_each_entry(device, dev_head, dev_list) {
2943 struct btrfsic_dev_state *ds;
2944 char *p;
2945
2946 if (!device->bdev || !device->name)
2947 continue;
2948
2949 ds = btrfsic_dev_state_alloc();
2950 if (NULL == ds) {
2951 printk(KERN_INFO
2952 "btrfs check-integrity: kmalloc() failed!\n");
2953 mutex_unlock(&btrfsic_mutex);
2954 return -1;
2955 }
2956 ds->bdev = device->bdev;
2957 ds->state = state;
2958 bdevname(ds->bdev, ds->name);
2959 ds->name[BDEVNAME_SIZE - 1] = '\0';
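		/* strip everything up to the last '/', keeping only
		 * the basename returned by bdevname() */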
2960 for (p = ds->name; *p != '\0'; p++);
2961 while (p > ds->name && *p != '/')
2962 p--;
2963 if (*p == '/')
2964 p++;
2965 strlcpy(ds->name, p, sizeof(ds->name));
2966 btrfsic_dev_state_hashtable_add(ds,
2967 &btrfsic_dev_state_hashtable);
2968 }
2969
2970 ret = btrfsic_process_superblock(state, fs_devices);
2971 if (0 != ret) {
2972 mutex_unlock(&btrfsic_mutex);
2973 btrfsic_unmount(root, fs_devices);
2974 return ret;
2975 }
2976
2977 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
2978 btrfsic_dump_database(state);
2979 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
2980 btrfsic_dump_tree(state);
2981
2982 mutex_unlock(&btrfsic_mutex);
2983 return 0;
2984}
2985
2986void btrfsic_unmount(struct btrfs_root *root,
2987 struct btrfs_fs_devices *fs_devices)
2988{
2989 struct list_head *elem_all;
2990 struct list_head *tmp_all;
2991 struct btrfsic_state *state;
2992 struct list_head *dev_head = &fs_devices->devices;
2993 struct btrfs_device *device;
2994
2995 if (!btrfsic_is_initialized)
2996 return;
2997
2998 mutex_lock(&btrfsic_mutex);
2999
3000 state = NULL;
3001 list_for_each_entry(device, dev_head, dev_list) {
3002 struct btrfsic_dev_state *ds;
3003
3004 if (!device->bdev || !device->name)
3005 continue;
3006
3007 ds = btrfsic_dev_state_hashtable_lookup(
3008 device->bdev,
3009 &btrfsic_dev_state_hashtable);
3010 if (NULL != ds) {
3011 state = ds->state;
3012 btrfsic_dev_state_hashtable_remove(ds);
3013 btrfsic_dev_state_free(ds);
3014 }
3015 }
3016
3017 if (NULL == state) {
3018 printk(KERN_INFO
3019 "btrfsic: error, cannot find state information"
3020 " on umount!\n");
3021 mutex_unlock(&btrfsic_mutex);
3022 return;
3023 }
3024
3025 /*
3026 * Don't care about keeping the lists' state up to date,
3027 * just free all memory that was allocated dynamically.
3028 * Free the blocks and the block_links.
3029 */
3030 list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
3031 struct btrfsic_block *const b_all =
3032 list_entry(elem_all, struct btrfsic_block,
3033 all_blocks_node);
3034 struct list_head *elem_ref_to;
3035 struct list_head *tmp_ref_to;
3036
3037 list_for_each_safe(elem_ref_to, tmp_ref_to,
3038 &b_all->ref_to_list) {
3039 struct btrfsic_block_link *const l =
3040 list_entry(elem_ref_to,
3041 struct btrfsic_block_link,
3042 node_ref_to);
3043
3044 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
3045 btrfsic_print_rem_link(state, l);
3046
3047 l->ref_cnt--;
3048 if (0 == l->ref_cnt)
3049 btrfsic_block_link_free(l);
3050 }
3051
3052 if (b_all->is_iodone)
3053 btrfsic_block_free(b_all);
3054 else
3055 printk(KERN_INFO "btrfs: attempt to free %c-block"
3056 " @%llu (%s/%llu/%d) on umount which is"
3057 " not yet iodone!\n",
3058 btrfsic_get_block_type(state, b_all),
3059 (unsigned long long)b_all->logical_bytenr,
3060 b_all->dev_state->name,
3061 (unsigned long long)b_all->dev_bytenr,
3062 b_all->mirror_num);
3063 }
3064
3065 mutex_unlock(&btrfsic_mutex);
3066
3067 kfree(state);
3068}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644
index 000000000000..8b59175cc502
--- /dev/null
+++ b/fs/btrfs/check-integrity.h
@@ -0,0 +1,36 @@
1/*
2 * Copyright (C) STRATO AG 2011. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#if !defined(__BTRFS_CHECK_INTEGRITY__)
20#define __BTRFS_CHECK_INTEGRITY__
21
22#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
23int btrfsic_submit_bh(int rw, struct buffer_head *bh);
24void btrfsic_submit_bio(int rw, struct bio *bio);
25#else
26#define btrfsic_submit_bh submit_bh
27#define btrfsic_submit_bio submit_bio
28#endif
29
30int btrfsic_mount(struct btrfs_root *root,
31 struct btrfs_fs_devices *fs_devices,
32 int including_extent_data, u32 print_mask);
33void btrfsic_unmount(struct btrfs_root *root,
34 struct btrfs_fs_devices *fs_devices);
35
36#endif
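/*
 * Usage sketch (illustration, not part of this hunk): call sites use
 * the wrappers unconditionally, e.g.
 *
 *	btrfsic_submit_bio(WRITE_FLUSH_FUA, bio);
 *
 * because the #else branch above maps btrfsic_submit_bh/bio straight
 * back to submit_bh/submit_bio when the config option is disabled,
 * so no #ifdefs are needed at the callers.
 */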
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dede441bdeee..0639a555e16e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
240 240
241 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 241 cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
242 new_root_objectid, &disk_key, level, 242 new_root_objectid, &disk_key, level,
243 buf->start, 0); 243 buf->start, 0, 1);
244 if (IS_ERR(cow)) 244 if (IS_ERR(cow))
245 return PTR_ERR(cow); 245 return PTR_ERR(cow);
246 246
@@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
261 261
262 WARN_ON(btrfs_header_generation(buf) > trans->transid); 262 WARN_ON(btrfs_header_generation(buf) > trans->transid);
263 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 263 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
264 ret = btrfs_inc_ref(trans, root, cow, 1); 264 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
265 else 265 else
266 ret = btrfs_inc_ref(trans, root, cow, 0); 266 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
267 267
268 if (ret) 268 if (ret)
269 return ret; 269 return ret;
@@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
350 if ((owner == root->root_key.objectid || 350 if ((owner == root->root_key.objectid ||
351 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && 351 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
352 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 352 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
353 ret = btrfs_inc_ref(trans, root, buf, 1); 353 ret = btrfs_inc_ref(trans, root, buf, 1, 1);
354 BUG_ON(ret); 354 BUG_ON(ret);
355 355
356 if (root->root_key.objectid == 356 if (root->root_key.objectid ==
357 BTRFS_TREE_RELOC_OBJECTID) { 357 BTRFS_TREE_RELOC_OBJECTID) {
358 ret = btrfs_dec_ref(trans, root, buf, 0); 358 ret = btrfs_dec_ref(trans, root, buf, 0, 1);
359 BUG_ON(ret); 359 BUG_ON(ret);
360 ret = btrfs_inc_ref(trans, root, cow, 1); 360 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
361 BUG_ON(ret); 361 BUG_ON(ret);
362 } 362 }
363 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 363 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
365 365
366 if (root->root_key.objectid == 366 if (root->root_key.objectid ==
367 BTRFS_TREE_RELOC_OBJECTID) 367 BTRFS_TREE_RELOC_OBJECTID)
368 ret = btrfs_inc_ref(trans, root, cow, 1); 368 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
369 else 369 else
370 ret = btrfs_inc_ref(trans, root, cow, 0); 370 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
371 BUG_ON(ret); 371 BUG_ON(ret);
372 } 372 }
373 if (new_flags != 0) { 373 if (new_flags != 0) {
@@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
381 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 381 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
382 if (root->root_key.objectid == 382 if (root->root_key.objectid ==
383 BTRFS_TREE_RELOC_OBJECTID) 383 BTRFS_TREE_RELOC_OBJECTID)
384 ret = btrfs_inc_ref(trans, root, cow, 1); 384 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
385 else 385 else
386 ret = btrfs_inc_ref(trans, root, cow, 0); 386 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
387 BUG_ON(ret); 387 BUG_ON(ret);
388 ret = btrfs_dec_ref(trans, root, buf, 1); 388 ret = btrfs_dec_ref(trans, root, buf, 1, 1);
389 BUG_ON(ret); 389 BUG_ON(ret);
390 } 390 }
391 clean_tree_block(trans, root, buf); 391 clean_tree_block(trans, root, buf);
@@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
446 446
447 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 447 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
448 root->root_key.objectid, &disk_key, 448 root->root_key.objectid, &disk_key,
449 level, search_start, empty_size); 449 level, search_start, empty_size, 1);
450 if (IS_ERR(cow)) 450 if (IS_ERR(cow))
451 return PTR_ERR(cow); 451 return PTR_ERR(cow);
452 452
@@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
484 rcu_assign_pointer(root->node, cow); 484 rcu_assign_pointer(root->node, cow);
485 485
486 btrfs_free_tree_block(trans, root, buf, parent_start, 486 btrfs_free_tree_block(trans, root, buf, parent_start,
487 last_ref); 487 last_ref, 1);
488 free_extent_buffer(buf); 488 free_extent_buffer(buf);
489 add_root_to_dirty_list(root); 489 add_root_to_dirty_list(root);
490 } else { 490 } else {
@@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
500 trans->transid); 500 trans->transid);
501 btrfs_mark_buffer_dirty(parent); 501 btrfs_mark_buffer_dirty(parent);
502 btrfs_free_tree_block(trans, root, buf, parent_start, 502 btrfs_free_tree_block(trans, root, buf, parent_start,
503 last_ref); 503 last_ref, 1);
504 } 504 }
505 if (unlock_orig) 505 if (unlock_orig)
506 btrfs_tree_unlock(buf); 506 btrfs_tree_unlock(buf);
@@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
957 free_extent_buffer(mid); 957 free_extent_buffer(mid);
958 958
959 root_sub_used(root, mid->len); 959 root_sub_used(root, mid->len);
960 btrfs_free_tree_block(trans, root, mid, 0, 1); 960 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
961 /* once for the root ptr */ 961 /* once for the root ptr */
962 free_extent_buffer(mid); 962 free_extent_buffer(mid);
963 return 0; 963 return 0;
@@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1015 if (wret) 1015 if (wret)
1016 ret = wret; 1016 ret = wret;
1017 root_sub_used(root, right->len); 1017 root_sub_used(root, right->len);
1018 btrfs_free_tree_block(trans, root, right, 0, 1); 1018 btrfs_free_tree_block(trans, root, right, 0, 1, 0);
1019 free_extent_buffer(right); 1019 free_extent_buffer(right);
1020 right = NULL; 1020 right = NULL;
1021 } else { 1021 } else {
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1055 if (wret) 1055 if (wret)
1056 ret = wret; 1056 ret = wret;
1057 root_sub_used(root, mid->len); 1057 root_sub_used(root, mid->len);
1058 btrfs_free_tree_block(trans, root, mid, 0, 1); 1058 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
1059 free_extent_buffer(mid); 1059 free_extent_buffer(mid);
1060 mid = NULL; 1060 mid = NULL;
1061 } else { 1061 } else {
@@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2089 2089
2090 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2090 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2091 root->root_key.objectid, &lower_key, 2091 root->root_key.objectid, &lower_key,
2092 level, root->node->start, 0); 2092 level, root->node->start, 0, 0);
2093 if (IS_ERR(c)) 2093 if (IS_ERR(c))
2094 return PTR_ERR(c); 2094 return PTR_ERR(c);
2095 2095
@@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2216 2216
2217 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2217 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2218 root->root_key.objectid, 2218 root->root_key.objectid,
2219 &disk_key, level, c->start, 0); 2219 &disk_key, level, c->start, 0, 0);
2220 if (IS_ERR(split)) 2220 if (IS_ERR(split))
2221 return PTR_ERR(split); 2221 return PTR_ERR(split);
2222 2222
@@ -2970,7 +2970,7 @@ again:
2970 2970
2971 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 2971 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2972 root->root_key.objectid, 2972 root->root_key.objectid,
2973 &disk_key, 0, l->start, 0); 2973 &disk_key, 0, l->start, 0, 0);
2974 if (IS_ERR(right)) 2974 if (IS_ERR(right))
2975 return PTR_ERR(right); 2975 return PTR_ERR(right);
2976 2976
@@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3781 3781
3782 root_sub_used(root, leaf->len); 3782 root_sub_used(root, leaf->len);
3783 3783
3784 btrfs_free_tree_block(trans, root, leaf, 0, 1); 3784 btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
3785 return 0; 3785 return 0;
3786} 3786}
3787/* 3787/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323d..27ebe61d3ccc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
86/* holds checksums of all the data extents */ 86/* holds checksums of all the data extents */
87#define BTRFS_CSUM_TREE_OBJECTID 7ULL 87#define BTRFS_CSUM_TREE_OBJECTID 7ULL
88 88
89/* for storing balance parameters in the root tree */
90#define BTRFS_BALANCE_OBJECTID -4ULL
91
89/* orhpan objectid for tracking unlinked/truncated files */ 92/* orhpan objectid for tracking unlinked/truncated files */
90#define BTRFS_ORPHAN_OBJECTID -5ULL 93#define BTRFS_ORPHAN_OBJECTID -5ULL
91 94
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
692 __le16 name_len; 695 __le16 name_len;
693} __attribute__ ((__packed__)); 696} __attribute__ ((__packed__));
694 697
698struct btrfs_disk_balance_args {
699 /*
700 * profiles to operate on, single is denoted by
701 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
702 */
703 __le64 profiles;
704
705 /* usage filter */
706 __le64 usage;
707
708 /* devid filter */
709 __le64 devid;
710
711 /* devid subset filter [pstart..pend) */
712 __le64 pstart;
713 __le64 pend;
714
715 /* btrfs virtual address space subset filter [vstart..vend) */
716 __le64 vstart;
717 __le64 vend;
718
719 /*
720 * profile to convert to, single is denoted by
721 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
722 */
723 __le64 target;
724
725 /* BTRFS_BALANCE_ARGS_* */
726 __le64 flags;
727
728 __le64 unused[8];
729} __attribute__ ((__packed__));
730
731/*
732 * store balance parameters to disk so that balance can be properly
733 * resumed after crash or unmount
734 */
735struct btrfs_balance_item {
736 /* BTRFS_BALANCE_* */
737 __le64 flags;
738
739 struct btrfs_disk_balance_args data;
740 struct btrfs_disk_balance_args meta;
741 struct btrfs_disk_balance_args sys;
742
743 __le64 unused[4];
744} __attribute__ ((__packed__));
745
695#define BTRFS_FILE_EXTENT_INLINE 0 746#define BTRFS_FILE_EXTENT_INLINE 0
696#define BTRFS_FILE_EXTENT_REG 1 747#define BTRFS_FILE_EXTENT_REG 1
697#define BTRFS_FILE_EXTENT_PREALLOC 2 748#define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +802,32 @@ struct btrfs_csum_item {
751} __attribute__ ((__packed__)); 802} __attribute__ ((__packed__));
752 803
753/* different types of block groups (and chunks) */ 804/* different types of block groups (and chunks) */
754#define BTRFS_BLOCK_GROUP_DATA (1 << 0) 805#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
755#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) 806#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
756#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) 807#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
757#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) 808#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
758#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 809#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
759#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 810#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
760#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 811#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
761#define BTRFS_NR_RAID_TYPES 5 812#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
813#define BTRFS_NR_RAID_TYPES 5
814
815#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
816 BTRFS_BLOCK_GROUP_SYSTEM | \
817 BTRFS_BLOCK_GROUP_METADATA)
818
819#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
820 BTRFS_BLOCK_GROUP_RAID1 | \
821 BTRFS_BLOCK_GROUP_DUP | \
822 BTRFS_BLOCK_GROUP_RAID10)
823/*
824 * We need a bit for restriper to be able to tell when chunks of type
825 * SINGLE are available. This "extended" profile format is used in
826 * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
827 * (on-disk). The corresponding on-disk bit in chunk.type is reserved
828 * to avoid remappings between two formats in future.
829 */
830#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
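/*
 * Example: in the extended in-memory format, a filesystem whose data
 * chunks are all of type SINGLE has avail_data_alloc_bits ==
 * BTRFS_AVAIL_ALLOC_BIT_SINGLE, whereas the plain on-disk chunk.type
 * for such chunks carries no profile bit at all.
 */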
762 831
763struct btrfs_block_group_item { 832struct btrfs_block_group_item {
764 __le64 used; 833 __le64 used;
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache {
916struct reloc_control; 985struct reloc_control;
917struct btrfs_device; 986struct btrfs_device;
918struct btrfs_fs_devices; 987struct btrfs_fs_devices;
988struct btrfs_balance_control;
919struct btrfs_delayed_root; 989struct btrfs_delayed_root;
920struct btrfs_fs_info { 990struct btrfs_fs_info {
921 u8 fsid[BTRFS_FSID_SIZE]; 991 u8 fsid[BTRFS_FSID_SIZE];
@@ -971,7 +1041,7 @@ struct btrfs_fs_info {
971 * is required instead of the faster short fsync log commits 1041 * is required instead of the faster short fsync log commits
972 */ 1042 */
973 u64 last_trans_log_full_commit; 1043 u64 last_trans_log_full_commit;
974 unsigned long mount_opt:20; 1044 unsigned long mount_opt:21;
975 unsigned long compress_type:4; 1045 unsigned long compress_type:4;
976 u64 max_inline; 1046 u64 max_inline;
977 u64 alloc_start; 1047 u64 alloc_start;
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info {
1132 spinlock_t ref_cache_lock; 1202 spinlock_t ref_cache_lock;
1133 u64 total_ref_cache_size; 1203 u64 total_ref_cache_size;
1134 1204
1205 /*
1206 * these three are in extended format (availability of single
1207 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
1208 * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
1209 */
1135 u64 avail_data_alloc_bits; 1210 u64 avail_data_alloc_bits;
1136 u64 avail_metadata_alloc_bits; 1211 u64 avail_metadata_alloc_bits;
1137 u64 avail_system_alloc_bits; 1212 u64 avail_system_alloc_bits;
1138 u64 data_alloc_profile; 1213
1139 u64 metadata_alloc_profile; 1214 /* restriper state */
1140 u64 system_alloc_profile; 1215 spinlock_t balance_lock;
1216 struct mutex balance_mutex;
1217 atomic_t balance_running;
1218 atomic_t balance_pause_req;
1219 atomic_t balance_cancel_req;
1220 struct btrfs_balance_control *balance_ctl;
1221 wait_queue_head_t balance_wait_q;
1141 1222
1142 unsigned data_chunk_allocations; 1223 unsigned data_chunk_allocations;
1143 unsigned metadata_ratio; 1224 unsigned metadata_ratio;
@@ -1155,6 +1236,10 @@ struct btrfs_fs_info {
1155 int scrub_workers_refcnt; 1236 int scrub_workers_refcnt;
1156 struct btrfs_workers scrub_workers; 1237 struct btrfs_workers scrub_workers;
1157 1238
1239#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1240 u32 check_integrity_print_mask;
1241#endif
1242
1158 /* filesystem state */ 1243 /* filesystem state */
1159 u64 fs_state; 1244 u64 fs_state;
1160 1245
@@ -1383,6 +1468,8 @@ struct btrfs_ioctl_defrag_range_args {
1383#define BTRFS_DEV_ITEM_KEY 216 1468#define BTRFS_DEV_ITEM_KEY 216
1384#define BTRFS_CHUNK_ITEM_KEY 228 1469#define BTRFS_CHUNK_ITEM_KEY 228
1385 1470
1471#define BTRFS_BALANCE_ITEM_KEY 248
1472
1386/* 1473/*
1387 * string items are for debugging. They just store a short string of 1474 * string items are for debugging. They just store a short string of
1388 * data in the FS 1475 * data in the FS
@@ -1413,6 +1500,9 @@ struct btrfs_ioctl_defrag_range_args {
1413#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1500#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1414#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 1501#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1415#define BTRFS_MOUNT_RECOVERY (1 << 18) 1502#define BTRFS_MOUNT_RECOVERY (1 << 18)
1503#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
1504#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
1505#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
1416 1506
1417#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1507#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1418#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1508#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2077,8 +2167,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
2077BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, 2167BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
2078 num_devices, 64); 2168 num_devices, 64);
2079 2169
2080/* struct btrfs_super_block */ 2170/* struct btrfs_balance_item */
2171BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
2081 2172
2173static inline void btrfs_balance_data(struct extent_buffer *eb,
2174 struct btrfs_balance_item *bi,
2175 struct btrfs_disk_balance_args *ba)
2176{
2177 read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
2178}
2179
2180static inline void btrfs_set_balance_data(struct extent_buffer *eb,
2181 struct btrfs_balance_item *bi,
2182 struct btrfs_disk_balance_args *ba)
2183{
2184 write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
2185}
2186
2187static inline void btrfs_balance_meta(struct extent_buffer *eb,
2188 struct btrfs_balance_item *bi,
2189 struct btrfs_disk_balance_args *ba)
2190{
2191 read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
2192}
2193
2194static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
2195 struct btrfs_balance_item *bi,
2196 struct btrfs_disk_balance_args *ba)
2197{
2198 write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
2199}
2200
2201static inline void btrfs_balance_sys(struct extent_buffer *eb,
2202 struct btrfs_balance_item *bi,
2203 struct btrfs_disk_balance_args *ba)
2204{
2205 read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
2206}
2207
2208static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
2209 struct btrfs_balance_item *bi,
2210 struct btrfs_disk_balance_args *ba)
2211{
2212 write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
2213}
2214
2215static inline void
2216btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
2217 struct btrfs_disk_balance_args *disk)
2218{
2219 memset(cpu, 0, sizeof(*cpu));
2220
2221 cpu->profiles = le64_to_cpu(disk->profiles);
2222 cpu->usage = le64_to_cpu(disk->usage);
2223 cpu->devid = le64_to_cpu(disk->devid);
2224 cpu->pstart = le64_to_cpu(disk->pstart);
2225 cpu->pend = le64_to_cpu(disk->pend);
2226 cpu->vstart = le64_to_cpu(disk->vstart);
2227 cpu->vend = le64_to_cpu(disk->vend);
2228 cpu->target = le64_to_cpu(disk->target);
2229 cpu->flags = le64_to_cpu(disk->flags);
2230}
2231
2232static inline void
2233btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
2234 struct btrfs_balance_args *cpu)
2235{
2236 memset(disk, 0, sizeof(*disk));
2237
2238 disk->profiles = cpu_to_le64(cpu->profiles);
2239 disk->usage = cpu_to_le64(cpu->usage);
2240 disk->devid = cpu_to_le64(cpu->devid);
2241 disk->pstart = cpu_to_le64(cpu->pstart);
2242 disk->pend = cpu_to_le64(cpu->pend);
2243 disk->vstart = cpu_to_le64(cpu->vstart);
2244 disk->vend = cpu_to_le64(cpu->vend);
2245 disk->target = cpu_to_le64(cpu->target);
2246 disk->flags = cpu_to_le64(cpu->flags);
2247}
2248
2249/* struct btrfs_super_block */
2082BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 2250BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
2083BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); 2251BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
2084BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, 2252BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2196,7 +2364,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
2196 return btrfs_item_size(eb, e) - offset; 2364 return btrfs_item_size(eb, e) - offset;
2197} 2365}
2198 2366
2199static inline struct btrfs_root *btrfs_sb(struct super_block *sb) 2367static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2200{ 2368{
2201 return sb->s_fs_info; 2369 return sb->s_fs_info;
2202} 2370}
@@ -2277,11 +2445,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2277 struct btrfs_root *root, u32 blocksize, 2445 struct btrfs_root *root, u32 blocksize,
2278 u64 parent, u64 root_objectid, 2446 u64 parent, u64 root_objectid,
2279 struct btrfs_disk_key *key, int level, 2447 struct btrfs_disk_key *key, int level,
2280 u64 hint, u64 empty_size); 2448 u64 hint, u64 empty_size, int for_cow);
2281void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2449void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
2282 struct btrfs_root *root, 2450 struct btrfs_root *root,
2283 struct extent_buffer *buf, 2451 struct extent_buffer *buf,
2284 u64 parent, int last_ref); 2452 u64 parent, int last_ref, int for_cow);
2285struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2453struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2286 struct btrfs_root *root, 2454 struct btrfs_root *root,
2287 u64 bytenr, u32 blocksize, 2455 u64 bytenr, u32 blocksize,
@@ -2301,17 +2469,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2301 u64 search_end, struct btrfs_key *ins, 2469 u64 search_end, struct btrfs_key *ins,
2302 u64 data); 2470 u64 data);
2303int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2471int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2304 struct extent_buffer *buf, int full_backref); 2472 struct extent_buffer *buf, int full_backref, int for_cow);
2305int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2473int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2306 struct extent_buffer *buf, int full_backref); 2474 struct extent_buffer *buf, int full_backref, int for_cow);
2307int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2475int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2308 struct btrfs_root *root, 2476 struct btrfs_root *root,
2309 u64 bytenr, u64 num_bytes, u64 flags, 2477 u64 bytenr, u64 num_bytes, u64 flags,
2310 int is_data); 2478 int is_data);
2311int btrfs_free_extent(struct btrfs_trans_handle *trans, 2479int btrfs_free_extent(struct btrfs_trans_handle *trans,
2312 struct btrfs_root *root, 2480 struct btrfs_root *root,
2313 u64 bytenr, u64 num_bytes, u64 parent, 2481 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
2314 u64 root_objectid, u64 owner, u64 offset); 2482 u64 owner, u64 offset, int for_cow);
2315 2483
2316int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2484int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2317int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 2485int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -2323,7 +2491,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2323int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2491int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2324 struct btrfs_root *root, 2492 struct btrfs_root *root,
2325 u64 bytenr, u64 num_bytes, u64 parent, 2493 u64 bytenr, u64 num_bytes, u64 parent,
2326 u64 root_objectid, u64 owner, u64 offset); 2494 u64 root_objectid, u64 owner, u64 offset, int for_cow);
2327 2495
2328int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2496int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2329 struct btrfs_root *root); 2497 struct btrfs_root *root);
@@ -2482,10 +2650,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2482} 2650}
2483 2651
2484int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2652int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2653static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
2654{
2655 ++p->slots[0];
2656 if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
2657 return btrfs_next_leaf(root, p);
2658 return 0;
2659}
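/*
 * Sketch of the intended iteration pattern (assumed from the helper's
 * shape): position @p with btrfs_search_slot(), then loop calling
 * btrfs_next_item(root, p); it advances slots[0] within the current
 * leaf and falls back to btrfs_next_leaf() at a leaf boundary, so a
 * caller can walk items in key order with a single call per step.
 */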
2485int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2660int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2486int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2661int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2487void btrfs_drop_snapshot(struct btrfs_root *root, 2662void btrfs_drop_snapshot(struct btrfs_root *root,
2488 struct btrfs_block_rsv *block_rsv, int update_ref); 2663 struct btrfs_block_rsv *block_rsv, int update_ref,
2664 int for_reloc);
2489int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2665int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2490 struct btrfs_root *root, 2666 struct btrfs_root *root,
2491 struct extent_buffer *node, 2667 struct extent_buffer *node,
@@ -2500,6 +2676,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2500} 2676}
2501static inline void free_fs_info(struct btrfs_fs_info *fs_info) 2677static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2502{ 2678{
2679 kfree(fs_info->balance_ctl);
2503 kfree(fs_info->delayed_root); 2680 kfree(fs_info->delayed_root);
2504 kfree(fs_info->extent_root); 2681 kfree(fs_info->extent_root);
2505 kfree(fs_info->tree_root); 2682 kfree(fs_info->tree_root);
@@ -2510,6 +2687,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2510 kfree(fs_info->super_for_commit); 2687 kfree(fs_info->super_for_commit);
2511 kfree(fs_info); 2688 kfree(fs_info);
2512} 2689}
2690/**
2691 * profile_is_valid - tests whether a given profile is valid and reduced
2692 * @flags: profile to validate
2693 * @extended: if true @flags is treated as an extended profile
2694 */
2695static inline int profile_is_valid(u64 flags, int extended)
2696{
2697 u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
2698
2699 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
2700 if (extended)
2701 mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2702
2703 if (flags & mask)
2704 return 0;
2705 /* true if zero or exactly one bit set */
2706 return (flags & (~flags + 1)) == flags;
2707}
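/*
 * Worked example for the final test: (flags & (~flags + 1)) isolates
 * the lowest set bit (two's complement). For flags == 0b0100 that is
 * 0b0100 == flags, so exactly one bit is set; for flags == 0b0110 it
 * is 0b0010 != flags; for flags == 0 both sides are 0, so zero bits
 * also pass.
 */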
2513 2708
2514/* root-item.c */ 2709/* root-item.c */
2515int btrfs_find_root_ref(struct btrfs_root *tree_root, 2710int btrfs_find_root_ref(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 9c1eccc2c503..fe4cd0f1cef1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
598 if (!ret) 598 if (!ret) {
599 trace_btrfs_space_reservation(root->fs_info, "delayed_item",
600 item->key.objectid,
601 num_bytes, 1);
599 item->bytes_reserved = num_bytes; 602 item->bytes_reserved = num_bytes;
603 }
600 604
601 return ret; 605 return ret;
602} 606}
@@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
610 return; 614 return;
611 615
612 rsv = &root->fs_info->delayed_block_rsv; 616 rsv = &root->fs_info->delayed_block_rsv;
617 trace_btrfs_space_reservation(root->fs_info, "delayed_item",
618 item->key.objectid, item->bytes_reserved,
619 0);
613 btrfs_block_rsv_release(root, rsv, 620 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 621 item->bytes_reserved);
615} 622}
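Each reservation above is logged with a trailing 1 and the matching release with a trailing 0 for the same objectid, so a space leak shows up as an unpaired event. Assuming the standard tracefs layout, the output should appear under /sys/kernel/debug/tracing once the btrfs_space_reservation event is enabled.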
@@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata(
624 struct btrfs_block_rsv *dst_rsv; 631 struct btrfs_block_rsv *dst_rsv;
625 u64 num_bytes; 632 u64 num_bytes;
626 int ret; 633 int ret;
627 int release = false; 634 bool release = false;
628 635
629 src_rsv = trans->block_rsv; 636 src_rsv = trans->block_rsv;
630 dst_rsv = &root->fs_info->delayed_block_rsv; 637 dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata(
651 */ 658 */
652 if (ret == -EAGAIN) 659 if (ret == -EAGAIN)
653 ret = -ENOSPC; 660 ret = -ENOSPC;
654 if (!ret) 661 if (!ret) {
655 node->bytes_reserved = num_bytes; 662 node->bytes_reserved = num_bytes;
663 trace_btrfs_space_reservation(root->fs_info,
664 "delayed_inode",
665 btrfs_ino(inode),
666 num_bytes, 1);
667 }
656 return ret; 668 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 669 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock); 670 spin_lock(&BTRFS_I(inode)->lock);
@@ -707,11 +719,17 @@ out:
707 * reservation here. I think it may be time for a documentation page on 719 * reservation here. I think it may be time for a documentation page on
708 * how block rsvs work. 720 * how block rsvs work.
709 */ 721 */
710 if (!ret) 722 if (!ret) {
723 trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
724 btrfs_ino(inode), num_bytes, 1);
711 node->bytes_reserved = num_bytes; 725 node->bytes_reserved = num_bytes;
726 }
712 727
713 if (release) 728 if (release) {
729 trace_btrfs_space_reservation(root->fs_info, "delalloc",
730 btrfs_ino(inode), num_bytes, 0);
714 btrfs_block_rsv_release(root, src_rsv, num_bytes); 731 btrfs_block_rsv_release(root, src_rsv, num_bytes);
732 }
715 733
716 return ret; 734 return ret;
717} 735}
@@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
725 return; 743 return;
726 744
727 rsv = &root->fs_info->delayed_block_rsv; 745 rsv = &root->fs_info->delayed_block_rsv;
746 trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
747 node->inode_id, node->bytes_reserved, 0);
728 btrfs_block_rsv_release(root, rsv, 748 btrfs_block_rsv_release(root, rsv,
729 node->bytes_reserved); 749 node->bytes_reserved);
730 node->bytes_reserved = 0; 750 node->bytes_reserved = 0;
@@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1372 goto release_node; 1392 goto release_node;
1373 } 1393 }
1374 1394
1375 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1376 /*
1377 * we have reserved enough space when we start a new transaction,
1378 * so reserving metadata failure is impossible
1379 */
1380 BUG_ON(ret);
1381
1382 delayed_item->key.objectid = btrfs_ino(dir); 1395 delayed_item->key.objectid = btrfs_ino(dir);
1383 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); 1396 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
1384 delayed_item->key.offset = index; 1397 delayed_item->key.offset = index;
@@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1391 dir_item->type = type; 1404 dir_item->type = type;
1392 memcpy((char *)(dir_item + 1), name, name_len); 1405 memcpy((char *)(dir_item + 1), name, name_len);
1393 1406
1407 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1408 /*
1409 * we have reserved enough space when we start a new transaction,
1410 * so reserving metadata failure is impossible
1411 */
1412 BUG_ON(ret);
1413
1414
1394 mutex_lock(&delayed_node->mutex); 1415 mutex_lock(&delayed_node->mutex);
1395 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); 1416 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
1396 if (unlikely(ret)) { 1417 if (unlikely(ret)) {
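The reservation call moves below the key setup here, presumably because btrfs_delayed_item_reserve_metadata() now logs item->key.objectid in its tracepoint; reserving before the key is filled in would trace a stale objectid.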
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 125cf76fcd08..66e4f29505a3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
101 return -1; 101 return -1;
102 if (ref1->type > ref2->type) 102 if (ref1->type > ref2->type)
103 return 1; 103 return 1;
104 /* merging of sequenced refs is not allowed */
105 if (ref1->seq < ref2->seq)
106 return -1;
107 if (ref1->seq > ref2->seq)
108 return 1;
104 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || 109 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
105 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { 110 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
106 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), 111 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
150 155
151/* 156/*
152 * find a head entry based on bytenr. This returns the delayed ref 157 * find a head entry based on bytenr. This returns the delayed ref
153 * head if it was able to find one, or NULL if nothing was in that spot 158 * head if it was able to find one, or NULL if nothing was in that spot.
159 * If return_bigger is given, the next bigger entry is returned if no exact
160 * match is found.
154 */ 161 */
155static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, 162static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
156 u64 bytenr, 163 u64 bytenr,
157 struct btrfs_delayed_ref_node **last) 164 struct btrfs_delayed_ref_node **last,
165 int return_bigger)
158{ 166{
159 struct rb_node *n = root->rb_node; 167 struct rb_node *n;
160 struct btrfs_delayed_ref_node *entry; 168 struct btrfs_delayed_ref_node *entry;
161 int cmp; 169 int cmp = 0;
162 170
171again:
172 n = root->rb_node;
173 entry = NULL;
163 while (n) { 174 while (n) {
164 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 175 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
165 WARN_ON(!entry->in_tree); 176 WARN_ON(!entry->in_tree);
@@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
182 else 193 else
183 return entry; 194 return entry;
184 } 195 }
196 if (entry && return_bigger) {
197 if (cmp > 0) {
198 n = rb_next(&entry->rb_node);
199 if (!n)
200 n = rb_first(root);
201 entry = rb_entry(n, struct btrfs_delayed_ref_node,
202 rb_node);
203 bytenr = entry->bytenr;
204 return_bigger = 0;
205 goto again;
206 }
207 return entry;
208 }
185 return NULL; 209 return NULL;
186} 210}
187 211
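A worked example of the wrap-around: with heads at bytenrs 4096 and 8192, a return_bigger lookup for 8193 matches nothing, steps past the last entry, restarts from rb_first(), and comes back with the head at 4096. btrfs_find_ref_cluster() below depends on this to keep cycling from run_delayed_start.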
@@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
209 return 0; 233 return 0;
210} 234}
211 235
236int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
237 u64 seq)
238{
239 struct seq_list *elem;
240
241 assert_spin_locked(&delayed_refs->lock);
242 if (list_empty(&delayed_refs->seq_head))
243 return 0;
244
245 elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
246 if (seq >= elem->seq) {
247 pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
248 seq, elem->seq, delayed_refs);
249 return 1;
250 }
251 return 0;
252}
253
212int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 254int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
213 struct list_head *cluster, u64 start) 255 struct list_head *cluster, u64 start)
214{ 256{
@@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
223 node = rb_first(&delayed_refs->root); 265 node = rb_first(&delayed_refs->root);
224 } else { 266 } else {
225 ref = NULL; 267 ref = NULL;
226 find_ref_head(&delayed_refs->root, start, &ref); 268 find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
227 if (ref) { 269 if (ref) {
228 struct btrfs_delayed_ref_node *tmp;
229
230 node = rb_prev(&ref->rb_node);
231 while (node) {
232 tmp = rb_entry(node,
233 struct btrfs_delayed_ref_node,
234 rb_node);
235 if (tmp->bytenr < start)
236 break;
237 ref = tmp;
238 node = rb_prev(&ref->rb_node);
239 }
240 node = &ref->rb_node; 270 node = &ref->rb_node;
241 } else 271 } else
242 node = rb_first(&delayed_refs->root); 272 node = rb_first(&delayed_refs->root);
@@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
390 * this does all the dirty work in terms of maintaining the correct 420 * this does all the dirty work in terms of maintaining the correct
391 * overall modification count. 421 * overall modification count.
392 */ 422 */
393static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, 423static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
424 struct btrfs_trans_handle *trans,
394 struct btrfs_delayed_ref_node *ref, 425 struct btrfs_delayed_ref_node *ref,
395 u64 bytenr, u64 num_bytes, 426 u64 bytenr, u64 num_bytes,
396 int action, int is_data) 427 int action, int is_data)
@@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
437 ref->action = 0; 468 ref->action = 0;
438 ref->is_head = 1; 469 ref->is_head = 1;
439 ref->in_tree = 1; 470 ref->in_tree = 1;
471 ref->seq = 0;
440 472
441 head_ref = btrfs_delayed_node_to_head(ref); 473 head_ref = btrfs_delayed_node_to_head(ref);
442 head_ref->must_insert_reserved = must_insert_reserved; 474 head_ref->must_insert_reserved = must_insert_reserved;
@@ -468,14 +500,17 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
468/* 500/*
469 * helper to insert a delayed tree ref into the rbtree. 501 * helper to insert a delayed tree ref into the rbtree.
470 */ 502 */
471static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, 503static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
504 struct btrfs_trans_handle *trans,
472 struct btrfs_delayed_ref_node *ref, 505 struct btrfs_delayed_ref_node *ref,
473 u64 bytenr, u64 num_bytes, u64 parent, 506 u64 bytenr, u64 num_bytes, u64 parent,
474 u64 ref_root, int level, int action) 507 u64 ref_root, int level, int action,
508 int for_cow)
475{ 509{
476 struct btrfs_delayed_ref_node *existing; 510 struct btrfs_delayed_ref_node *existing;
477 struct btrfs_delayed_tree_ref *full_ref; 511 struct btrfs_delayed_tree_ref *full_ref;
478 struct btrfs_delayed_ref_root *delayed_refs; 512 struct btrfs_delayed_ref_root *delayed_refs;
513 u64 seq = 0;
479 514
480 if (action == BTRFS_ADD_DELAYED_EXTENT) 515 if (action == BTRFS_ADD_DELAYED_EXTENT)
481 action = BTRFS_ADD_DELAYED_REF; 516 action = BTRFS_ADD_DELAYED_REF;
@@ -491,14 +526,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
491 ref->is_head = 0; 526 ref->is_head = 0;
492 ref->in_tree = 1; 527 ref->in_tree = 1;
493 528
529 if (need_ref_seq(for_cow, ref_root))
530 seq = inc_delayed_seq(delayed_refs);
531 ref->seq = seq;
532
494 full_ref = btrfs_delayed_node_to_tree_ref(ref); 533 full_ref = btrfs_delayed_node_to_tree_ref(ref);
495 if (parent) { 534 full_ref->parent = parent;
496 full_ref->parent = parent; 535 full_ref->root = ref_root;
536 if (parent)
497 ref->type = BTRFS_SHARED_BLOCK_REF_KEY; 537 ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
498 } else { 538 else
499 full_ref->root = ref_root;
500 ref->type = BTRFS_TREE_BLOCK_REF_KEY; 539 ref->type = BTRFS_TREE_BLOCK_REF_KEY;
501 }
502 full_ref->level = level; 540 full_ref->level = level;
503 541
504 trace_btrfs_delayed_tree_ref(ref, full_ref, action); 542 trace_btrfs_delayed_tree_ref(ref, full_ref, action);
@@ -522,15 +560,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
522/* 560/*
523 * helper to insert a delayed data ref into the rbtree. 561 * helper to insert a delayed data ref into the rbtree.
524 */ 562 */
525static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, 563static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
564 struct btrfs_trans_handle *trans,
526 struct btrfs_delayed_ref_node *ref, 565 struct btrfs_delayed_ref_node *ref,
527 u64 bytenr, u64 num_bytes, u64 parent, 566 u64 bytenr, u64 num_bytes, u64 parent,
528 u64 ref_root, u64 owner, u64 offset, 567 u64 ref_root, u64 owner, u64 offset,
529 int action) 568 int action, int for_cow)
530{ 569{
531 struct btrfs_delayed_ref_node *existing; 570 struct btrfs_delayed_ref_node *existing;
532 struct btrfs_delayed_data_ref *full_ref; 571 struct btrfs_delayed_data_ref *full_ref;
533 struct btrfs_delayed_ref_root *delayed_refs; 572 struct btrfs_delayed_ref_root *delayed_refs;
573 u64 seq = 0;
534 574
535 if (action == BTRFS_ADD_DELAYED_EXTENT) 575 if (action == BTRFS_ADD_DELAYED_EXTENT)
536 action = BTRFS_ADD_DELAYED_REF; 576 action = BTRFS_ADD_DELAYED_REF;
@@ -546,14 +586,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
546 ref->is_head = 0; 586 ref->is_head = 0;
547 ref->in_tree = 1; 587 ref->in_tree = 1;
548 588
589 if (need_ref_seq(for_cow, ref_root))
590 seq = inc_delayed_seq(delayed_refs);
591 ref->seq = seq;
592
549 full_ref = btrfs_delayed_node_to_data_ref(ref); 593 full_ref = btrfs_delayed_node_to_data_ref(ref);
550 if (parent) { 594 full_ref->parent = parent;
551 full_ref->parent = parent; 595 full_ref->root = ref_root;
596 if (parent)
552 ref->type = BTRFS_SHARED_DATA_REF_KEY; 597 ref->type = BTRFS_SHARED_DATA_REF_KEY;
553 } else { 598 else
554 full_ref->root = ref_root;
555 ref->type = BTRFS_EXTENT_DATA_REF_KEY; 599 ref->type = BTRFS_EXTENT_DATA_REF_KEY;
556 } 600
557 full_ref->objectid = owner; 601 full_ref->objectid = owner;
558 full_ref->offset = offset; 602 full_ref->offset = offset;
559 603
@@ -580,10 +624,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
580 * to make sure the delayed ref is eventually processed before this 624 * to make sure the delayed ref is eventually processed before this
581 * transaction commits. 625 * transaction commits.
582 */ 626 */
583int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 627int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
628 struct btrfs_trans_handle *trans,
584 u64 bytenr, u64 num_bytes, u64 parent, 629 u64 bytenr, u64 num_bytes, u64 parent,
585 u64 ref_root, int level, int action, 630 u64 ref_root, int level, int action,
586 struct btrfs_delayed_extent_op *extent_op) 631 struct btrfs_delayed_extent_op *extent_op,
632 int for_cow)
587{ 633{
588 struct btrfs_delayed_tree_ref *ref; 634 struct btrfs_delayed_tree_ref *ref;
589 struct btrfs_delayed_ref_head *head_ref; 635 struct btrfs_delayed_ref_head *head_ref;
@@ -610,13 +656,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
610 * insert both the head node and the new ref without dropping 656 * insert both the head node and the new ref without dropping
611 * the spin lock 657 * the spin lock
612 */ 658 */
613 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, 659 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
614 action, 0); 660 num_bytes, action, 0);
615 BUG_ON(ret); 661 BUG_ON(ret);
616 662
617 ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, 663 ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
618 parent, ref_root, level, action); 664 num_bytes, parent, ref_root, level, action,
665 for_cow);
619 BUG_ON(ret); 666 BUG_ON(ret);
667 if (!need_ref_seq(for_cow, ref_root) &&
668 waitqueue_active(&delayed_refs->seq_wait))
669 wake_up(&delayed_refs->seq_wait);
620 spin_unlock(&delayed_refs->lock); 670 spin_unlock(&delayed_refs->lock);
621 return 0; 671 return 0;
622} 672}
@@ -624,11 +674,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
624/* 674/*
625 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. 675 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
626 */ 676 */
627int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 677int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
678 struct btrfs_trans_handle *trans,
628 u64 bytenr, u64 num_bytes, 679 u64 bytenr, u64 num_bytes,
629 u64 parent, u64 ref_root, 680 u64 parent, u64 ref_root,
630 u64 owner, u64 offset, int action, 681 u64 owner, u64 offset, int action,
631 struct btrfs_delayed_extent_op *extent_op) 682 struct btrfs_delayed_extent_op *extent_op,
683 int for_cow)
632{ 684{
633 struct btrfs_delayed_data_ref *ref; 685 struct btrfs_delayed_data_ref *ref;
634 struct btrfs_delayed_ref_head *head_ref; 686 struct btrfs_delayed_ref_head *head_ref;
@@ -655,18 +707,23 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
655 * insert both the head node and the new ref without dropping 707 * insert both the head node and the new ref without dropping
656 * the spin lock 708 * the spin lock
657 */ 709 */
658 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, 710 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
659 action, 1); 711 num_bytes, action, 1);
660 BUG_ON(ret); 712 BUG_ON(ret);
661 713
662 ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, 714 ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
663 parent, ref_root, owner, offset, action); 715 num_bytes, parent, ref_root, owner, offset,
716 action, for_cow);
664 BUG_ON(ret); 717 BUG_ON(ret);
718 if (!need_ref_seq(for_cow, ref_root) &&
719 waitqueue_active(&delayed_refs->seq_wait))
720 wake_up(&delayed_refs->seq_wait);
665 spin_unlock(&delayed_refs->lock); 721 spin_unlock(&delayed_refs->lock);
666 return 0; 722 return 0;
667} 723}
668 724
669int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 725int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
726 struct btrfs_trans_handle *trans,
670 u64 bytenr, u64 num_bytes, 727 u64 bytenr, u64 num_bytes,
671 struct btrfs_delayed_extent_op *extent_op) 728 struct btrfs_delayed_extent_op *extent_op)
672{ 729{
@@ -683,11 +740,13 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
683 delayed_refs = &trans->transaction->delayed_refs; 740 delayed_refs = &trans->transaction->delayed_refs;
684 spin_lock(&delayed_refs->lock); 741 spin_lock(&delayed_refs->lock);
685 742
686 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, 743 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
687 num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 744 num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
688 extent_op->is_data); 745 extent_op->is_data);
689 BUG_ON(ret); 746 BUG_ON(ret);
690 747
748 if (waitqueue_active(&delayed_refs->seq_wait))
749 wake_up(&delayed_refs->seq_wait);
691 spin_unlock(&delayed_refs->lock); 750 spin_unlock(&delayed_refs->lock);
692 return 0; 751 return 0;
693} 752}
@@ -704,7 +763,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
704 struct btrfs_delayed_ref_root *delayed_refs; 763 struct btrfs_delayed_ref_root *delayed_refs;
705 764
706 delayed_refs = &trans->transaction->delayed_refs; 765 delayed_refs = &trans->transaction->delayed_refs;
707 ref = find_ref_head(&delayed_refs->root, bytenr, NULL); 766 ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
708 if (ref) 767 if (ref)
709 return btrfs_delayed_node_to_head(ref); 768 return btrfs_delayed_node_to_head(ref);
710 return NULL; 769 return NULL;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e287e3b0eab0..d8f244d94925 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
33 /* the size of the extent */ 33 /* the size of the extent */
34 u64 num_bytes; 34 u64 num_bytes;
35 35
36 /* seq number to keep track of insertion order */
37 u64 seq;
38
36 /* ref count on this data structure */ 39 /* ref count on this data structure */
37 atomic_t refs; 40 atomic_t refs;
38 41
@@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head {
98 101
99struct btrfs_delayed_tree_ref { 102struct btrfs_delayed_tree_ref {
100 struct btrfs_delayed_ref_node node; 103 struct btrfs_delayed_ref_node node;
101 union { 104 u64 root;
102 u64 root; 105 u64 parent;
103 u64 parent;
104 };
105 int level; 106 int level;
106}; 107};
107 108
108struct btrfs_delayed_data_ref { 109struct btrfs_delayed_data_ref {
109 struct btrfs_delayed_ref_node node; 110 struct btrfs_delayed_ref_node node;
110 union { 111 u64 root;
111 u64 root; 112 u64 parent;
112 u64 parent;
113 };
114 u64 objectid; 113 u64 objectid;
115 u64 offset; 114 u64 offset;
116}; 115};
@@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root {
140 int flushing; 139 int flushing;
141 140
142 u64 run_delayed_start; 141 u64 run_delayed_start;
142
143 /*
144 * seq number of delayed refs. We need to know if a backref was being
145 * added before the currently processed ref or afterwards.
146 */
147 u64 seq;
148
149 /*
150 * seq_list holds a list of all seq numbers that are currently being
151 * added to the list. While walking backrefs (btrfs_find_all_roots,
152 * qgroups), which might take some time, no newer ref must be processed,
153 * as it might influence the outcome of the walk.
154 */
155 struct list_head seq_head;
156
157 /*
158 * when the only refs we have in the list must not be processed, we want
159 * to wait for more refs to show up or for the end of backref walking.
160 */
161 wait_queue_head_t seq_wait;
143}; 162};
144 163
145static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) 164static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
151 } 170 }
152} 171}
153 172
154int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 173int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
174 struct btrfs_trans_handle *trans,
155 u64 bytenr, u64 num_bytes, u64 parent, 175 u64 bytenr, u64 num_bytes, u64 parent,
156 u64 ref_root, int level, int action, 176 u64 ref_root, int level, int action,
157 struct btrfs_delayed_extent_op *extent_op); 177 struct btrfs_delayed_extent_op *extent_op,
158int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 178 int for_cow);
179int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
180 struct btrfs_trans_handle *trans,
159 u64 bytenr, u64 num_bytes, 181 u64 bytenr, u64 num_bytes,
160 u64 parent, u64 ref_root, 182 u64 parent, u64 ref_root,
161 u64 owner, u64 offset, int action, 183 u64 owner, u64 offset, int action,
162 struct btrfs_delayed_extent_op *extent_op); 184 struct btrfs_delayed_extent_op *extent_op,
163int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 185 int for_cow);
186int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
187 struct btrfs_trans_handle *trans,
164 u64 bytenr, u64 num_bytes, 188 u64 bytenr, u64 num_bytes,
165 struct btrfs_delayed_extent_op *extent_op); 189 struct btrfs_delayed_extent_op *extent_op);
166 190
@@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
170 struct btrfs_delayed_ref_head *head); 194 struct btrfs_delayed_ref_head *head);
171int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
172 struct list_head *cluster, u64 search_start); 196 struct list_head *cluster, u64 search_start);
197
198struct seq_list {
199 struct list_head list;
200 u64 seq;
201};
202
203static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
204{
205 assert_spin_locked(&delayed_refs->lock);
206 ++delayed_refs->seq;
207 return delayed_refs->seq;
208}
209
210static inline void
211btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
212 struct seq_list *elem)
213{
214 assert_spin_locked(&delayed_refs->lock);
215 elem->seq = delayed_refs->seq;
216 list_add_tail(&elem->list, &delayed_refs->seq_head);
217}
218
219static inline void
220btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
221 struct seq_list *elem)
222{
223 spin_lock(&delayed_refs->lock);
224 list_del(&elem->list);
225 wake_up(&delayed_refs->seq_wait);
226 spin_unlock(&delayed_refs->lock);
227}
228
229int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
230 u64 seq);
231
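Put together, the consumer side is expected to look roughly like this (editor's sketch; the function name is hypothetical, and the pattern follows the btrfs_find_all_roots use case mentioned above):

static void walk_backrefs_pinned(struct btrfs_delayed_ref_root *delayed_refs)
{
	struct seq_list elem;

	spin_lock(&delayed_refs->lock);
	btrfs_get_delayed_seq(delayed_refs, &elem);	/* pin the current seq */
	spin_unlock(&delayed_refs->lock);

	/*
	 * ... walk backrefs here; refs added meanwhile carry a seq >=
	 * elem.seq, and run_clustered_refs() holds them back through
	 * btrfs_check_delayed_seq() until the pin is dropped ...
	 */

	btrfs_put_delayed_seq(delayed_refs, &elem);	/* unpin, wake seq_wait */
}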
232/*
233 * delayed refs with a ref_seq > 0 must be held back during backref walking.
234 * this only applies to items in one of the fs-trees. for_cow items never need
235 * to be held back, so they won't get a ref_seq number.
236 */
237static inline int need_ref_seq(int for_cow, u64 rootid)
238{
239 if (for_cow)
240 return 0;
241
242 if (rootid == BTRFS_FS_TREE_OBJECTID)
243 return 1;
244
245 if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
246 return 1;
247
248 return 0;
249}
250
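Illustrative results, assuming the usual objectid constants from ctree.h (the s64 cast keeps negative ids such as BTRFS_TREE_LOG_OBJECTID out of the last test):

	need_ref_seq(0, BTRFS_FS_TREE_OBJECTID);	/* 1: the fs tree */
	need_ref_seq(0, 256);				/* 1: first possible subvolume id */
	need_ref_seq(0, BTRFS_EXTENT_TREE_OBJECTID);	/* 0: internal tree */
	need_ref_seq(1, BTRFS_FS_TREE_OBJECTID);	/* 0: for_cow is exempt */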
173/* 251/*
174 * a node might live in a head or a regular ref, this lets you 252 * a node might live in a head or a regular ref, this lets you
175 * test for the proper type to use. 253 * test for the proper type to use.
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f44b3928dc2d..7aa9cd36bf1b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -43,6 +43,7 @@
43#include "tree-log.h" 43#include "tree-log.h"
44#include "free-space-cache.h" 44#include "free-space-cache.h"
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h"
46 47
47static struct extent_io_ops btree_extent_io_ops; 48static struct extent_io_ops btree_extent_io_ops;
48static void end_workqueue_fn(struct btrfs_work *work); 49static void end_workqueue_fn(struct btrfs_work *work);
@@ -872,7 +873,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
872 873
873#ifdef CONFIG_MIGRATION 874#ifdef CONFIG_MIGRATION
874static int btree_migratepage(struct address_space *mapping, 875static int btree_migratepage(struct address_space *mapping,
875 struct page *newpage, struct page *page) 876 struct page *newpage, struct page *page,
877 enum migrate_mode mode)
876{ 878{
877 /* 879 /*
878 * we can't safely write a btree page from here, 880 * we can't safely write a btree page from here,
@@ -887,7 +889,7 @@ static int btree_migratepage(struct address_space *mapping,
887 if (page_has_private(page) && 889 if (page_has_private(page) &&
888 !try_to_release_page(page, GFP_KERNEL)) 890 !try_to_release_page(page, GFP_KERNEL))
889 return -EAGAIN; 891 return -EAGAIN;
890 return migrate_page(mapping, newpage, page); 892 return migrate_page(mapping, newpage, page, mode);
891} 893}
892#endif 894#endif
893 895
@@ -1142,7 +1144,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1142 root->orphan_item_inserted = 0; 1144 root->orphan_item_inserted = 0;
1143 root->orphan_cleanup_state = 0; 1145 root->orphan_cleanup_state = 0;
1144 1146
1145 root->fs_info = fs_info;
1146 root->objectid = objectid; 1147 root->objectid = objectid;
1147 root->last_trans = 0; 1148 root->last_trans = 0;
1148 root->highest_objectid = 0; 1149 root->highest_objectid = 0;
@@ -1216,6 +1217,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1216 return 0; 1217 return 0;
1217} 1218}
1218 1219
1220static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1221{
1222 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
1223 if (root)
1224 root->fs_info = fs_info;
1225 return root;
1226}
1227
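With this helper every allocation site gets root->fs_info set up front instead of waiting for __setup_root() (the assignment removed above), presumably so error paths can rely on it before a root is fully initialized.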
1219static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 1228static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1220 struct btrfs_fs_info *fs_info) 1229 struct btrfs_fs_info *fs_info)
1221{ 1230{
@@ -1223,7 +1232,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1223 struct btrfs_root *tree_root = fs_info->tree_root; 1232 struct btrfs_root *tree_root = fs_info->tree_root;
1224 struct extent_buffer *leaf; 1233 struct extent_buffer *leaf;
1225 1234
1226 root = kzalloc(sizeof(*root), GFP_NOFS); 1235 root = btrfs_alloc_root(fs_info);
1227 if (!root) 1236 if (!root)
1228 return ERR_PTR(-ENOMEM); 1237 return ERR_PTR(-ENOMEM);
1229 1238
@@ -1243,7 +1252,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1243 root->ref_cows = 0; 1252 root->ref_cows = 0;
1244 1253
1245 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1254 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1246 BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); 1255 BTRFS_TREE_LOG_OBJECTID, NULL,
1256 0, 0, 0, 0);
1247 if (IS_ERR(leaf)) { 1257 if (IS_ERR(leaf)) {
1248 kfree(root); 1258 kfree(root);
1249 return ERR_CAST(leaf); 1259 return ERR_CAST(leaf);
@@ -1317,7 +1327,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1317 u32 blocksize; 1327 u32 blocksize;
1318 int ret = 0; 1328 int ret = 0;
1319 1329
1320 root = kzalloc(sizeof(*root), GFP_NOFS); 1330 root = btrfs_alloc_root(fs_info);
1321 if (!root) 1331 if (!root)
1322 return ERR_PTR(-ENOMEM); 1332 return ERR_PTR(-ENOMEM);
1323 if (location->offset == (u64)-1) { 1333 if (location->offset == (u64)-1) {
@@ -1579,9 +1589,7 @@ static int cleaner_kthread(void *arg)
1579 btrfs_run_defrag_inodes(root->fs_info); 1589 btrfs_run_defrag_inodes(root->fs_info);
1580 } 1590 }
1581 1591
1582 if (freezing(current)) { 1592 if (!try_to_freeze()) {
1583 refrigerator();
1584 } else {
1585 set_current_state(TASK_INTERRUPTIBLE); 1593 set_current_state(TASK_INTERRUPTIBLE);
1586 if (!kthread_should_stop()) 1594 if (!kthread_should_stop())
1587 schedule(); 1595 schedule();
@@ -1635,9 +1643,7 @@ sleep:
1635 wake_up_process(root->fs_info->cleaner_kthread); 1643 wake_up_process(root->fs_info->cleaner_kthread);
1636 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1644 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1637 1645
1638 if (freezing(current)) { 1646 if (!try_to_freeze()) {
1639 refrigerator();
1640 } else {
1641 set_current_state(TASK_INTERRUPTIBLE); 1647 set_current_state(TASK_INTERRUPTIBLE);
1642 if (!kthread_should_stop() && 1648 if (!kthread_should_stop() &&
1643 !btrfs_transaction_blocked(root->fs_info)) 1649 !btrfs_transaction_blocked(root->fs_info))
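In both kthreads the freezing(current)/refrigerator() pair collapses into try_to_freeze(), which enters the refrigerator when a freeze is pending and returns true in that case; the schedule() path is now taken only when the thread did not actually freeze.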
@@ -1877,9 +1883,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1877} 1883}
1878 1884
1879 1885
1880struct btrfs_root *open_ctree(struct super_block *sb, 1886int open_ctree(struct super_block *sb,
1881 struct btrfs_fs_devices *fs_devices, 1887 struct btrfs_fs_devices *fs_devices,
1882 char *options) 1888 char *options)
1883{ 1889{
1884 u32 sectorsize; 1890 u32 sectorsize;
1885 u32 nodesize; 1891 u32 nodesize;
@@ -1891,8 +1897,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1891 struct btrfs_key location; 1897 struct btrfs_key location;
1892 struct buffer_head *bh; 1898 struct buffer_head *bh;
1893 struct btrfs_super_block *disk_super; 1899 struct btrfs_super_block *disk_super;
1894 struct btrfs_root *tree_root = btrfs_sb(sb); 1900 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1895 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1901 struct btrfs_root *tree_root;
1896 struct btrfs_root *extent_root; 1902 struct btrfs_root *extent_root;
1897 struct btrfs_root *csum_root; 1903 struct btrfs_root *csum_root;
1898 struct btrfs_root *chunk_root; 1904 struct btrfs_root *chunk_root;
@@ -1903,16 +1909,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1903 int num_backups_tried = 0; 1909 int num_backups_tried = 0;
1904 int backup_index = 0; 1910 int backup_index = 0;
1905 1911
1906 extent_root = fs_info->extent_root = 1912 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
1907 kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 1913 extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
1908 csum_root = fs_info->csum_root = 1914 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
1909 kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 1915 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
1910 chunk_root = fs_info->chunk_root = 1916 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
1911 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1912 dev_root = fs_info->dev_root =
1913 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1914 1917
1915 if (!extent_root || !csum_root || !chunk_root || !dev_root) { 1918 if (!tree_root || !extent_root || !csum_root ||
1919 !chunk_root || !dev_root) {
1916 err = -ENOMEM; 1920 err = -ENOMEM;
1917 goto fail; 1921 goto fail;
1918 } 1922 }
@@ -2001,6 +2005,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
2001 init_waitqueue_head(&fs_info->scrub_pause_wait); 2005 init_waitqueue_head(&fs_info->scrub_pause_wait);
2002 init_rwsem(&fs_info->scrub_super_lock); 2006 init_rwsem(&fs_info->scrub_super_lock);
2003 fs_info->scrub_workers_refcnt = 0; 2007 fs_info->scrub_workers_refcnt = 0;
2008#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2009 fs_info->check_integrity_print_mask = 0;
2010#endif
2011
2012 spin_lock_init(&fs_info->balance_lock);
2013 mutex_init(&fs_info->balance_mutex);
2014 atomic_set(&fs_info->balance_running, 0);
2015 atomic_set(&fs_info->balance_pause_req, 0);
2016 atomic_set(&fs_info->balance_cancel_req, 0);
2017 fs_info->balance_ctl = NULL;
2018 init_waitqueue_head(&fs_info->balance_wait_q);
2004 2019
2005 sb->s_blocksize = 4096; 2020 sb->s_blocksize = 4096;
2006 sb->s_blocksize_bits = blksize_bits(4096); 2021 sb->s_blocksize_bits = blksize_bits(4096);
@@ -2270,9 +2285,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
2270 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), 2285 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
2271 BTRFS_UUID_SIZE); 2286 BTRFS_UUID_SIZE);
2272 2287
2273 mutex_lock(&fs_info->chunk_mutex);
2274 ret = btrfs_read_chunk_tree(chunk_root); 2288 ret = btrfs_read_chunk_tree(chunk_root);
2275 mutex_unlock(&fs_info->chunk_mutex);
2276 if (ret) { 2289 if (ret) {
2277 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2290 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
2278 sb->s_id); 2291 sb->s_id);
@@ -2321,9 +2334,6 @@ retry_root_backup:
2321 2334
2322 fs_info->generation = generation; 2335 fs_info->generation = generation;
2323 fs_info->last_trans_committed = generation; 2336 fs_info->last_trans_committed = generation;
2324 fs_info->data_alloc_profile = (u64)-1;
2325 fs_info->metadata_alloc_profile = (u64)-1;
2326 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
2327 2337
2328 ret = btrfs_init_space_info(fs_info); 2338 ret = btrfs_init_space_info(fs_info);
2329 if (ret) { 2339 if (ret) {
@@ -2356,6 +2366,19 @@ retry_root_backup:
2356 btrfs_set_opt(fs_info->mount_opt, SSD); 2366 btrfs_set_opt(fs_info->mount_opt, SSD);
2357 } 2367 }
2358 2368
2369#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2370 if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
2371 ret = btrfsic_mount(tree_root, fs_devices,
2372 btrfs_test_opt(tree_root,
2373 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
2374 1 : 0,
2375 fs_info->check_integrity_print_mask);
2376 if (ret)
2377 printk(KERN_WARNING "btrfs: failed to initialize"
2378 " integrity check module %s\n", sb->s_id);
2379 }
2380#endif
2381
2359 /* do not make disk changes in broken FS */ 2382 /* do not make disk changes in broken FS */
2360 if (btrfs_super_log_root(disk_super) != 0 && 2383 if (btrfs_super_log_root(disk_super) != 0 &&
2361 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { 2384 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@ -2371,7 +2394,7 @@ retry_root_backup:
2371 btrfs_level_size(tree_root, 2394 btrfs_level_size(tree_root,
2372 btrfs_super_log_root_level(disk_super)); 2395 btrfs_super_log_root_level(disk_super));
2373 2396
2374 log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 2397 log_tree_root = btrfs_alloc_root(fs_info);
2375 if (!log_tree_root) { 2398 if (!log_tree_root) {
2376 err = -ENOMEM; 2399 err = -ENOMEM;
2377 goto fail_trans_kthread; 2400 goto fail_trans_kthread;
@@ -2426,13 +2449,17 @@ retry_root_backup:
2426 if (!err) 2449 if (!err)
2427 err = btrfs_orphan_cleanup(fs_info->tree_root); 2450 err = btrfs_orphan_cleanup(fs_info->tree_root);
2428 up_read(&fs_info->cleanup_work_sem); 2451 up_read(&fs_info->cleanup_work_sem);
2452
2453 if (!err)
2454 err = btrfs_recover_balance(fs_info->tree_root);
2455
2429 if (err) { 2456 if (err) {
2430 close_ctree(tree_root); 2457 close_ctree(tree_root);
2431 return ERR_PTR(err); 2458 return err;
2432 } 2459 }
2433 } 2460 }
2434 2461
2435 return tree_root; 2462 return 0;
2436 2463
2437fail_trans_kthread: 2464fail_trans_kthread:
2438 kthread_stop(fs_info->transaction_kthread); 2465 kthread_stop(fs_info->transaction_kthread);
@@ -2478,8 +2505,7 @@ fail_srcu:
2478 cleanup_srcu_struct(&fs_info->subvol_srcu); 2505 cleanup_srcu_struct(&fs_info->subvol_srcu);
2479fail: 2506fail:
2480 btrfs_close_devices(fs_info->fs_devices); 2507 btrfs_close_devices(fs_info->fs_devices);
2481 free_fs_info(fs_info); 2508 return err;
2482 return ERR_PTR(err);
2483 2509
2484recovery_tree_root: 2510recovery_tree_root:
2485 if (!btrfs_test_opt(tree_root, RECOVERY)) 2511 if (!btrfs_test_opt(tree_root, RECOVERY))
@@ -2634,7 +2660,7 @@ static int write_dev_supers(struct btrfs_device *device,
2634 * we fua the first super. The others we allow 2660 * we fua the first super. The others we allow
2635 * to go down lazy. 2661 * to go down lazy.
2636 */ 2662 */
2637 ret = submit_bh(WRITE_FUA, bh); 2663 ret = btrfsic_submit_bh(WRITE_FUA, bh);
2638 if (ret) 2664 if (ret)
2639 errors++; 2665 errors++;
2640 } 2666 }
@@ -2711,7 +2737,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2711 device->flush_bio = bio; 2737 device->flush_bio = bio;
2712 2738
2713 bio_get(bio); 2739 bio_get(bio);
2714 submit_bio(WRITE_FLUSH, bio); 2740 btrfsic_submit_bio(WRITE_FLUSH, bio);
2715 2741
2716 return 0; 2742 return 0;
2717} 2743}
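Both submit paths now route through the integrity-checker wrappers; when CONFIG_BTRFS_FS_CHECK_INTEGRITY is disabled, check-integrity.h is expected to define btrfsic_submit_bh()/btrfsic_submit_bio() as plain aliases for submit_bh()/submit_bio(), so the default build keeps its old behavior.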
@@ -2975,6 +3001,9 @@ int close_ctree(struct btrfs_root *root)
2975 fs_info->closing = 1; 3001 fs_info->closing = 1;
2976 smp_mb(); 3002 smp_mb();
2977 3003
3004 /* pause restriper - we want to resume on mount */
3005 btrfs_pause_balance(root->fs_info);
3006
2978 btrfs_scrub_cancel(root); 3007 btrfs_scrub_cancel(root);
2979 3008
2980 /* wait for any defraggers to finish */ 3009 /* wait for any defraggers to finish */
@@ -2982,7 +3011,7 @@ int close_ctree(struct btrfs_root *root)
2982 (atomic_read(&fs_info->defrag_running) == 0)); 3011 (atomic_read(&fs_info->defrag_running) == 0));
2983 3012
2984 /* clear out the rbtree of defraggable inodes */ 3013 /* clear out the rbtree of defraggable inodes */
2985 btrfs_run_defrag_inodes(root->fs_info); 3014 btrfs_run_defrag_inodes(fs_info);
2986 3015
2987 /* 3016 /*
2988 * Here come 2 situations when btrfs is broken to flip readonly: 3017 * Here come 2 situations when btrfs is broken to flip readonly:
@@ -3011,8 +3040,8 @@ int close_ctree(struct btrfs_root *root)
3011 3040
3012 btrfs_put_block_group_cache(fs_info); 3041 btrfs_put_block_group_cache(fs_info);
3013 3042
3014 kthread_stop(root->fs_info->transaction_kthread); 3043 kthread_stop(fs_info->transaction_kthread);
3015 kthread_stop(root->fs_info->cleaner_kthread); 3044 kthread_stop(fs_info->cleaner_kthread);
3016 3045
3017 fs_info->closing = 2; 3046 fs_info->closing = 2;
3018 smp_mb(); 3047 smp_mb();
@@ -3030,14 +3059,14 @@ int close_ctree(struct btrfs_root *root)
3030 free_extent_buffer(fs_info->extent_root->commit_root); 3059 free_extent_buffer(fs_info->extent_root->commit_root);
3031 free_extent_buffer(fs_info->tree_root->node); 3060 free_extent_buffer(fs_info->tree_root->node);
3032 free_extent_buffer(fs_info->tree_root->commit_root); 3061 free_extent_buffer(fs_info->tree_root->commit_root);
3033 free_extent_buffer(root->fs_info->chunk_root->node); 3062 free_extent_buffer(fs_info->chunk_root->node);
3034 free_extent_buffer(root->fs_info->chunk_root->commit_root); 3063 free_extent_buffer(fs_info->chunk_root->commit_root);
3035 free_extent_buffer(root->fs_info->dev_root->node); 3064 free_extent_buffer(fs_info->dev_root->node);
3036 free_extent_buffer(root->fs_info->dev_root->commit_root); 3065 free_extent_buffer(fs_info->dev_root->commit_root);
3037 free_extent_buffer(root->fs_info->csum_root->node); 3066 free_extent_buffer(fs_info->csum_root->node);
3038 free_extent_buffer(root->fs_info->csum_root->commit_root); 3067 free_extent_buffer(fs_info->csum_root->commit_root);
3039 3068
3040 btrfs_free_block_groups(root->fs_info); 3069 btrfs_free_block_groups(fs_info);
3041 3070
3042 del_fs_roots(fs_info); 3071 del_fs_roots(fs_info);
3043 3072
@@ -3057,14 +3086,17 @@ int close_ctree(struct btrfs_root *root)
3057 btrfs_stop_workers(&fs_info->caching_workers); 3086 btrfs_stop_workers(&fs_info->caching_workers);
3058 btrfs_stop_workers(&fs_info->readahead_workers); 3087 btrfs_stop_workers(&fs_info->readahead_workers);
3059 3088
3089#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3090 if (btrfs_test_opt(root, CHECK_INTEGRITY))
3091 btrfsic_unmount(root, fs_info->fs_devices);
3092#endif
3093
3060 btrfs_close_devices(fs_info->fs_devices); 3094 btrfs_close_devices(fs_info->fs_devices);
3061 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3095 btrfs_mapping_tree_free(&fs_info->mapping_tree);
3062 3096
3063 bdi_destroy(&fs_info->bdi); 3097 bdi_destroy(&fs_info->bdi);
3064 cleanup_srcu_struct(&fs_info->subvol_srcu); 3098 cleanup_srcu_struct(&fs_info->subvol_srcu);
3065 3099
3066 free_fs_info(fs_info);
3067
3068 return 0; 3100 return 0;
3069} 3101}
3070 3102
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c99d0a8f13fa..e4bc4741319b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
46 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
47int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
48 struct btrfs_root *root, struct extent_buffer *buf); 48 struct btrfs_root *root, struct extent_buffer *buf);
49struct btrfs_root *open_ctree(struct super_block *sb, 49int open_ctree(struct super_block *sb,
50 struct btrfs_fs_devices *fs_devices, 50 struct btrfs_fs_devices *fs_devices,
51 char *options); 51 char *options);
52int close_ctree(struct btrfs_root *root); 52int close_ctree(struct btrfs_root *root);
53int write_ctree_super(struct btrfs_trans_handle *trans, 53int write_ctree_super(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, int max_mirrors); 54 struct btrfs_root *root, int max_mirrors);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1b8dc33778f9..5f77166fd01c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
67 u64 root_objectid, u32 generation, 67 u64 root_objectid, u32 generation,
68 int check_generation) 68 int check_generation)
69{ 69{
70 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; 70 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
71 struct btrfs_root *root; 71 struct btrfs_root *root;
72 struct inode *inode; 72 struct inode *inode;
73 struct btrfs_key key; 73 struct btrfs_key key;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5fbe576d2ba..700879ed64cf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
618 struct list_head *head = &info->space_info; 618 struct list_head *head = &info->space_info;
619 struct btrfs_space_info *found; 619 struct btrfs_space_info *found;
620 620
621 flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | 621 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
622 BTRFS_BLOCK_GROUP_METADATA;
623 622
624 rcu_read_lock(); 623 rcu_read_lock();
625 list_for_each_entry_rcu(found, head, list) { 624 list_for_each_entry_rcu(found, head, list) {
@@ -1872,20 +1871,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1872int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1871int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1873 struct btrfs_root *root, 1872 struct btrfs_root *root,
1874 u64 bytenr, u64 num_bytes, u64 parent, 1873 u64 bytenr, u64 num_bytes, u64 parent,
1875 u64 root_objectid, u64 owner, u64 offset) 1874 u64 root_objectid, u64 owner, u64 offset, int for_cow)
1876{ 1875{
1877 int ret; 1876 int ret;
1877 struct btrfs_fs_info *fs_info = root->fs_info;
1878
1878 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1879 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1879 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1880 root_objectid == BTRFS_TREE_LOG_OBJECTID);
1880 1881
1881 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1882 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1882 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 1883 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1884 num_bytes,
1883 parent, root_objectid, (int)owner, 1885 parent, root_objectid, (int)owner,
1884 BTRFS_ADD_DELAYED_REF, NULL); 1886 BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1885 } else { 1887 } else {
1886 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 1888 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1889 num_bytes,
1887 parent, root_objectid, owner, offset, 1890 parent, root_objectid, owner, offset,
1888 BTRFS_ADD_DELAYED_REF, NULL); 1891 BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1889 } 1892 }
1890 return ret; 1893 return ret;
1891} 1894}
@@ -2233,6 +2236,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2233 } 2236 }
2234 2237
2235 /* 2238 /*
2239 * locked_ref is the head node, so we have to go one
2240 * node back for any delayed ref updates
2241 */
2242 ref = select_delayed_ref(locked_ref);
2243
2244 if (ref && ref->seq &&
2245 btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
2246 /*
2247 * there are still refs with lower seq numbers in the
2248 * process of being added. Don't run this ref yet.
2249 */
2250 list_del_init(&locked_ref->cluster);
2251 mutex_unlock(&locked_ref->mutex);
2252 locked_ref = NULL;
2253 delayed_refs->num_heads_ready++;
2254 spin_unlock(&delayed_refs->lock);
2255 cond_resched();
2256 spin_lock(&delayed_refs->lock);
2257 continue;
2258 }
2259
2260 /*
2236 * record the must insert reserved flag before we 2261 * record the must insert reserved flag before we
2237 * drop the spin lock. 2262 * drop the spin lock.
2238 */ 2263 */
@@ -2242,11 +2267,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2242 extent_op = locked_ref->extent_op; 2267 extent_op = locked_ref->extent_op;
2243 locked_ref->extent_op = NULL; 2268 locked_ref->extent_op = NULL;
2244 2269
2245 /*
2246 * locked_ref is the head node, so we have to go one
2247 * node back for any delayed ref updates
2248 */
2249 ref = select_delayed_ref(locked_ref);
2250 if (!ref) { 2270 if (!ref) {
2251 /* All delayed refs have been processed, Go ahead 2271 /* All delayed refs have been processed, Go ahead
2252 * and send the head node to run_one_delayed_ref, 2272 * and send the head node to run_one_delayed_ref,
@@ -2267,9 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2267 BUG_ON(ret); 2287 BUG_ON(ret);
2268 kfree(extent_op); 2288 kfree(extent_op);
2269 2289
2270 cond_resched(); 2290 goto next;
2271 spin_lock(&delayed_refs->lock);
2272 continue;
2273 } 2291 }
2274 2292
2275 list_del_init(&locked_ref->cluster); 2293 list_del_init(&locked_ref->cluster);
@@ -2279,7 +2297,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2279 ref->in_tree = 0; 2297 ref->in_tree = 0;
2280 rb_erase(&ref->rb_node, &delayed_refs->root); 2298 rb_erase(&ref->rb_node, &delayed_refs->root);
2281 delayed_refs->num_entries--; 2299 delayed_refs->num_entries--;
2282 2300 /*
2301 * we modified num_entries, but as we're currently running
2302 * delayed refs, skip
2303 * wake_up(&delayed_refs->seq_wait);
2304 * here.
2305 */
2283 spin_unlock(&delayed_refs->lock); 2306 spin_unlock(&delayed_refs->lock);
2284 2307
2285 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2308 ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2289,13 +2312,34 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2289 btrfs_put_delayed_ref(ref); 2312 btrfs_put_delayed_ref(ref);
2290 kfree(extent_op); 2313 kfree(extent_op);
2291 count++; 2314 count++;
2292 2315next:
2316 do_chunk_alloc(trans, root->fs_info->extent_root,
2317 2 * 1024 * 1024,
2318 btrfs_get_alloc_profile(root, 0),
2319 CHUNK_ALLOC_NO_FORCE);
2293 cond_resched(); 2320 cond_resched();
2294 spin_lock(&delayed_refs->lock); 2321 spin_lock(&delayed_refs->lock);
2295 } 2322 }
2296 return count; 2323 return count;
2297} 2324}
2298 2325
2326
2327static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
2328 unsigned long num_refs)
2329{
2330 struct list_head *first_seq = delayed_refs->seq_head.next;
2331
2332 spin_unlock(&delayed_refs->lock);
2333 pr_debug("waiting for more refs (num %ld, first %p)\n",
2334 num_refs, first_seq);
2335 wait_event(delayed_refs->seq_wait,
2336 num_refs != delayed_refs->num_entries ||
2337 delayed_refs->seq_head.next != first_seq);
2338 pr_debug("done waiting for more refs (num %ld, first %p)\n",
2339 delayed_refs->num_entries, delayed_refs->seq_head.next);
2340 spin_lock(&delayed_refs->lock);
2341}
2342
2299/* 2343/*
2300 * this starts processing the delayed reference count updates and 2344 * this starts processing the delayed reference count updates and
2301 * extent insertions we have queued up so far. count can be 2345 * extent insertions we have queued up so far. count can be
@@ -2311,15 +2355,23 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2311 struct btrfs_delayed_ref_node *ref; 2355 struct btrfs_delayed_ref_node *ref;
2312 struct list_head cluster; 2356 struct list_head cluster;
2313 int ret; 2357 int ret;
2358 u64 delayed_start;
2314 int run_all = count == (unsigned long)-1; 2359 int run_all = count == (unsigned long)-1;
2315 int run_most = 0; 2360 int run_most = 0;
2361 unsigned long num_refs = 0;
2362 int consider_waiting;
2316 2363
2317 if (root == root->fs_info->extent_root) 2364 if (root == root->fs_info->extent_root)
2318 root = root->fs_info->tree_root; 2365 root = root->fs_info->tree_root;
2319 2366
2367 do_chunk_alloc(trans, root->fs_info->extent_root,
2368 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2369 CHUNK_ALLOC_NO_FORCE);
2370
2320 delayed_refs = &trans->transaction->delayed_refs; 2371 delayed_refs = &trans->transaction->delayed_refs;
2321 INIT_LIST_HEAD(&cluster); 2372 INIT_LIST_HEAD(&cluster);
2322again: 2373again:
2374 consider_waiting = 0;
2323 spin_lock(&delayed_refs->lock); 2375 spin_lock(&delayed_refs->lock);
2324 if (count == 0) { 2376 if (count == 0) {
2325 count = delayed_refs->num_entries * 2; 2377 count = delayed_refs->num_entries * 2;
@@ -2336,11 +2388,35 @@ again:
2336 * of refs to process starting at the first one we are able to 2388 * of refs to process starting at the first one we are able to
2337 * lock 2389 * lock
2338 */ 2390 */
2391 delayed_start = delayed_refs->run_delayed_start;
2339 ret = btrfs_find_ref_cluster(trans, &cluster, 2392 ret = btrfs_find_ref_cluster(trans, &cluster,
2340 delayed_refs->run_delayed_start); 2393 delayed_refs->run_delayed_start);
2341 if (ret) 2394 if (ret)
2342 break; 2395 break;
2343 2396
2397 if (delayed_start >= delayed_refs->run_delayed_start) {
2398 if (consider_waiting == 0) {
2399 /*
2400 * btrfs_find_ref_cluster looped. let's do one
2401 * more cycle. if we don't run any delayed ref
2402 * during that cycle (because we can't because
2403 * all of them are blocked) and if the number of
2404 * refs doesn't change, we avoid busy waiting.
2405 */
2406 consider_waiting = 1;
2407 num_refs = delayed_refs->num_entries;
2408 } else {
2409 wait_for_more_refs(delayed_refs, num_refs);
2410 /*
2411 * after waiting, things have changed. we
2412 * dropped the lock and someone else might have
2413 * run some refs, built new clusters and so on.
2414 * therefore, we restart staleness detection.
2415 */
2416 consider_waiting = 0;
2417 }
2418 }
2419
2344 ret = run_clustered_refs(trans, root, &cluster); 2420 ret = run_clustered_refs(trans, root, &cluster);
2345 BUG_ON(ret < 0); 2421 BUG_ON(ret < 0);
2346 2422
@@ -2348,6 +2424,11 @@ again:
2348 2424
2349 if (count == 0) 2425 if (count == 0)
2350 break; 2426 break;
2427
2428 if (ret || delayed_refs->run_delayed_start == 0) {
2429 /* refs were run, let's reset staleness detection */
2430 consider_waiting = 0;
2431 }
2351 } 2432 }
2352 2433
2353 if (run_all) { 2434 if (run_all) {
@@ -2405,7 +2486,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2405 extent_op->update_key = 0; 2486 extent_op->update_key = 0;
2406 extent_op->is_data = is_data ? 1 : 0; 2487 extent_op->is_data = is_data ? 1 : 0;
2407 2488
2408 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); 2489 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2490 num_bytes, extent_op);
2409 if (ret) 2491 if (ret)
2410 kfree(extent_op); 2492 kfree(extent_op);
2411 return ret; 2493 return ret;
@@ -2590,7 +2672,7 @@ out:
2590static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2672static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2591 struct btrfs_root *root, 2673 struct btrfs_root *root,
2592 struct extent_buffer *buf, 2674 struct extent_buffer *buf,
2593 int full_backref, int inc) 2675 int full_backref, int inc, int for_cow)
2594{ 2676{
2595 u64 bytenr; 2677 u64 bytenr;
2596 u64 num_bytes; 2678 u64 num_bytes;
@@ -2603,7 +2685,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2603 int level; 2685 int level;
2604 int ret = 0; 2686 int ret = 0;
2605 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2687 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2606 u64, u64, u64, u64, u64, u64); 2688 u64, u64, u64, u64, u64, u64, int);
2607 2689
2608 ref_root = btrfs_header_owner(buf); 2690 ref_root = btrfs_header_owner(buf);
2609 nritems = btrfs_header_nritems(buf); 2691 nritems = btrfs_header_nritems(buf);
@@ -2640,14 +2722,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2640 key.offset -= btrfs_file_extent_offset(buf, fi); 2722 key.offset -= btrfs_file_extent_offset(buf, fi);
2641 ret = process_func(trans, root, bytenr, num_bytes, 2723 ret = process_func(trans, root, bytenr, num_bytes,
2642 parent, ref_root, key.objectid, 2724 parent, ref_root, key.objectid,
2643 key.offset); 2725 key.offset, for_cow);
2644 if (ret) 2726 if (ret)
2645 goto fail; 2727 goto fail;
2646 } else { 2728 } else {
2647 bytenr = btrfs_node_blockptr(buf, i); 2729 bytenr = btrfs_node_blockptr(buf, i);
2648 num_bytes = btrfs_level_size(root, level - 1); 2730 num_bytes = btrfs_level_size(root, level - 1);
2649 ret = process_func(trans, root, bytenr, num_bytes, 2731 ret = process_func(trans, root, bytenr, num_bytes,
2650 parent, ref_root, level - 1, 0); 2732 parent, ref_root, level - 1, 0,
2733 for_cow);
2651 if (ret) 2734 if (ret)
2652 goto fail; 2735 goto fail;
2653 } 2736 }
@@ -2659,15 +2742,15 @@ fail:
2659} 2742}
2660 2743
2661int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2744int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2662 struct extent_buffer *buf, int full_backref) 2745 struct extent_buffer *buf, int full_backref, int for_cow)
2663{ 2746{
2664 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 2747 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2665} 2748}
2666 2749
2667int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2750int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2668 struct extent_buffer *buf, int full_backref) 2751 struct extent_buffer *buf, int full_backref, int for_cow)
2669{ 2752{
2670 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 2753 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2671} 2754}
2672 2755
2673static int write_one_cache_group(struct btrfs_trans_handle *trans, 2756static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2993,9 +3076,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2993 INIT_LIST_HEAD(&found->block_groups[i]); 3076 INIT_LIST_HEAD(&found->block_groups[i]);
2994 init_rwsem(&found->groups_sem); 3077 init_rwsem(&found->groups_sem);
2995 spin_lock_init(&found->lock); 3078 spin_lock_init(&found->lock);
2996 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | 3079 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
2997 BTRFS_BLOCK_GROUP_SYSTEM |
2998 BTRFS_BLOCK_GROUP_METADATA);
2999 found->total_bytes = total_bytes; 3080 found->total_bytes = total_bytes;
3000 found->disk_total = total_bytes * factor; 3081 found->disk_total = total_bytes * factor;
3001 found->bytes_used = bytes_used; 3082 found->bytes_used = bytes_used;
@@ -3016,20 +3097,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3016 3097
3017static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3098static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3018{ 3099{
3019 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | 3100 u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3020 BTRFS_BLOCK_GROUP_RAID1 | 3101
3021 BTRFS_BLOCK_GROUP_RAID10 | 3102 /* chunk -> extended profile */
3022 BTRFS_BLOCK_GROUP_DUP); 3103 if (extra_flags == 0)
3023 if (extra_flags) { 3104 extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3024 if (flags & BTRFS_BLOCK_GROUP_DATA) 3105
3025 fs_info->avail_data_alloc_bits |= extra_flags; 3106 if (flags & BTRFS_BLOCK_GROUP_DATA)
3026 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3107 fs_info->avail_data_alloc_bits |= extra_flags;
3027 fs_info->avail_metadata_alloc_bits |= extra_flags; 3108 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3028 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3109 fs_info->avail_metadata_alloc_bits |= extra_flags;
3029 fs_info->avail_system_alloc_bits |= extra_flags; 3110 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3030 } 3111 fs_info->avail_system_alloc_bits |= extra_flags;
3031} 3112}
3032 3113
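
set_avail_alloc_bits() now records profiles in an extended in-memory format where "single" (no RAID bit set) gets a bit of its own, so an all-zero profile is no longer ambiguous in the avail_*_alloc_bits words. A toy model of the chunk-to-extended conversion; the bit values below are invented for illustration, only the shape of the logic follows the hunk:

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative bit layout only, not the kernel's actual values. */
    #define BG_RAID0     (1ULL << 0)
    #define BG_RAID1     (1ULL << 1)
    #define BG_DUP       (1ULL << 2)
    #define BG_RAID10    (1ULL << 3)
    #define PROFILE_MASK (BG_RAID0 | BG_RAID1 | BG_DUP | BG_RAID10)
    #define AVAIL_SINGLE (1ULL << 48)  /* extended-only "single" bit */

    /* chunk format -> extended format, as in set_avail_alloc_bits() */
    static uint64_t chunk_to_extended(uint64_t flags)
    {
        uint64_t extra = flags & PROFILE_MASK;

        if (extra == 0)          /* no RAID bit set means "single" */
            extra = AVAIL_SINGLE;
        return extra;
    }

    int main(void)
    {
        printf("raid1  -> %#llx\n",
               (unsigned long long)chunk_to_extended(BG_RAID1));
        printf("single -> %#llx\n",
               (unsigned long long)chunk_to_extended(0));
        return 0;
    }

The later clear_avail_alloc_bits() hunk in this patch is the exact complement: it converts the same way and then clears the bit when the last block group of that type and profile disappears.
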
3114/*
3115 * @flags: available profiles in extended format (see ctree.h)
3116 *
3117 * Returns reduced profile in chunk format. If profile changing is in
3118 * progress (either running or paused) picks the target profile (if it's
3119 * already available), otherwise falls back to plain reducing.
3120 */
3033u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3121u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3034{ 3122{
3035 /* 3123 /*
@@ -3040,6 +3128,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3040 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3128 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3041 root->fs_info->fs_devices->missing_devices; 3129 root->fs_info->fs_devices->missing_devices;
3042 3130
3131 /* pick restriper's target profile if it's available */
3132 spin_lock(&root->fs_info->balance_lock);
3133 if (root->fs_info->balance_ctl) {
3134 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
3135 u64 tgt = 0;
3136
3137 if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
3138 (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3139 (flags & bctl->data.target)) {
3140 tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3141 } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
3142 (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3143 (flags & bctl->sys.target)) {
3144 tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3145 } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
3146 (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3147 (flags & bctl->meta.target)) {
3148 tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3149 }
3150
3151 if (tgt) {
3152 spin_unlock(&root->fs_info->balance_lock);
3153 flags = tgt;
3154 goto out;
3155 }
3156 }
3157 spin_unlock(&root->fs_info->balance_lock);
3158
3043 if (num_devices == 1) 3159 if (num_devices == 1)
3044 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3160 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3045 if (num_devices < 4) 3161 if (num_devices < 4)
@@ -3059,22 +3175,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3059 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3175 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3060 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3176 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3061 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3177 (flags & BTRFS_BLOCK_GROUP_RAID10) |
3062 (flags & BTRFS_BLOCK_GROUP_DUP))) 3178 (flags & BTRFS_BLOCK_GROUP_DUP))) {
3063 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3179 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3180 }
3181
3182out:
3183 /* extended -> chunk profile */
3184 flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3064 return flags; 3185 return flags;
3065} 3186}
3066 3187
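
The new block under balance_lock makes btrfs_reduce_alloc_profile() prefer the restriper's convert target over plain reduction, but only when the target profile is already present among the available bits. A sketch of that selection order, using made-up flag values and a simplified balance_ctl:

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative flag values only. */
    #define BG_DATA    (1ULL << 0)
    #define BG_SYSTEM  (1ULL << 1)
    #define BG_META    (1ULL << 2)
    #define BG_RAID1   (1ULL << 4)

    struct balance_args { int convert; uint64_t target; };
    struct balance_ctl  { struct balance_args data, sys, meta; };

    /* Target selection as in the new code: honour a pending convert
     * only if the target profile is already available in "flags",
     * otherwise fall back to plain reducing (return 0 here). */
    static uint64_t pick_target(const struct balance_ctl *bctl,
                                uint64_t flags)
    {
        if ((flags & BG_DATA) && bctl->data.convert &&
            (flags & bctl->data.target))
            return BG_DATA | bctl->data.target;
        if ((flags & BG_SYSTEM) && bctl->sys.convert &&
            (flags & bctl->sys.target))
            return BG_SYSTEM | bctl->sys.target;
        if ((flags & BG_META) && bctl->meta.convert &&
            (flags & bctl->meta.target))
            return BG_META | bctl->meta.target;
        return 0;
    }

    int main(void)
    {
        struct balance_ctl bctl = { { 1, BG_RAID1 }, { 0, 0 }, { 0, 0 } };
        uint64_t tgt = pick_target(&bctl, BG_DATA | BG_RAID1);

        printf("target=%#llx\n", (unsigned long long)tgt);
        return 0;
    }
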
3067static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3188static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3068{ 3189{
3069 if (flags & BTRFS_BLOCK_GROUP_DATA) 3190 if (flags & BTRFS_BLOCK_GROUP_DATA)
3070 flags |= root->fs_info->avail_data_alloc_bits & 3191 flags |= root->fs_info->avail_data_alloc_bits;
3071 root->fs_info->data_alloc_profile;
3072 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3192 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3073 flags |= root->fs_info->avail_system_alloc_bits & 3193 flags |= root->fs_info->avail_system_alloc_bits;
3074 root->fs_info->system_alloc_profile;
3075 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3194 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3076 flags |= root->fs_info->avail_metadata_alloc_bits & 3195 flags |= root->fs_info->avail_metadata_alloc_bits;
3077 root->fs_info->metadata_alloc_profile; 3196
3078 return btrfs_reduce_alloc_profile(root, flags); 3197 return btrfs_reduce_alloc_profile(root, flags);
3079} 3198}
3080 3199
@@ -3191,6 +3310,8 @@ commit_trans:
3191 return -ENOSPC; 3310 return -ENOSPC;
3192 } 3311 }
3193 data_sinfo->bytes_may_use += bytes; 3312 data_sinfo->bytes_may_use += bytes;
3313 trace_btrfs_space_reservation(root->fs_info, "space_info",
3314 (u64)data_sinfo, bytes, 1);
3194 spin_unlock(&data_sinfo->lock); 3315 spin_unlock(&data_sinfo->lock);
3195 3316
3196 return 0; 3317 return 0;
@@ -3210,6 +3331,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3210 data_sinfo = BTRFS_I(inode)->space_info; 3331 data_sinfo = BTRFS_I(inode)->space_info;
3211 spin_lock(&data_sinfo->lock); 3332 spin_lock(&data_sinfo->lock);
3212 data_sinfo->bytes_may_use -= bytes; 3333 data_sinfo->bytes_may_use -= bytes;
3334 trace_btrfs_space_reservation(root->fs_info, "space_info",
3335 (u64)data_sinfo, bytes, 0);
3213 spin_unlock(&data_sinfo->lock); 3336 spin_unlock(&data_sinfo->lock);
3214} 3337}
3215 3338
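
Each bytes_may_use adjustment now emits a trace_btrfs_space_reservation event carrying a type string, an object id (the kernel passes the space_info pointer cast to u64), the byte count, and a 1/0 reserve flag. A userspace stand-in showing how that flag lets a trace post-processor pair reservations with releases:

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-in for the tracepoint: type string, opaque object id,
     * byte count, and reserve=1 / release=0 so post-processing can
     * match each release to the reservation it undoes. */
    static void trace_space_reservation(const char *type, uint64_t obj,
                                        uint64_t bytes, int reserve)
    {
        printf("%s obj=%#llx %s %llu bytes\n", type,
               (unsigned long long)obj,
               reserve ? "reserve" : "release",
               (unsigned long long)bytes);
    }

    int main(void)
    {
        uint64_t sinfo = 0xffff880012345678ULL;  /* fake pointer value */

        trace_space_reservation("space_info", sinfo, 8192, 1);
        trace_space_reservation("space_info", sinfo, 8192, 0);
        return 0;
    }
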
@@ -3257,27 +3380,15 @@ static int should_alloc_chunk(struct btrfs_root *root,
3257 if (num_bytes - num_allocated < thresh) 3380 if (num_bytes - num_allocated < thresh)
3258 return 1; 3381 return 1;
3259 } 3382 }
3260
3261 /*
3262 * we have two similar checks here, one based on percentage
3263 * and once based on a hard number of 256MB. The idea
3264 * is that if we have a good amount of free
3265 * room, don't allocate a chunk. A good mount is
3266 * less than 80% utilized of the chunks we have allocated,
3267 * or more than 256MB free
3268 */
3269 if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3270 return 0;
3271
3272 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3273 return 0;
3274
3275 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3383 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3276 3384
3277 /* 256MB or 5% of the FS */ 3385 /* 256MB or 2% of the FS */
3278 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3386 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
3387 /* system chunks need a much smaller threshold */
3388 if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3389 thresh = 32 * 1024 * 1024;
3279 3390
3280 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) 3391 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
3281 return 0; 3392 return 0;
3282 return 1; 3393 return 1;
3283} 3394}
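
The rewritten should_alloc_chunk() drops the two overlapping "enough free room" checks and keeps a single pair of thresholds: 256MB or 2% of the filesystem for data and metadata, a flat 32MB for system chunks, and a refusal to allocate while the space_info is less than 80% used. A condensed, runnable rendering of that tail; helper names follow the kernel's, constants are spelled out:

    #include <stdio.h>
    #include <stdint.h>

    #define MiB (1024ULL * 1024)
    #define GiB (1024 * MiB)

    static uint64_t div_factor_fine(uint64_t num, int factor) /* percent */
    {
        return num * factor / 100;
    }

    static uint64_t div_factor(uint64_t num, int factor)      /* tenths */
    {
        return num * factor / 10;
    }

    static int should_alloc(uint64_t fs_total, uint64_t num_bytes,
                            uint64_t bytes_used, int is_system)
    {
        /* 256MB or 2% of the filesystem, whichever is larger... */
        uint64_t thresh = 256 * MiB;

        if (div_factor_fine(fs_total, 2) > thresh)
            thresh = div_factor_fine(fs_total, 2);
        /* ...except system chunks, which stay tiny. */
        if (is_system)
            thresh = 32 * MiB;

        if (num_bytes > thresh && bytes_used < div_factor(num_bytes, 8))
            return 0;   /* plenty of room already, don't allocate */
        return 1;
    }

    int main(void)
    {
        /* 1 TiB fs, 100 GiB data space_info, 50 GiB used: 2% is about
         * 20 GiB, the space_info exceeds that and is only 50% used,
         * so no new chunk is allocated (prints 0). */
        printf("%d\n", should_alloc(1024 * GiB, 100 * GiB, 50 * GiB, 0));
        return 0;
    }
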
@@ -3291,7 +3402,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3291 int wait_for_alloc = 0; 3402 int wait_for_alloc = 0;
3292 int ret = 0; 3403 int ret = 0;
3293 3404
3294 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3405 BUG_ON(!profile_is_valid(flags, 0));
3295 3406
3296 space_info = __find_space_info(extent_root->fs_info, flags); 3407 space_info = __find_space_info(extent_root->fs_info, flags);
3297 if (!space_info) { 3408 if (!space_info) {
@@ -3582,6 +3693,10 @@ again:
3582 if (used <= space_info->total_bytes) { 3693 if (used <= space_info->total_bytes) {
3583 if (used + orig_bytes <= space_info->total_bytes) { 3694 if (used + orig_bytes <= space_info->total_bytes) {
3584 space_info->bytes_may_use += orig_bytes; 3695 space_info->bytes_may_use += orig_bytes;
3696 trace_btrfs_space_reservation(root->fs_info,
3697 "space_info",
3698 (u64)space_info,
3699 orig_bytes, 1);
3585 ret = 0; 3700 ret = 0;
3586 } else { 3701 } else {
3587 /* 3702 /*
@@ -3649,6 +3764,10 @@ again:
3649 3764
3650 if (used + num_bytes < space_info->total_bytes + avail) { 3765 if (used + num_bytes < space_info->total_bytes + avail) {
3651 space_info->bytes_may_use += orig_bytes; 3766 space_info->bytes_may_use += orig_bytes;
3767 trace_btrfs_space_reservation(root->fs_info,
3768 "space_info",
3769 (u64)space_info,
3770 orig_bytes, 1);
3652 ret = 0; 3771 ret = 0;
3653 } else { 3772 } else {
3654 wait_ordered = true; 3773 wait_ordered = true;
@@ -3755,7 +3874,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3755 spin_unlock(&block_rsv->lock); 3874 spin_unlock(&block_rsv->lock);
3756} 3875}
3757 3876
3758static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, 3877static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
3878 struct btrfs_block_rsv *block_rsv,
3759 struct btrfs_block_rsv *dest, u64 num_bytes) 3879 struct btrfs_block_rsv *dest, u64 num_bytes)
3760{ 3880{
3761 struct btrfs_space_info *space_info = block_rsv->space_info; 3881 struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3791,6 +3911,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3791 if (num_bytes) { 3911 if (num_bytes) {
3792 spin_lock(&space_info->lock); 3912 spin_lock(&space_info->lock);
3793 space_info->bytes_may_use -= num_bytes; 3913 space_info->bytes_may_use -= num_bytes;
3914 trace_btrfs_space_reservation(fs_info, "space_info",
3915 (u64)space_info,
3916 num_bytes, 0);
3794 space_info->reservation_progress++; 3917 space_info->reservation_progress++;
3795 spin_unlock(&space_info->lock); 3918 spin_unlock(&space_info->lock);
3796 } 3919 }
@@ -3947,7 +4070,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
3947 if (global_rsv->full || global_rsv == block_rsv || 4070 if (global_rsv->full || global_rsv == block_rsv ||
3948 block_rsv->space_info != global_rsv->space_info) 4071 block_rsv->space_info != global_rsv->space_info)
3949 global_rsv = NULL; 4072 global_rsv = NULL;
3950 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); 4073 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4074 num_bytes);
3951} 4075}
3952 4076
3953/* 4077/*
@@ -4006,11 +4130,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4006 num_bytes = sinfo->total_bytes - num_bytes; 4130 num_bytes = sinfo->total_bytes - num_bytes;
4007 block_rsv->reserved += num_bytes; 4131 block_rsv->reserved += num_bytes;
4008 sinfo->bytes_may_use += num_bytes; 4132 sinfo->bytes_may_use += num_bytes;
4133 trace_btrfs_space_reservation(fs_info, "space_info",
4134 (u64)sinfo, num_bytes, 1);
4009 } 4135 }
4010 4136
4011 if (block_rsv->reserved >= block_rsv->size) { 4137 if (block_rsv->reserved >= block_rsv->size) {
4012 num_bytes = block_rsv->reserved - block_rsv->size; 4138 num_bytes = block_rsv->reserved - block_rsv->size;
4013 sinfo->bytes_may_use -= num_bytes; 4139 sinfo->bytes_may_use -= num_bytes;
4140 trace_btrfs_space_reservation(fs_info, "space_info",
4141 (u64)sinfo, num_bytes, 0);
4014 sinfo->reservation_progress++; 4142 sinfo->reservation_progress++;
4015 block_rsv->reserved = block_rsv->size; 4143 block_rsv->reserved = block_rsv->size;
4016 block_rsv->full = 1; 4144 block_rsv->full = 1;
@@ -4045,7 +4173,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4045 4173
4046static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4174static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4047{ 4175{
4048 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); 4176 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4177 (u64)-1);
4049 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4178 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4050 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4179 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4051 WARN_ON(fs_info->trans_block_rsv.size > 0); 4180 WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4062,6 +4191,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4062 if (!trans->bytes_reserved) 4191 if (!trans->bytes_reserved)
4063 return; 4192 return;
4064 4193
4194 trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans,
4195 trans->bytes_reserved, 0);
4065 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4196 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4066 trans->bytes_reserved = 0; 4197 trans->bytes_reserved = 0;
4067} 4198}
@@ -4079,6 +4210,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4079 * when we are truly done with the orphan item. 4210 * when we are truly done with the orphan item.
4080 */ 4211 */
4081 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4212 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4213 trace_btrfs_space_reservation(root->fs_info, "orphan",
4214 btrfs_ino(inode), num_bytes, 1);
4082 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4215 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4083} 4216}
4084 4217
@@ -4086,6 +4219,8 @@ void btrfs_orphan_release_metadata(struct inode *inode)
4086{ 4219{
4087 struct btrfs_root *root = BTRFS_I(inode)->root; 4220 struct btrfs_root *root = BTRFS_I(inode)->root;
4088 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4221 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4222 trace_btrfs_space_reservation(root->fs_info, "orphan",
4223 btrfs_ino(inode), num_bytes, 0);
4089 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4224 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4090} 4225}
4091 4226
@@ -4213,12 +4348,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4213 /* Need to be holding the i_mutex here if we aren't free space cache */ 4348 /* Need to be holding the i_mutex here if we aren't free space cache */
4214 if (btrfs_is_free_space_inode(root, inode)) 4349 if (btrfs_is_free_space_inode(root, inode))
4215 flush = 0; 4350 flush = 0;
4216 else
4217 WARN_ON(!mutex_is_locked(&inode->i_mutex));
4218 4351
4219 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4352 if (flush && btrfs_transaction_in_commit(root->fs_info))
4220 schedule_timeout(1); 4353 schedule_timeout(1);
4221 4354
4355 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4222 num_bytes = ALIGN(num_bytes, root->sectorsize); 4356 num_bytes = ALIGN(num_bytes, root->sectorsize);
4223 4357
4224 spin_lock(&BTRFS_I(inode)->lock); 4358 spin_lock(&BTRFS_I(inode)->lock);
@@ -4266,8 +4400,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4266 if (dropped) 4400 if (dropped)
4267 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4401 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4268 4402
4269 if (to_free) 4403 if (to_free) {
4270 btrfs_block_rsv_release(root, block_rsv, to_free); 4404 btrfs_block_rsv_release(root, block_rsv, to_free);
4405 trace_btrfs_space_reservation(root->fs_info,
4406 "delalloc",
4407 btrfs_ino(inode),
4408 to_free, 0);
4409 }
4410 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4271 return ret; 4411 return ret;
4272 } 4412 }
4273 4413
@@ -4278,7 +4418,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4278 } 4418 }
4279 BTRFS_I(inode)->reserved_extents += nr_extents; 4419 BTRFS_I(inode)->reserved_extents += nr_extents;
4280 spin_unlock(&BTRFS_I(inode)->lock); 4420 spin_unlock(&BTRFS_I(inode)->lock);
4421 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4281 4422
4423 if (to_reserve)
4424 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4425 btrfs_ino(inode), to_reserve, 1);
4282 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4426 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4283 4427
4284 return 0; 4428 return 0;
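
btrfs_delalloc_reserve_metadata() stops asserting i_mutex and instead serializes whole reservations with a new per-inode delalloc_mutex, so the outstanding/reserved extent counters cannot be recomputed concurrently while the short spinlocked sections stay as they were. A rough pthread analogue of that locking shape; delalloc_mutex is the real field name, everything else is a simplified stand-in:

    #include <pthread.h>
    #include <stdio.h>

    struct toy_inode {
        pthread_mutex_t delalloc_mutex;  /* one reservation at a time */
        pthread_mutex_t lock;            /* stands in for the spinlock */
        int outstanding_extents;
        int reserved_extents;
    };

    static int reserve_metadata(struct toy_inode *inode, int nr_extents)
    {
        pthread_mutex_lock(&inode->delalloc_mutex);

        pthread_mutex_lock(&inode->lock);
        inode->outstanding_extents += nr_extents;
        /* ...decide how much to reserve from both counters... */
        inode->reserved_extents += nr_extents;
        pthread_mutex_unlock(&inode->lock);

        pthread_mutex_unlock(&inode->delalloc_mutex);
        return 0;
    }

    int main(void)
    {
        struct toy_inode ino = { PTHREAD_MUTEX_INITIALIZER,
                                 PTHREAD_MUTEX_INITIALIZER, 0, 0 };

        reserve_metadata(&ino, 1);
        printf("reserved=%d\n", ino.reserved_extents);
        return 0;
    }
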
@@ -4308,6 +4452,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4308 if (dropped > 0) 4452 if (dropped > 0)
4309 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4453 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4310 4454
4455 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4456 btrfs_ino(inode), to_free, 0);
4311 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4457 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4312 to_free); 4458 to_free);
4313} 4459}
@@ -4562,7 +4708,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4562 cache->reserved += num_bytes; 4708 cache->reserved += num_bytes;
4563 space_info->bytes_reserved += num_bytes; 4709 space_info->bytes_reserved += num_bytes;
4564 if (reserve == RESERVE_ALLOC) { 4710 if (reserve == RESERVE_ALLOC) {
4565 BUG_ON(space_info->bytes_may_use < num_bytes); 4711 trace_btrfs_space_reservation(cache->fs_info,
4712 "space_info",
4713 (u64)space_info,
4714 num_bytes, 0);
4566 space_info->bytes_may_use -= num_bytes; 4715 space_info->bytes_may_use -= num_bytes;
4567 } 4716 }
4568 } 4717 }
@@ -4928,6 +5077,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4928 rb_erase(&head->node.rb_node, &delayed_refs->root); 5077 rb_erase(&head->node.rb_node, &delayed_refs->root);
4929 5078
4930 delayed_refs->num_entries--; 5079 delayed_refs->num_entries--;
5080 if (waitqueue_active(&delayed_refs->seq_wait))
5081 wake_up(&delayed_refs->seq_wait);
4931 5082
4932 /* 5083 /*
4933 * we don't take a ref on the node because we're removing it from the 5084 * we don't take a ref on the node because we're removing it from the
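
check_ref_cleanup() now pokes seq_wait whenever a delayed-ref head is removed, using the usual "waitqueue_active() before wake_up()" test to skip the wakeup entirely when nobody is sleeping. A rough pthread analogue; note the kernel's waitqueue_active() is lockless, while this sketch keeps a waiter count under a mutex:

    #include <pthread.h>
    #include <stdio.h>

    struct waitq {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        int             waiters;
    };

    static void wake_if_active(struct waitq *wq)
    {
        pthread_mutex_lock(&wq->lock);
        if (wq->waiters)                         /* waitqueue_active() */
            pthread_cond_broadcast(&wq->cond);   /* wake_up() */
        pthread_mutex_unlock(&wq->lock);
    }

    int main(void)
    {
        struct waitq wq = { PTHREAD_MUTEX_INITIALIZER,
                            PTHREAD_COND_INITIALIZER, 0 };

        wake_if_active(&wq);   /* no waiters: broadcast is skipped */
        printf("ok\n");
        return 0;
    }
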
@@ -4955,16 +5106,17 @@ out:
4955void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5106void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4956 struct btrfs_root *root, 5107 struct btrfs_root *root,
4957 struct extent_buffer *buf, 5108 struct extent_buffer *buf,
4958 u64 parent, int last_ref) 5109 u64 parent, int last_ref, int for_cow)
4959{ 5110{
4960 struct btrfs_block_group_cache *cache = NULL; 5111 struct btrfs_block_group_cache *cache = NULL;
4961 int ret; 5112 int ret;
4962 5113
4963 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5114 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4964 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, 5115 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
4965 parent, root->root_key.objectid, 5116 buf->start, buf->len,
4966 btrfs_header_level(buf), 5117 parent, root->root_key.objectid,
4967 BTRFS_DROP_DELAYED_REF, NULL); 5118 btrfs_header_level(buf),
5119 BTRFS_DROP_DELAYED_REF, NULL, for_cow);
4968 BUG_ON(ret); 5120 BUG_ON(ret);
4969 } 5121 }
4970 5122
@@ -4999,12 +5151,12 @@ out:
4999 btrfs_put_block_group(cache); 5151 btrfs_put_block_group(cache);
5000} 5152}
5001 5153
5002int btrfs_free_extent(struct btrfs_trans_handle *trans, 5154int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5003 struct btrfs_root *root, 5155 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5004 u64 bytenr, u64 num_bytes, u64 parent, 5156 u64 owner, u64 offset, int for_cow)
5005 u64 root_objectid, u64 owner, u64 offset)
5006{ 5157{
5007 int ret; 5158 int ret;
5159 struct btrfs_fs_info *fs_info = root->fs_info;
5008 5160
5009 /* 5161 /*
5010 * tree log blocks never actually go into the extent allocation 5162 * tree log blocks never actually go into the extent allocation
@@ -5016,14 +5168,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
5016 btrfs_pin_extent(root, bytenr, num_bytes, 1); 5168 btrfs_pin_extent(root, bytenr, num_bytes, 1);
5017 ret = 0; 5169 ret = 0;
5018 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5170 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5019 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 5171 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5172 num_bytes,
5020 parent, root_objectid, (int)owner, 5173 parent, root_objectid, (int)owner,
5021 BTRFS_DROP_DELAYED_REF, NULL); 5174 BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5022 BUG_ON(ret); 5175 BUG_ON(ret);
5023 } else { 5176 } else {
5024 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 5177 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5025 parent, root_objectid, owner, 5178 num_bytes,
5026 offset, BTRFS_DROP_DELAYED_REF, NULL); 5179 parent, root_objectid, owner,
5180 offset, BTRFS_DROP_DELAYED_REF,
5181 NULL, for_cow);
5027 BUG_ON(ret); 5182 BUG_ON(ret);
5028 } 5183 }
5029 return ret; 5184 return ret;
@@ -5146,6 +5301,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5146 ins->objectid = 0; 5301 ins->objectid = 0;
5147 ins->offset = 0; 5302 ins->offset = 0;
5148 5303
5304 trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5305
5149 space_info = __find_space_info(root->fs_info, data); 5306 space_info = __find_space_info(root->fs_info, data);
5150 if (!space_info) { 5307 if (!space_info) {
5151 printk(KERN_ERR "No space info for %llu\n", data); 5308 printk(KERN_ERR "No space info for %llu\n", data);
@@ -5295,15 +5452,6 @@ alloc:
5295 if (unlikely(block_group->ro)) 5452 if (unlikely(block_group->ro))
5296 goto loop; 5453 goto loop;
5297 5454
5298 spin_lock(&block_group->free_space_ctl->tree_lock);
5299 if (cached &&
5300 block_group->free_space_ctl->free_space <
5301 num_bytes + empty_cluster + empty_size) {
5302 spin_unlock(&block_group->free_space_ctl->tree_lock);
5303 goto loop;
5304 }
5305 spin_unlock(&block_group->free_space_ctl->tree_lock);
5306
5307 /* 5455 /*
5308 * Ok we want to try and use the cluster allocator, so 5456 * Ok we want to try and use the cluster allocator, so
5309 * lets look there 5457 * lets look there
@@ -5331,6 +5479,8 @@ alloc:
5331 if (offset) { 5479 if (offset) {
5332 /* we have a block, we're done */ 5480 /* we have a block, we're done */
5333 spin_unlock(&last_ptr->refill_lock); 5481 spin_unlock(&last_ptr->refill_lock);
5482 trace_btrfs_reserve_extent_cluster(root,
5483 block_group, search_start, num_bytes);
5334 goto checks; 5484 goto checks;
5335 } 5485 }
5336 5486
@@ -5349,8 +5499,15 @@ refill_cluster:
5349 * plenty of times and not have found 5499 * plenty of times and not have found
5350 * anything, so we are likely way too 5500 * anything, so we are likely way too
5351 * fragmented for the clustering stuff to find 5501 * fragmented for the clustering stuff to find
5352 * anything. */ 5502 * anything.
5353 if (loop >= LOOP_NO_EMPTY_SIZE) { 5503 *
5504 * However, if the cluster is taken from the
5505 * current block group, release the cluster
5506 * first, so that we stand a better chance of
5507 * succeeding in the unclustered
5508 * allocation. */
5509 if (loop >= LOOP_NO_EMPTY_SIZE &&
5510 last_ptr->block_group != block_group) {
5354 spin_unlock(&last_ptr->refill_lock); 5511 spin_unlock(&last_ptr->refill_lock);
5355 goto unclustered_alloc; 5512 goto unclustered_alloc;
5356 } 5513 }
@@ -5361,6 +5518,11 @@ refill_cluster:
5361 */ 5518 */
5362 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5519 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5363 5520
5521 if (loop >= LOOP_NO_EMPTY_SIZE) {
5522 spin_unlock(&last_ptr->refill_lock);
5523 goto unclustered_alloc;
5524 }
5525
5364 /* allocate a cluster in this block group */ 5526 /* allocate a cluster in this block group */
5365 ret = btrfs_find_space_cluster(trans, root, 5527 ret = btrfs_find_space_cluster(trans, root,
5366 block_group, last_ptr, 5528 block_group, last_ptr,
@@ -5377,6 +5539,9 @@ refill_cluster:
5377 if (offset) { 5539 if (offset) {
5378 /* we found one, proceed */ 5540 /* we found one, proceed */
5379 spin_unlock(&last_ptr->refill_lock); 5541 spin_unlock(&last_ptr->refill_lock);
5542 trace_btrfs_reserve_extent_cluster(root,
5543 block_group, search_start,
5544 num_bytes);
5380 goto checks; 5545 goto checks;
5381 } 5546 }
5382 } else if (!cached && loop > LOOP_CACHING_NOWAIT 5547 } else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5401,6 +5566,15 @@ refill_cluster:
5401 } 5566 }
5402 5567
5403unclustered_alloc: 5568unclustered_alloc:
5569 spin_lock(&block_group->free_space_ctl->tree_lock);
5570 if (cached &&
5571 block_group->free_space_ctl->free_space <
5572 num_bytes + empty_cluster + empty_size) {
5573 spin_unlock(&block_group->free_space_ctl->tree_lock);
5574 goto loop;
5575 }
5576 spin_unlock(&block_group->free_space_ctl->tree_lock);
5577
5404 offset = btrfs_find_space_for_alloc(block_group, search_start, 5578 offset = btrfs_find_space_for_alloc(block_group, search_start,
5405 num_bytes, empty_size); 5579 num_bytes, empty_size);
5406 /* 5580 /*
@@ -5438,9 +5612,6 @@ checks:
5438 goto loop; 5612 goto loop;
5439 } 5613 }
5440 5614
5441 ins->objectid = search_start;
5442 ins->offset = num_bytes;
5443
5444 if (offset < search_start) 5615 if (offset < search_start)
5445 btrfs_add_free_space(used_block_group, offset, 5616 btrfs_add_free_space(used_block_group, offset,
5446 search_start - offset); 5617 search_start - offset);
@@ -5457,6 +5628,8 @@ checks:
5457 ins->objectid = search_start; 5628 ins->objectid = search_start;
5458 ins->offset = num_bytes; 5629 ins->offset = num_bytes;
5459 5630
5631 trace_btrfs_reserve_extent(orig_root, block_group,
5632 search_start, num_bytes);
5460 if (offset < search_start) 5633 if (offset < search_start)
5461 btrfs_add_free_space(used_block_group, offset, 5634 btrfs_add_free_space(used_block_group, offset,
5462 search_start - offset); 5635 search_start - offset);
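
The free-space sufficiency check moves from before the cluster attempt down to the unclustered_alloc label, and a cluster borrowed from the current block group is now returned before the allocator gives up on clustering. A skeleton of that reordered fallback, with all btrfs types replaced by stand-ins:

    #include <stdio.h>
    #include <stdint.h>

    struct block_group { uint64_t free_space; };

    static uint64_t try_cluster(struct block_group *bg, uint64_t want)
    {
        (void)bg; (void)want;
        return 0;            /* pretend clustered allocation failed */
    }

    static uint64_t try_unclustered(struct block_group *bg, uint64_t want,
                                    uint64_t empty_cluster,
                                    uint64_t empty_size)
    {
        /* the check that used to run before clustering */
        if (bg->free_space < want + empty_cluster + empty_size)
            return 0;        /* goto loop, i.e. try next block group */
        return 4096;         /* fake offset */
    }

    static uint64_t allocate(struct block_group *bg, uint64_t want)
    {
        uint64_t off = try_cluster(bg, want);

        if (off)
            return off;
        /* a cluster taken from this group would be released here,
         * improving the odds of the unclustered attempt (elided) */
        return try_unclustered(bg, want, 65536, 0);
    }

    int main(void)
    {
        struct block_group bg = { 1 << 20 };

        printf("offset=%llu\n", (unsigned long long)allocate(&bg, 4096));
        return 0;
    }
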
@@ -5842,9 +6015,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5842 6015
5843 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 6016 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5844 6017
5845 ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, 6018 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
5846 0, root_objectid, owner, offset, 6019 ins->offset, 0,
5847 BTRFS_ADD_DELAYED_EXTENT, NULL); 6020 root_objectid, owner, offset,
6021 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
5848 return ret; 6022 return ret;
5849} 6023}
5850 6024
@@ -5997,10 +6171,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5997 return ERR_PTR(-ENOSPC); 6171 return ERR_PTR(-ENOSPC);
5998} 6172}
5999 6173
6000static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) 6174static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6175 struct btrfs_block_rsv *block_rsv, u32 blocksize)
6001{ 6176{
6002 block_rsv_add_bytes(block_rsv, blocksize, 0); 6177 block_rsv_add_bytes(block_rsv, blocksize, 0);
6003 block_rsv_release_bytes(block_rsv, NULL, 0); 6178 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6004} 6179}
6005 6180
6006/* 6181/*
@@ -6014,7 +6189,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6014 struct btrfs_root *root, u32 blocksize, 6189 struct btrfs_root *root, u32 blocksize,
6015 u64 parent, u64 root_objectid, 6190 u64 parent, u64 root_objectid,
6016 struct btrfs_disk_key *key, int level, 6191 struct btrfs_disk_key *key, int level,
6017 u64 hint, u64 empty_size) 6192 u64 hint, u64 empty_size, int for_cow)
6018{ 6193{
6019 struct btrfs_key ins; 6194 struct btrfs_key ins;
6020 struct btrfs_block_rsv *block_rsv; 6195 struct btrfs_block_rsv *block_rsv;
@@ -6030,7 +6205,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6030 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, 6205 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6031 empty_size, hint, (u64)-1, &ins, 0); 6206 empty_size, hint, (u64)-1, &ins, 0);
6032 if (ret) { 6207 if (ret) {
6033 unuse_block_rsv(block_rsv, blocksize); 6208 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6034 return ERR_PTR(ret); 6209 return ERR_PTR(ret);
6035 } 6210 }
6036 6211
@@ -6058,10 +6233,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6058 extent_op->update_flags = 1; 6233 extent_op->update_flags = 1;
6059 extent_op->is_data = 0; 6234 extent_op->is_data = 0;
6060 6235
6061 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, 6236 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6237 ins.objectid,
6062 ins.offset, parent, root_objectid, 6238 ins.offset, parent, root_objectid,
6063 level, BTRFS_ADD_DELAYED_EXTENT, 6239 level, BTRFS_ADD_DELAYED_EXTENT,
6064 extent_op); 6240 extent_op, for_cow);
6065 BUG_ON(ret); 6241 BUG_ON(ret);
6066 } 6242 }
6067 return buf; 6243 return buf;
@@ -6078,6 +6254,7 @@ struct walk_control {
6078 int keep_locks; 6254 int keep_locks;
6079 int reada_slot; 6255 int reada_slot;
6080 int reada_count; 6256 int reada_count;
6257 int for_reloc;
6081}; 6258};
6082 6259
6083#define DROP_REFERENCE 1 6260#define DROP_REFERENCE 1
@@ -6216,9 +6393,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6216 /* wc->stage == UPDATE_BACKREF */ 6393 /* wc->stage == UPDATE_BACKREF */
6217 if (!(wc->flags[level] & flag)) { 6394 if (!(wc->flags[level] & flag)) {
6218 BUG_ON(!path->locks[level]); 6395 BUG_ON(!path->locks[level]);
6219 ret = btrfs_inc_ref(trans, root, eb, 1); 6396 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6220 BUG_ON(ret); 6397 BUG_ON(ret);
6221 ret = btrfs_dec_ref(trans, root, eb, 0); 6398 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6222 BUG_ON(ret); 6399 BUG_ON(ret);
6223 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 6400 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6224 eb->len, flag, 0); 6401 eb->len, flag, 0);
@@ -6362,7 +6539,7 @@ skip:
6362 } 6539 }
6363 6540
6364 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 6541 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6365 root->root_key.objectid, level - 1, 0); 6542 root->root_key.objectid, level - 1, 0, 0);
6366 BUG_ON(ret); 6543 BUG_ON(ret);
6367 } 6544 }
6368 btrfs_tree_unlock(next); 6545 btrfs_tree_unlock(next);
@@ -6436,9 +6613,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6436 if (wc->refs[level] == 1) { 6613 if (wc->refs[level] == 1) {
6437 if (level == 0) { 6614 if (level == 0) {
6438 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 6615 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6439 ret = btrfs_dec_ref(trans, root, eb, 1); 6616 ret = btrfs_dec_ref(trans, root, eb, 1,
6617 wc->for_reloc);
6440 else 6618 else
6441 ret = btrfs_dec_ref(trans, root, eb, 0); 6619 ret = btrfs_dec_ref(trans, root, eb, 0,
6620 wc->for_reloc);
6442 BUG_ON(ret); 6621 BUG_ON(ret);
6443 } 6622 }
6444 /* make block locked assertion in clean_tree_block happy */ 6623 /* make block locked assertion in clean_tree_block happy */
@@ -6465,7 +6644,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6465 btrfs_header_owner(path->nodes[level + 1])); 6644 btrfs_header_owner(path->nodes[level + 1]));
6466 } 6645 }
6467 6646
6468 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 6647 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
6469out: 6648out:
6470 wc->refs[level] = 0; 6649 wc->refs[level] = 0;
6471 wc->flags[level] = 0; 6650 wc->flags[level] = 0;
@@ -6549,7 +6728,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6549 * blocks are properly updated. 6728 * blocks are properly updated.
6550 */ 6729 */
6551void btrfs_drop_snapshot(struct btrfs_root *root, 6730void btrfs_drop_snapshot(struct btrfs_root *root,
6552 struct btrfs_block_rsv *block_rsv, int update_ref) 6731 struct btrfs_block_rsv *block_rsv, int update_ref,
6732 int for_reloc)
6553{ 6733{
6554 struct btrfs_path *path; 6734 struct btrfs_path *path;
6555 struct btrfs_trans_handle *trans; 6735 struct btrfs_trans_handle *trans;
@@ -6637,6 +6817,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
6637 wc->stage = DROP_REFERENCE; 6817 wc->stage = DROP_REFERENCE;
6638 wc->update_ref = update_ref; 6818 wc->update_ref = update_ref;
6639 wc->keep_locks = 0; 6819 wc->keep_locks = 0;
6820 wc->for_reloc = for_reloc;
6640 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6821 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6641 6822
6642 while (1) { 6823 while (1) {
@@ -6721,6 +6902,7 @@ out:
6721 * drop subtree rooted at tree block 'node'. 6902 * drop subtree rooted at tree block 'node'.
6722 * 6903 *
6723 * NOTE: this function will unlock and release tree block 'node' 6904 * NOTE: this function will unlock and release tree block 'node'
6905 * only used by relocation code
6724 */ 6906 */
6725int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 6907int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6726 struct btrfs_root *root, 6908 struct btrfs_root *root,
@@ -6765,6 +6947,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6765 wc->stage = DROP_REFERENCE; 6947 wc->stage = DROP_REFERENCE;
6766 wc->update_ref = 0; 6948 wc->update_ref = 0;
6767 wc->keep_locks = 1; 6949 wc->keep_locks = 1;
6950 wc->for_reloc = 1;
6768 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6951 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6769 6952
6770 while (1) { 6953 while (1) {
@@ -6792,6 +6975,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6792 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 6975 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
6793 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 6976 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
6794 6977
6978 if (root->fs_info->balance_ctl) {
6979 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
6980 u64 tgt = 0;
6981
6982 /* pick restriper's target profile and return */
6983 if (flags & BTRFS_BLOCK_GROUP_DATA &&
6984 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6985 tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
6986 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
6987 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6988 tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
6989 } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
6990 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6991 tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
6992 }
6993
6994 if (tgt) {
6995 /* extended -> chunk profile */
6996 tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
6997 return tgt;
6998 }
6999 }
7000
6795 /* 7001 /*
6796 * we add in the count of missing devices because we want 7002 * we add in the count of missing devices because we want
6797 * to make sure that any RAID levels on a degraded FS 7003 * to make sure that any RAID levels on a degraded FS
@@ -7085,7 +7291,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7085 * space to fit our block group in. 7291 * space to fit our block group in.
7086 */ 7292 */
7087 if (device->total_bytes > device->bytes_used + min_free) { 7293 if (device->total_bytes > device->bytes_used + min_free) {
7088 ret = find_free_dev_extent(NULL, device, min_free, 7294 ret = find_free_dev_extent(device, min_free,
7089 &dev_offset, NULL); 7295 &dev_offset, NULL);
7090 if (!ret) 7296 if (!ret)
7091 dev_nr++; 7297 dev_nr++;
@@ -7447,6 +7653,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7447 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7653 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7448 &cache->space_info); 7654 &cache->space_info);
7449 BUG_ON(ret); 7655 BUG_ON(ret);
7656 update_global_block_rsv(root->fs_info);
7450 7657
7451 spin_lock(&cache->space_info->lock); 7658 spin_lock(&cache->space_info->lock);
7452 cache->space_info->bytes_readonly += cache->bytes_super; 7659 cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7466,6 +7673,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7466 return 0; 7673 return 0;
7467} 7674}
7468 7675
7676static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7677{
7678 u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
7679
7680 /* chunk -> extended profile */
7681 if (extra_flags == 0)
7682 extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
7683
7684 if (flags & BTRFS_BLOCK_GROUP_DATA)
7685 fs_info->avail_data_alloc_bits &= ~extra_flags;
7686 if (flags & BTRFS_BLOCK_GROUP_METADATA)
7687 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7688 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7689 fs_info->avail_system_alloc_bits &= ~extra_flags;
7690}
7691
7469int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 7692int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7470 struct btrfs_root *root, u64 group_start) 7693 struct btrfs_root *root, u64 group_start)
7471{ 7694{
@@ -7476,6 +7699,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7476 struct btrfs_key key; 7699 struct btrfs_key key;
7477 struct inode *inode; 7700 struct inode *inode;
7478 int ret; 7701 int ret;
7702 int index;
7479 int factor; 7703 int factor;
7480 7704
7481 root = root->fs_info->extent_root; 7705 root = root->fs_info->extent_root;
@@ -7491,6 +7715,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7491 free_excluded_extents(root, block_group); 7715 free_excluded_extents(root, block_group);
7492 7716
7493 memcpy(&key, &block_group->key, sizeof(key)); 7717 memcpy(&key, &block_group->key, sizeof(key));
7718 index = get_block_group_index(block_group);
7494 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 7719 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7495 BTRFS_BLOCK_GROUP_RAID1 | 7720 BTRFS_BLOCK_GROUP_RAID1 |
7496 BTRFS_BLOCK_GROUP_RAID10)) 7721 BTRFS_BLOCK_GROUP_RAID10))
@@ -7565,6 +7790,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7565 * are still on the list after taking the semaphore 7790 * are still on the list after taking the semaphore
7566 */ 7791 */
7567 list_del_init(&block_group->list); 7792 list_del_init(&block_group->list);
7793 if (list_empty(&block_group->space_info->block_groups[index]))
7794 clear_avail_alloc_bits(root->fs_info, block_group->flags);
7568 up_write(&block_group->space_info->groups_sem); 7795 up_write(&block_group->space_info->groups_sem);
7569 7796
7570 if (block_group->cached == BTRFS_CACHE_STARTED) 7797 if (block_group->cached == BTRFS_CACHE_STARTED)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 49f3c9dc09f4..9d09a4f81875 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -18,6 +18,7 @@
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h" 20#include "volumes.h"
21#include "check-integrity.h"
21 22
22static struct kmem_cache *extent_state_cache; 23static struct kmem_cache *extent_state_cache;
23static struct kmem_cache *extent_buffer_cache; 24static struct kmem_cache *extent_buffer_cache;
@@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1895 } 1896 }
1896 bio->bi_bdev = dev->bdev; 1897 bio->bi_bdev = dev->bdev;
1897 bio_add_page(bio, page, length, start-page_offset(page)); 1898 bio_add_page(bio, page, length, start-page_offset(page));
1898 submit_bio(WRITE_SYNC, bio); 1899 btrfsic_submit_bio(WRITE_SYNC, bio);
1899 wait_for_completion(&compl); 1900 wait_for_completion(&compl);
1900 1901
1901 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 1902 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
2393 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 2394 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2394 mirror_num, bio_flags, start); 2395 mirror_num, bio_flags, start);
2395 else 2396 else
2396 submit_bio(rw, bio); 2397 btrfsic_submit_bio(rw, bio);
2397 2398
2398 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2399 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2399 ret = -EOPNOTSUPP; 2400 ret = -EOPNOTSUPP;
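
Both submission paths in extent_io.c now go through btrfsic_submit_bio(), which the new check-integrity.h makes a plain pass-through to submit_bio() unless the checker is compiled in. A sketch of that compile-time wrapper pattern with invented names (check_submit_bio, CONFIG_CHECK_INTEGRITY) and a toy bio:

    #include <stdio.h>

    struct bio { int rw; };

    static void submit_bio(int rw, struct bio *bio)
    {
        printf("submit rw=%d\n", rw);
        (void)bio;
    }

    #ifdef CONFIG_CHECK_INTEGRITY
    static void check_submit_bio(int rw, struct bio *bio)
    {
        /* record the write in the integrity state machine first */
        printf("verify rw=%d\n", rw);
        submit_bio(rw, bio);
    }
    #else
    /* checker compiled out: the wrapper costs nothing */
    #define check_submit_bio(rw, bio) submit_bio(rw, bio)
    #endif

    int main(void)
    {
        struct bio b = { 1 };

        check_submit_bio(1, &b);
        return 0;
    }
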
@@ -3579,6 +3580,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3579 atomic_set(&eb->blocking_writers, 0); 3580 atomic_set(&eb->blocking_writers, 0);
3580 atomic_set(&eb->spinning_readers, 0); 3581 atomic_set(&eb->spinning_readers, 0);
3581 atomic_set(&eb->spinning_writers, 0); 3582 atomic_set(&eb->spinning_writers, 0);
3583 eb->lock_nested = 0;
3582 init_waitqueue_head(&eb->write_lock_wq); 3584 init_waitqueue_head(&eb->write_lock_wq);
3583 init_waitqueue_head(&eb->read_lock_wq); 3585 init_waitqueue_head(&eb->read_lock_wq);
3584 3586
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7604c3001322..bc6a042cb6fc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -129,6 +129,7 @@ struct extent_buffer {
129 struct list_head leak_list; 129 struct list_head leak_list;
130 struct rcu_head rcu_head; 130 struct rcu_head rcu_head;
131 atomic_t refs; 131 atomic_t refs;
132 pid_t lock_owner;
132 133
133 /* count of read lock holders on the extent buffer */ 134 /* count of read lock holders on the extent buffer */
134 atomic_t write_locks; 135 atomic_t write_locks;
@@ -137,6 +138,7 @@ struct extent_buffer {
137 atomic_t blocking_readers; 138 atomic_t blocking_readers;
138 atomic_t spinning_readers; 139 atomic_t spinning_readers;
139 atomic_t spinning_writers; 140 atomic_t spinning_writers;
141 int lock_nested;
140 142
141 /* protects write locks */ 143 /* protects write locks */
142 rwlock_t lock; 144 rwlock_t lock;
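
extent_buffer grows lock_owner and lock_nested so the locking code can recognize a task that already write-holds a buffer and grant it a recursive read lock instead of deadlocking against itself. A toy model of that check; the field names are real, the surrounding logic is heavily simplified:

    #include <stdio.h>

    struct ebuf {
        int write_locked;
        int lock_owner;    /* pid of the write-lock holder */
        int lock_nested;   /* read lock taken by that same holder */
    };

    static void tree_read_lock(struct ebuf *eb, int pid)
    {
        if (eb->write_locked && eb->lock_owner == pid) {
            eb->lock_nested = 1;    /* don't deadlock on ourselves */
            return;
        }
        /* ...otherwise take the real read lock (elided)... */
    }

    int main(void)
    {
        struct ebuf eb = { 1, 42, 0 };

        tree_read_lock(&eb, 42);
        printf("nested=%d\n", eb.lock_nested);
        return 0;
    }
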
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 97fbe939c050..859ba2dd8890 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -678,7 +678,7 @@ next_slot:
678 disk_bytenr, num_bytes, 0, 678 disk_bytenr, num_bytes, 0,
679 root->root_key.objectid, 679 root->root_key.objectid,
680 new_key.objectid, 680 new_key.objectid,
681 start - extent_offset); 681 start - extent_offset, 0);
682 BUG_ON(ret); 682 BUG_ON(ret);
683 *hint_byte = disk_bytenr; 683 *hint_byte = disk_bytenr;
684 } 684 }
@@ -753,7 +753,7 @@ next_slot:
753 disk_bytenr, num_bytes, 0, 753 disk_bytenr, num_bytes, 0,
754 root->root_key.objectid, 754 root->root_key.objectid,
755 key.objectid, key.offset - 755 key.objectid, key.offset -
756 extent_offset); 756 extent_offset, 0);
757 BUG_ON(ret); 757 BUG_ON(ret);
758 inode_sub_bytes(inode, 758 inode_sub_bytes(inode,
759 extent_end - key.offset); 759 extent_end - key.offset);
@@ -962,7 +962,7 @@ again:
962 962
963 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 963 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
964 root->root_key.objectid, 964 root->root_key.objectid,
965 ino, orig_offset); 965 ino, orig_offset, 0);
966 BUG_ON(ret); 966 BUG_ON(ret);
967 967
968 if (split == start) { 968 if (split == start) {
@@ -989,7 +989,7 @@ again:
989 del_nr++; 989 del_nr++;
990 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 990 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
991 0, root->root_key.objectid, 991 0, root->root_key.objectid,
992 ino, orig_offset); 992 ino, orig_offset, 0);
993 BUG_ON(ret); 993 BUG_ON(ret);
994 } 994 }
995 other_start = 0; 995 other_start = 0;
@@ -1006,7 +1006,7 @@ again:
1006 del_nr++; 1006 del_nr++;
1007 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1007 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1008 0, root->root_key.objectid, 1008 0, root->root_key.objectid,
1009 ino, orig_offset); 1009 ino, orig_offset, 0);
1010 BUG_ON(ret); 1010 BUG_ON(ret);
1011 } 1011 }
1012 if (del_nr == 0) { 1012 if (del_nr == 0) {
@@ -1081,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1081again: 1081again:
1082 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1083 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1084 mask); 1084 mask | __GFP_WRITE);
1085 if (!pages[i]) { 1085 if (!pages[i]) {
1086 faili = i - 1; 1086 faili = i - 1;
1087 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1136,7 +1136,8 @@ again:
1136 GFP_NOFS); 1136 GFP_NOFS);
1137 } 1137 }
1138 for (i = 0; i < num_pages; i++) { 1138 for (i = 0; i < num_pages; i++) {
1139 clear_page_dirty_for_io(pages[i]); 1139 if (clear_page_dirty_for_io(pages[i]))
1140 account_page_redirty(pages[i]);
1140 set_page_extent_mapped(pages[i]); 1141 set_page_extent_mapped(pages[i]);
1141 WARN_ON(!PageLocked(pages[i])); 1142 WARN_ON(!PageLocked(pages[i]));
1142 } 1143 }
@@ -1273,7 +1274,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1273 dirty_pages); 1274 dirty_pages);
1274 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1275 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1275 btrfs_btree_balance_dirty(root, 1); 1276 btrfs_btree_balance_dirty(root, 1);
1276 btrfs_throttle(root);
1277 1277
1278 pos += copied; 1278 pos += copied;
1279 num_written += copied; 1279 num_written += copied;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ec23d43d0c35..d20ff87ca603 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
319 io_ctl_unmap_page(io_ctl); 319 io_ctl_unmap_page(io_ctl);
320 320
321 for (i = 0; i < io_ctl->num_pages; i++) { 321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]); 322 if (io_ctl->pages[i]) {
323 unlock_page(io_ctl->pages[i]); 323 ClearPageChecked(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]); 324 unlock_page(io_ctl->pages[i]);
325 page_cache_release(io_ctl->pages[i]);
326 }
325 } 327 }
326} 328}
327 329
@@ -423,7 +425,7 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
423 } 425 }
424 426
425 if (index == 0) 427 if (index == 0)
426 offset = sizeof(u32) * io_ctl->num_pages;; 428 offset = sizeof(u32) * io_ctl->num_pages;
427 429
428 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, 430 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
429 PAGE_CACHE_SIZE - offset); 431 PAGE_CACHE_SIZE - offset);
@@ -635,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
635 if (!num_entries) 637 if (!num_entries)
636 return 0; 638 return 0;
637 639
638 io_ctl_init(&io_ctl, inode, root); 640 ret = io_ctl_init(&io_ctl, inode, root);
641 if (ret)
642 return ret;
643
639 ret = readahead_cache(inode); 644 ret = readahead_cache(inode);
640 if (ret) 645 if (ret)
641 goto out; 646 goto out;
@@ -838,7 +843,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
838 struct io_ctl io_ctl; 843 struct io_ctl io_ctl;
839 struct list_head bitmap_list; 844 struct list_head bitmap_list;
840 struct btrfs_key key; 845 struct btrfs_key key;
841 u64 start, end, len; 846 u64 start, extent_start, extent_end, len;
842 int entries = 0; 847 int entries = 0;
843 int bitmaps = 0; 848 int bitmaps = 0;
844 int ret; 849 int ret;
@@ -849,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
849 if (!i_size_read(inode)) 854 if (!i_size_read(inode))
850 return -1; 855 return -1;
851 856
852 io_ctl_init(&io_ctl, inode, root); 857 ret = io_ctl_init(&io_ctl, inode, root);
858 if (ret)
859 return -1;
853 860
854 /* Get the cluster for this block_group if it exists */ 861 /* Get the cluster for this block_group if it exists */
855 if (block_group && !list_empty(&block_group->cluster_list)) 862 if (block_group && !list_empty(&block_group->cluster_list))
@@ -857,25 +864,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
857 struct btrfs_free_cluster, 864 struct btrfs_free_cluster,
858 block_group_list); 865 block_group_list);
859 866
860 /*
861 * We shouldn't have switched the pinned extents yet so this is the
862 * right one
863 */
864 unpin = root->fs_info->pinned_extents;
865
866 /* Lock all pages first so we can lock the extent safely. */ 867 /* Lock all pages first so we can lock the extent safely. */
867 io_ctl_prepare_pages(&io_ctl, inode, 0); 868 io_ctl_prepare_pages(&io_ctl, inode, 0);
868 869
869 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 870 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
870 0, &cached_state, GFP_NOFS); 871 0, &cached_state, GFP_NOFS);
871 872
872 /*
873 * When searching for pinned extents, we need to start at our start
874 * offset.
875 */
876 if (block_group)
877 start = block_group->key.objectid;
878
879 node = rb_first(&ctl->free_space_offset); 873 node = rb_first(&ctl->free_space_offset);
880 if (!node && cluster) { 874 if (!node && cluster) {
881 node = rb_first(&cluster->root); 875 node = rb_first(&cluster->root);
@@ -918,9 +912,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
918 * We want to add any pinned extents to our free space cache 912 * We want to add any pinned extents to our free space cache
919 * so we don't leak the space 913 * so we don't leak the space
920 */ 914 */
915
916 /*
917 * We shouldn't have switched the pinned extents yet so this is the
918 * right one
919 */
920 unpin = root->fs_info->pinned_extents;
921
922 if (block_group)
923 start = block_group->key.objectid;
924
921 while (block_group && (start < block_group->key.objectid + 925 while (block_group && (start < block_group->key.objectid +
922 block_group->key.offset)) { 926 block_group->key.offset)) {
923 ret = find_first_extent_bit(unpin, start, &start, &end, 927 ret = find_first_extent_bit(unpin, start,
928 &extent_start, &extent_end,
924 EXTENT_DIRTY); 929 EXTENT_DIRTY);
925 if (ret) { 930 if (ret) {
926 ret = 0; 931 ret = 0;
@@ -928,20 +933,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
928 } 933 }
929 934
930 /* This pinned extent is out of our range */ 935 /* This pinned extent is out of our range */
931 if (start >= block_group->key.objectid + 936 if (extent_start >= block_group->key.objectid +
932 block_group->key.offset) 937 block_group->key.offset)
933 break; 938 break;
934 939
935 len = block_group->key.objectid + 940 extent_start = max(extent_start, start);
936 block_group->key.offset - start; 941 extent_end = min(block_group->key.objectid +
937 len = min(len, end + 1 - start); 942 block_group->key.offset, extent_end + 1);
943 len = extent_end - extent_start;
938 944
939 entries++; 945 entries++;
940 ret = io_ctl_add_entry(&io_ctl, start, len, NULL); 946 ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
941 if (ret) 947 if (ret)
942 goto out_nospc; 948 goto out_nospc;
943 949
944 start = end + 1; 950 start = extent_end;
945 } 951 }
946 952
947 /* Write out the bitmaps */ 953 /* Write out the bitmaps */
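
The pinned-extent loop in __btrfs_write_out_cache() previously reused start as both the loop cursor and the search output, so an extent straddling the block group boundary could corrupt the walk. The fix clamps the found range into the group with max/min and only then advances the cursor to extent_end. The arithmetic, worked on small numbers:

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t max64(uint64_t a, uint64_t b) { return a > b ? a : b; }
    static uint64_t min64(uint64_t a, uint64_t b) { return a < b ? a : b; }

    int main(void)
    {
        /* Block group [1000, 2000); pinned extent [900, 1500] found
         * by the search.  Clamp exactly as the fixed loop body does. */
        uint64_t bg_start = 1000, bg_end = 2000;
        uint64_t extent_start = 900, extent_end = 1500;
        uint64_t start = bg_start;               /* loop cursor */

        extent_start = max64(extent_start, start);
        extent_end = min64(bg_end, extent_end + 1); /* end bit inclusive */
        printf("entry: start=%llu len=%llu\n",
               (unsigned long long)extent_start,
               (unsigned long long)(extent_end - extent_start));

        start = extent_end;   /* cursor advances past what we wrote */
        printf("next search from %llu\n", (unsigned long long)start);
        return 0;
    }
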
@@ -2283,23 +2289,23 @@ out:
2283static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, 2289static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2284 struct btrfs_free_space *entry, 2290 struct btrfs_free_space *entry,
2285 struct btrfs_free_cluster *cluster, 2291 struct btrfs_free_cluster *cluster,
2286 u64 offset, u64 bytes, u64 min_bytes) 2292 u64 offset, u64 bytes,
2293 u64 cont1_bytes, u64 min_bytes)
2287{ 2294{
2288 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2295 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2289 unsigned long next_zero; 2296 unsigned long next_zero;
2290 unsigned long i; 2297 unsigned long i;
2291 unsigned long search_bits; 2298 unsigned long want_bits;
2292 unsigned long total_bits; 2299 unsigned long min_bits;
2293 unsigned long found_bits; 2300 unsigned long found_bits;
2294 unsigned long start = 0; 2301 unsigned long start = 0;
2295 unsigned long total_found = 0; 2302 unsigned long total_found = 0;
2296 int ret; 2303 int ret;
2297 bool found = false;
2298 2304
2299 i = offset_to_bit(entry->offset, block_group->sectorsize, 2305 i = offset_to_bit(entry->offset, block_group->sectorsize,
2300 max_t(u64, offset, entry->offset)); 2306 max_t(u64, offset, entry->offset));
2301 search_bits = bytes_to_bits(bytes, block_group->sectorsize); 2307 want_bits = bytes_to_bits(bytes, block_group->sectorsize);
2302 total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2308 min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
2303 2309
2304again: 2310again:
2305 found_bits = 0; 2311 found_bits = 0;
@@ -2308,7 +2314,7 @@ again:
2308 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { 2314 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
2309 next_zero = find_next_zero_bit(entry->bitmap, 2315 next_zero = find_next_zero_bit(entry->bitmap,
2310 BITS_PER_BITMAP, i); 2316 BITS_PER_BITMAP, i);
2311 if (next_zero - i >= search_bits) { 2317 if (next_zero - i >= min_bits) {
2312 found_bits = next_zero - i; 2318 found_bits = next_zero - i;
2313 break; 2319 break;
2314 } 2320 }
@@ -2318,10 +2324,9 @@ again:
2318 if (!found_bits) 2324 if (!found_bits)
2319 return -ENOSPC; 2325 return -ENOSPC;
2320 2326
2321 if (!found) { 2327 if (!total_found) {
2322 start = i; 2328 start = i;
2323 cluster->max_size = 0; 2329 cluster->max_size = 0;
2324 found = true;
2325 } 2330 }
2326 2331
2327 total_found += found_bits; 2332 total_found += found_bits;
@@ -2329,13 +2334,8 @@ again:
2329 if (cluster->max_size < found_bits * block_group->sectorsize) 2334 if (cluster->max_size < found_bits * block_group->sectorsize)
2330 cluster->max_size = found_bits * block_group->sectorsize; 2335 cluster->max_size = found_bits * block_group->sectorsize;
2331 2336
2332 if (total_found < total_bits) { 2337 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2333 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); 2338 i = next_zero + 1;
2334 if (i - start > total_bits * 2) {
2335 total_found = 0;
2336 cluster->max_size = 0;
2337 found = false;
2338 }
2339 goto again; 2339 goto again;
2340 } 2340 }
2341 2341
@@ -2346,28 +2346,31 @@ again:
2346 &entry->offset_index, 1); 2346 &entry->offset_index, 1);
2347 BUG_ON(ret); 2347 BUG_ON(ret);
2348 2348
2349 trace_btrfs_setup_cluster(block_group, cluster,
2350 total_found * block_group->sectorsize, 1);
2349 return 0; 2351 return 0;
2350} 2352}
2351 2353
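
btrfs_bitmap_cluster() now separates want_bits (the total being asked for) from min_bits (the smallest run worth taking) and keeps scanning from next_zero until it has both enough total bits and one run covering cont1_bytes. A compact stand-alone version of that scan over a byte-per-bit map; helper names are stand-ins for find_next_bit()/find_next_zero_bit():

    #include <stdio.h>
    #include <string.h>

    #define NBITS 64

    /* find the next index at or after i whose value equals val */
    static int next_bit(const char *map, int i, int val)
    {
        for (; i < NBITS; i++)
            if (map[i] == val)
                return i;
        return NBITS;
    }

    /* Accept any run of at least min_bits; succeed once total_found
     * covers want_bits AND the largest run covers cont1 bits. */
    static int scan(const char *map, int want_bits, int cont1,
                    int min_bits)
    {
        int i = next_bit(map, 0, 1), total = 0, max_run = 0;

        while (i < NBITS) {
            int zero = next_bit(map, i, 0);
            int run = zero - i;

            if (run >= min_bits) {
                total += run;
                if (run > max_run)
                    max_run = run;
                if (total >= want_bits && max_run >= cont1)
                    return 0;             /* cluster is good */
            }
            i = next_bit(map, zero + 1, 1);
        }
        return -1;                        /* -ENOSPC */
    }

    int main(void)
    {
        char map[NBITS];

        memset(map, 0, sizeof(map));
        memset(map + 4, 1, 8);    /* run of 8 */
        memset(map + 20, 1, 4);   /* run of 4 */
        /* needs 10 bits total with one run of at least 8: succeeds */
        printf("%d\n", scan(map, 10, 8, 4));
        return 0;
    }
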
2352/* 2354/*
2353 * This searches the block group for just extents to fill the cluster with. 2355 * This searches the block group for just extents to fill the cluster with.
2356 * Try to find a cluster with at least bytes total bytes, at least one
2357 * extent of cont1_bytes, and other clusters of at least min_bytes.
2354 */ 2358 */
2355static noinline int 2359static noinline int
2356setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, 2360setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2357 struct btrfs_free_cluster *cluster, 2361 struct btrfs_free_cluster *cluster,
2358 struct list_head *bitmaps, u64 offset, u64 bytes, 2362 struct list_head *bitmaps, u64 offset, u64 bytes,
2359 u64 min_bytes) 2363 u64 cont1_bytes, u64 min_bytes)
2360{ 2364{
2361 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2365 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2362 struct btrfs_free_space *first = NULL; 2366 struct btrfs_free_space *first = NULL;
2363 struct btrfs_free_space *entry = NULL; 2367 struct btrfs_free_space *entry = NULL;
2364 struct btrfs_free_space *prev = NULL;
2365 struct btrfs_free_space *last; 2368 struct btrfs_free_space *last;
2366 struct rb_node *node; 2369 struct rb_node *node;
2367 u64 window_start; 2370 u64 window_start;
2368 u64 window_free; 2371 u64 window_free;
2369 u64 max_extent; 2372 u64 max_extent;
2370 u64 max_gap = 128 * 1024; 2373 u64 total_size = 0;
2371 2374
2372 entry = tree_search_offset(ctl, offset, 0, 1); 2375 entry = tree_search_offset(ctl, offset, 0, 1);
2373 if (!entry) 2376 if (!entry)
@@ -2377,8 +2380,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2377 * We don't want bitmaps, so just move along until we find a normal 2380 * We don't want bitmaps, so just move along until we find a normal
2378 * extent entry. 2381 * extent entry.
2379 */ 2382 */
2380 while (entry->bitmap) { 2383 while (entry->bitmap || entry->bytes < min_bytes) {
2381 if (list_empty(&entry->list)) 2384 if (entry->bitmap && list_empty(&entry->list))
2382 list_add_tail(&entry->list, bitmaps); 2385 list_add_tail(&entry->list, bitmaps);
2383 node = rb_next(&entry->offset_index); 2386 node = rb_next(&entry->offset_index);
2384 if (!node) 2387 if (!node)
@@ -2391,12 +2394,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2391 max_extent = entry->bytes; 2394 max_extent = entry->bytes;
2392 first = entry; 2395 first = entry;
2393 last = entry; 2396 last = entry;
2394 prev = entry;
2395 2397
2396 while (window_free <= min_bytes) { 2398 for (node = rb_next(&entry->offset_index); node;
2397 node = rb_next(&entry->offset_index); 2399 node = rb_next(&entry->offset_index)) {
2398 if (!node)
2399 return -ENOSPC;
2400 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2400 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2401 2401
2402 if (entry->bitmap) { 2402 if (entry->bitmap) {
@@ -2405,26 +2405,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2405 continue; 2405 continue;
2406 } 2406 }
2407 2407
2408 /* 2408 if (entry->bytes < min_bytes)
2409 * we haven't filled the empty size and the window is 2409 continue;
2410 * very large. reset and try again 2410
2411 */ 2411 last = entry;
2412 if (entry->offset - (prev->offset + prev->bytes) > max_gap || 2412 window_free += entry->bytes;
2413 entry->offset - window_start > (min_bytes * 2)) { 2413 if (entry->bytes > max_extent)
2414 first = entry;
2415 window_start = entry->offset;
2416 window_free = entry->bytes;
2417 last = entry;
2418 max_extent = entry->bytes; 2414 max_extent = entry->bytes;
2419 } else {
2420 last = entry;
2421 window_free += entry->bytes;
2422 if (entry->bytes > max_extent)
2423 max_extent = entry->bytes;
2424 }
2425 prev = entry;
2426 } 2415 }
2427 2416
2417 if (window_free < bytes || max_extent < cont1_bytes)
2418 return -ENOSPC;
2419
2428 cluster->window_start = first->offset; 2420 cluster->window_start = first->offset;
2429 2421
2430 node = &first->offset_index; 2422 node = &first->offset_index;
@@ -2438,17 +2430,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2438 2430
2439 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2431 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2440 node = rb_next(&entry->offset_index); 2432 node = rb_next(&entry->offset_index);
2441 if (entry->bitmap) 2433 if (entry->bitmap || entry->bytes < min_bytes)
2442 continue; 2434 continue;
2443 2435
2444 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2436 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2445 ret = tree_insert_offset(&cluster->root, entry->offset, 2437 ret = tree_insert_offset(&cluster->root, entry->offset,
2446 &entry->offset_index, 0); 2438 &entry->offset_index, 0);
2439 total_size += entry->bytes;
2447 BUG_ON(ret); 2440 BUG_ON(ret);
2448 } while (node && entry != last); 2441 } while (node && entry != last);
2449 2442
2450 cluster->max_size = max_extent; 2443 cluster->max_size = max_extent;
2451 2444 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2452 return 0; 2445 return 0;
2453} 2446}
2454 2447
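The rewritten setup_cluster_no_bitmap() above no longer restarts its window at every large gap: it sums every extent of at least min_bytes and fails only afterwards, when the window missed the requested total or lacks a single extent of cont1_bytes. A self-contained sketch of that final check over an array (illustrative types; the kernel iterates an rbtree of struct btrfs_free_space):

    #include <stdio.h>
    #include <stdint.h>

    struct extent { uint64_t offset, bytes; };

    static int window_ok(const struct extent *e, int n, uint64_t bytes,
    		     uint64_t cont1_bytes, uint64_t min_bytes)
    {
    	uint64_t window_free = 0, max_extent = 0;
    	int i;

    	for (i = 0; i < n; i++) {
    		if (e[i].bytes < min_bytes)
    			continue;	/* too small to bother clustering */
    		window_free += e[i].bytes;
    		if (e[i].bytes > max_extent)
    			max_extent = e[i].bytes;
    	}
    	/* mirrors: if (window_free < bytes || max_extent < cont1_bytes) -ENOSPC */
    	return window_free >= bytes && max_extent >= cont1_bytes;
    }

    int main(void)
    {
    	struct extent e[] = { {0, 4096}, {8192, 65536}, {131072, 16384} };

    	printf("%d\n", window_ok(e, 3, 65536, 32768, 4096));	/* 1: fits */
    	printf("%d\n", window_ok(e, 3, 65536, 131072, 4096));	/* 0: no big extent */
    	return 0;
    }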
@@ -2460,7 +2453,7 @@ static noinline int
2460setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, 2453setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2461 struct btrfs_free_cluster *cluster, 2454 struct btrfs_free_cluster *cluster,
2462 struct list_head *bitmaps, u64 offset, u64 bytes, 2455 struct list_head *bitmaps, u64 offset, u64 bytes,
2463 u64 min_bytes) 2456 u64 cont1_bytes, u64 min_bytes)
2464{ 2457{
2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2458 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2466 struct btrfs_free_space *entry; 2459 struct btrfs_free_space *entry;
@@ -2485,7 +2478,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2485 if (entry->bytes < min_bytes) 2478 if (entry->bytes < min_bytes)
2486 continue; 2479 continue;
2487 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 2480 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2488 bytes, min_bytes); 2481 bytes, cont1_bytes, min_bytes);
2489 if (!ret) 2482 if (!ret)
2490 return 0; 2483 return 0;
2491 } 2484 }
@@ -2499,7 +2492,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2499 2492
2500/* 2493/*
2501 * here we try to find a cluster of blocks in a block group. The goal 2494 * here we try to find a cluster of blocks in a block group. The goal
2502 * is to find at least bytes free and up to empty_size + bytes free. 2495 * is to find at least bytes + empty_size free.
2503 * We might not find them all in one contiguous area. 2496 * We might not find them all in one contiguous area.
2504 * 2497 *
2505 * returns zero and sets up cluster if things worked out, otherwise 2498 * returns zero and sets up cluster if things worked out, otherwise
@@ -2515,23 +2508,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2515 struct btrfs_free_space *entry, *tmp; 2508 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps); 2509 LIST_HEAD(bitmaps);
2517 u64 min_bytes; 2510 u64 min_bytes;
2511 u64 cont1_bytes;
2518 int ret; 2512 int ret;
2519 2513
2520 /* for metadata, allow allocates with more holes */ 2514 /*
2515 * Choose the minimum extent size we'll require for this
2516 * cluster. For SSD_SPREAD, don't allow any fragmentation.
2517 * For metadata, allow allocations with smaller extents. For
2518 * data, keep it dense.
2519 */
2521 if (btrfs_test_opt(root, SSD_SPREAD)) { 2520 if (btrfs_test_opt(root, SSD_SPREAD)) {
2522 min_bytes = bytes + empty_size; 2521 cont1_bytes = min_bytes = bytes + empty_size;
2523 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 2522 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2524 /* 2523 cont1_bytes = bytes;
2525 * we want to do larger allocations when we are 2524 min_bytes = block_group->sectorsize;
2526 * flushing out the delayed refs, it helps prevent 2525 } else {
2527 * making more work as we go along. 2526 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
2528 */ 2527 min_bytes = block_group->sectorsize;
2529 if (trans->transaction->delayed_refs.flushing) 2528 }
2530 min_bytes = max(bytes, (bytes + empty_size) >> 1);
2531 else
2532 min_bytes = max(bytes, (bytes + empty_size) >> 4);
2533 } else
2534 min_bytes = max(bytes, (bytes + empty_size) >> 2);
2535 2529
2536 spin_lock(&ctl->tree_lock); 2530 spin_lock(&ctl->tree_lock);
2537 2531
@@ -2539,7 +2533,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2539 * If we know we don't have enough space to make a cluster don't even 2533 * If we know we don't have enough space to make a cluster don't even
2540 * bother doing all the work to try and find one. 2534 * bother doing all the work to try and find one.
2541 */ 2535 */
2542 if (ctl->free_space < min_bytes) { 2536 if (ctl->free_space < bytes) {
2543 spin_unlock(&ctl->tree_lock); 2537 spin_unlock(&ctl->tree_lock);
2544 return -ENOSPC; 2538 return -ENOSPC;
2545 } 2539 }
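The heuristic above replaces the old single min_bytes with two thresholds: cont1_bytes, the one large extent the cluster must contain, and min_bytes, the smallest extent still worth adding to it. A toy demo with made-up sizes (1 MiB request, 512 KiB empty_size, 4 KiB sectors) of what each allocation class ends up asking for:

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t max64(uint64_t a, uint64_t b) { return a > b ? a : b; }

    int main(void)
    {
    	uint64_t bytes = 1 << 20, empty_size = 1 << 19, sectorsize = 4096;

    	/* SSD_SPREAD: the whole request must be one contiguous chunk */
    	printf("ssd_spread: cont1=%llu min=%llu\n",
    	       (unsigned long long)(bytes + empty_size),
    	       (unsigned long long)(bytes + empty_size));

    	/* metadata: one extent of 'bytes', then fill with anything */
    	printf("metadata:   cont1=%llu min=%llu\n",
    	       (unsigned long long)bytes, (unsigned long long)sectorsize);

    	/* data: keep it dense, but tolerate some fragmentation */
    	printf("data:       cont1=%llu min=%llu\n",
    	       (unsigned long long)max64(bytes, (bytes + empty_size) >> 2),
    	       (unsigned long long)sectorsize);
    	return 0;
    }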
@@ -2552,11 +2546,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2552 goto out; 2546 goto out;
2553 } 2547 }
2554 2548
2549 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
2550 min_bytes);
2551
2552 INIT_LIST_HEAD(&bitmaps);
2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2553 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2556 bytes, min_bytes); 2554 bytes + empty_size,
2555 cont1_bytes, min_bytes);
2557 if (ret) 2556 if (ret)
2558 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 2557 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
2559 offset, bytes, min_bytes); 2558 offset, bytes + empty_size,
2559 cont1_bytes, min_bytes);
2560 2560
2561 /* Clear our temporary list */ 2561 /* Clear our temporary list */
2562 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 2562 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
@@ -2567,6 +2567,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2567 list_add_tail(&cluster->block_group_list, 2567 list_add_tail(&cluster->block_group_list,
2568 &block_group->cluster_list); 2568 &block_group->cluster_list);
2569 cluster->block_group = block_group; 2569 cluster->block_group = block_group;
2570 } else {
2571 trace_btrfs_failed_cluster_setup(block_group);
2570 } 2572 }
2571out: 2573out:
2572 spin_unlock(&cluster->lock); 2574 spin_unlock(&cluster->lock);
@@ -2588,17 +2590,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2588 cluster->block_group = NULL; 2590 cluster->block_group = NULL;
2589} 2591}
2590 2592
2591int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, 2593static int do_trimming(struct btrfs_block_group_cache *block_group,
2592 u64 *trimmed, u64 start, u64 end, u64 minlen) 2594 u64 *total_trimmed, u64 start, u64 bytes,
2595 u64 reserved_start, u64 reserved_bytes)
2593{ 2596{
2594 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2597 struct btrfs_space_info *space_info = block_group->space_info;
2595 struct btrfs_free_space *entry = NULL;
2596 struct btrfs_fs_info *fs_info = block_group->fs_info; 2598 struct btrfs_fs_info *fs_info = block_group->fs_info;
2597 u64 bytes = 0; 2599 int ret;
2598 u64 actually_trimmed; 2600 int update = 0;
2599 int ret = 0; 2601 u64 trimmed = 0;
2600 2602
2601 *trimmed = 0; 2603 spin_lock(&space_info->lock);
2604 spin_lock(&block_group->lock);
2605 if (!block_group->ro) {
2606 block_group->reserved += reserved_bytes;
2607 space_info->bytes_reserved += reserved_bytes;
2608 update = 1;
2609 }
2610 spin_unlock(&block_group->lock);
2611 spin_unlock(&space_info->lock);
2612
2613 ret = btrfs_error_discard_extent(fs_info->extent_root,
2614 start, bytes, &trimmed);
2615 if (!ret)
2616 *total_trimmed += trimmed;
2617
2618 btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
2619
2620 if (update) {
2621 spin_lock(&space_info->lock);
2622 spin_lock(&block_group->lock);
2623 if (block_group->ro)
2624 space_info->bytes_readonly += reserved_bytes;
2625 block_group->reserved -= reserved_bytes;
2626 space_info->bytes_reserved -= reserved_bytes;
2627 spin_unlock(&space_info->lock);
2628 spin_unlock(&block_group->lock);
2629 }
2630
2631 return ret;
2632}
2633
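do_trimming() above pins the range as reserved so the allocator cannot hand it out while the discard runs, performs the discard with no locks held, re-adds the space to the cache, and only then drops the reservation. A userspace model of that ordering, with a pthread mutex standing in for the two spinlocks and a stubbed-out discard:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdint.h>

    static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;
    static uint64_t reserved;	/* models block_group->reserved */
    static uint64_t free_space;	/* models the free-space cache */

    /* stub for btrfs_error_discard_extent(): pretend everything trims */
    static int discard(uint64_t start, uint64_t bytes, uint64_t *trimmed)
    {
    	*trimmed = bytes;
    	return 0;
    }

    static int do_trimming(uint64_t *total, uint64_t start, uint64_t bytes)
    {
    	uint64_t trimmed = 0;
    	int ret;

    	pthread_mutex_lock(&group_lock);
    	reserved += bytes;	/* keep allocators off the range */
    	pthread_mutex_unlock(&group_lock);

    	ret = discard(start, bytes, &trimmed);	/* slow part, unlocked */
    	if (!ret)
    		*total += trimmed;

    	pthread_mutex_lock(&group_lock);
    	free_space += bytes;	/* give the range back... */
    	reserved -= bytes;	/* ...then drop the pin */
    	pthread_mutex_unlock(&group_lock);
    	return ret;
    }

    int main(void)
    {
    	uint64_t total = 0;

    	do_trimming(&total, 0, 1 << 20);
    	printf("trimmed %llu bytes\n", (unsigned long long)total);
    	return 0;
    }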
2634static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2635 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
2636{
2637 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2638 struct btrfs_free_space *entry;
2639 struct rb_node *node;
2640 int ret = 0;
2641 u64 extent_start;
2642 u64 extent_bytes;
2643 u64 bytes;
2602 2644
2603 while (start < end) { 2645 while (start < end) {
2604 spin_lock(&ctl->tree_lock); 2646 spin_lock(&ctl->tree_lock);
@@ -2609,81 +2651,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2609 } 2651 }
2610 2652
2611 entry = tree_search_offset(ctl, start, 0, 1); 2653 entry = tree_search_offset(ctl, start, 0, 1);
2612 if (!entry) 2654 if (!entry) {
2613 entry = tree_search_offset(ctl,
2614 offset_to_bitmap(ctl, start),
2615 1, 1);
2616
2617 if (!entry || entry->offset >= end) {
2618 spin_unlock(&ctl->tree_lock); 2655 spin_unlock(&ctl->tree_lock);
2619 break; 2656 break;
2620 } 2657 }
2621 2658
2622 if (entry->bitmap) { 2659 /* skip bitmaps */
2623 ret = search_bitmap(ctl, entry, &start, &bytes); 2660 while (entry->bitmap) {
2624 if (!ret) { 2661 node = rb_next(&entry->offset_index);
2625 if (start >= end) { 2662 if (!node) {
2626 spin_unlock(&ctl->tree_lock);
2627 break;
2628 }
2629 bytes = min(bytes, end - start);
2630 bitmap_clear_bits(ctl, entry, start, bytes);
2631 if (entry->bytes == 0)
2632 free_bitmap(ctl, entry);
2633 } else {
2634 start = entry->offset + BITS_PER_BITMAP *
2635 block_group->sectorsize;
2636 spin_unlock(&ctl->tree_lock); 2663 spin_unlock(&ctl->tree_lock);
2637 ret = 0; 2664 goto out;
2638 continue;
2639 } 2665 }
2640 } else { 2666 entry = rb_entry(node, struct btrfs_free_space,
2641 start = entry->offset; 2667 offset_index);
2642 bytes = min(entry->bytes, end - start);
2643 unlink_free_space(ctl, entry);
2644 kmem_cache_free(btrfs_free_space_cachep, entry);
2645 } 2668 }
2646 2669
2670 if (entry->offset >= end) {
2671 spin_unlock(&ctl->tree_lock);
2672 break;
2673 }
2674
2675 extent_start = entry->offset;
2676 extent_bytes = entry->bytes;
2677 start = max(start, extent_start);
2678 bytes = min(extent_start + extent_bytes, end) - start;
2679 if (bytes < minlen) {
2680 spin_unlock(&ctl->tree_lock);
2681 goto next;
2682 }
2683
2684 unlink_free_space(ctl, entry);
2685 kmem_cache_free(btrfs_free_space_cachep, entry);
2686
2647 spin_unlock(&ctl->tree_lock); 2687 spin_unlock(&ctl->tree_lock);
2648 2688
2649 if (bytes >= minlen) { 2689 ret = do_trimming(block_group, total_trimmed, start, bytes,
2650 struct btrfs_space_info *space_info; 2690 extent_start, extent_bytes);
2651 int update = 0; 2691 if (ret)
2652 2692 break;
2653 space_info = block_group->space_info; 2693next:
2654 spin_lock(&space_info->lock); 2694 start += bytes;
2655 spin_lock(&block_group->lock);
2656 if (!block_group->ro) {
2657 block_group->reserved += bytes;
2658 space_info->bytes_reserved += bytes;
2659 update = 1;
2660 }
2661 spin_unlock(&block_group->lock);
2662 spin_unlock(&space_info->lock);
2663
2664 ret = btrfs_error_discard_extent(fs_info->extent_root,
2665 start,
2666 bytes,
2667 &actually_trimmed);
2668
2669 btrfs_add_free_space(block_group, start, bytes);
2670 if (update) {
2671 spin_lock(&space_info->lock);
2672 spin_lock(&block_group->lock);
2673 if (block_group->ro)
2674 space_info->bytes_readonly += bytes;
2675 block_group->reserved -= bytes;
2676 space_info->bytes_reserved -= bytes;
2677 spin_unlock(&space_info->lock);
2678 spin_unlock(&block_group->lock);
2679 }
2680 2695
2681 if (ret) 2696 if (fatal_signal_pending(current)) {
2682 break; 2697 ret = -ERESTARTSYS;
2683 *trimmed += actually_trimmed; 2698 break;
2699 }
2700
2701 cond_resched();
2702 }
2703out:
2704 return ret;
2705}
2706
2707static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
2708 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
2709{
2710 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2711 struct btrfs_free_space *entry;
2712 int ret = 0;
2713 int ret2;
2714 u64 bytes;
2715 u64 offset = offset_to_bitmap(ctl, start);
2716
2717 while (offset < end) {
2718 bool next_bitmap = false;
2719
2720 spin_lock(&ctl->tree_lock);
2721
2722 if (ctl->free_space < minlen) {
2723 spin_unlock(&ctl->tree_lock);
2724 break;
2725 }
2726
2727 entry = tree_search_offset(ctl, offset, 1, 0);
2728 if (!entry) {
2729 spin_unlock(&ctl->tree_lock);
2730 next_bitmap = true;
2731 goto next;
2732 }
2733
2734 bytes = minlen;
2735 ret2 = search_bitmap(ctl, entry, &start, &bytes);
2736 if (ret2 || start >= end) {
2737 spin_unlock(&ctl->tree_lock);
2738 next_bitmap = true;
2739 goto next;
2740 }
2741
2742 bytes = min(bytes, end - start);
2743 if (bytes < minlen) {
2744 spin_unlock(&ctl->tree_lock);
2745 goto next;
2746 }
2747
2748 bitmap_clear_bits(ctl, entry, start, bytes);
2749 if (entry->bytes == 0)
2750 free_bitmap(ctl, entry);
2751
2752 spin_unlock(&ctl->tree_lock);
2753
2754 ret = do_trimming(block_group, total_trimmed, start, bytes,
2755 start, bytes);
2756 if (ret)
2757 break;
2758next:
2759 if (next_bitmap) {
2760 offset += BITS_PER_BITMAP * ctl->unit;
2761 } else {
2762 start += bytes;
2763 if (start >= offset + BITS_PER_BITMAP * ctl->unit)
2764 offset += BITS_PER_BITMAP * ctl->unit;
2684 } 2765 }
2685 start += bytes;
2686 bytes = 0;
2687 2766
2688 if (fatal_signal_pending(current)) { 2767 if (fatal_signal_pending(current)) {
2689 ret = -ERESTARTSYS; 2768 ret = -ERESTARTSYS;
@@ -2696,6 +2775,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2696 return ret; 2775 return ret;
2697} 2776}
2698 2777
2778int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2779 u64 *trimmed, u64 start, u64 end, u64 minlen)
2780{
2781 int ret;
2782
2783 *trimmed = 0;
2784
2785 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
2786 if (ret)
2787 return ret;
2788
2789 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
2790
2791 return ret;
2792}
2793
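After the split, btrfs_trim_block_group() above is a plain two-pass driver: extents first, then bitmaps, both accumulating into the same counter and the first error aborting the second pass. The same shape as a compilable sketch with dummy passes:

    #include <stdio.h>
    #include <stdint.h>

    static int trim_extents(uint64_t *t, uint64_t s, uint64_t e, uint64_t m)
    {
    	*t += 4096;	/* pretend one extent was trimmed */
    	return 0;
    }

    static int trim_bitmaps(uint64_t *t, uint64_t s, uint64_t e, uint64_t m)
    {
    	*t += 8192;	/* pretend one bitmap run was trimmed */
    	return 0;
    }

    static int trim_block_group(uint64_t *trimmed, uint64_t start,
    			    uint64_t end, uint64_t minlen)
    {
    	int ret;

    	*trimmed = 0;
    	ret = trim_extents(trimmed, start, end, minlen);
    	if (ret)
    		return ret;	/* first failure wins; bitmaps are skipped */
    	return trim_bitmaps(trimmed, start, end, minlen);
    }

    int main(void)
    {
    	uint64_t trimmed;

    	trim_block_group(&trimmed, 0, 1 << 30, 4096);
    	printf("trimmed %llu bytes\n", (unsigned long long)trimmed);
    	return 0;
    }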
2699/* 2794/*
2700 * Find the left-most item in the cache tree, and then return the 2795 * Find the left-most item in the cache tree, and then return the
2701 * smallest inode number in the item. 2796 * smallest inode number in the item.
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f8962a957d65..213ffa86ce1b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
438 trans->bytes_reserved); 438 trans->bytes_reserved);
439 if (ret) 439 if (ret)
440 goto out; 440 goto out;
441 trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
442 trans->bytes_reserved, 1);
441again: 443again:
442 inode = lookup_free_ino_inode(root, path); 444 inode = lookup_free_ino_inode(root, path);
443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 445 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -498,6 +500,8 @@ again:
498out_put: 500out_put:
499 iput(inode); 501 iput(inode);
500out_release: 502out_release:
503 trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
504 trans->bytes_reserved, 0);
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 505 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
502out: 506out:
503 trans->block_rsv = rsv; 507 trans->block_rsv = rsv;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fd1a06df5bc6..0da19a0ea00d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1944,19 +1944,35 @@ enum btrfs_orphan_cleanup_state {
1944}; 1944};
1945 1945
1946/* 1946/*
1947 * This is called in transaction commmit time. If there are no orphan 1947 * This is called in transaction commit time. If there are no orphan
1948 * files in the subvolume, it removes orphan item and frees block_rsv 1948 * files in the subvolume, it removes orphan item and frees block_rsv
1949 * structure. 1949 * structure.
1950 */ 1950 */
1951void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 1951void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
1952 struct btrfs_root *root) 1952 struct btrfs_root *root)
1953{ 1953{
1954 struct btrfs_block_rsv *block_rsv;
1954 int ret; 1955 int ret;
1955 1956
1956 if (!list_empty(&root->orphan_list) || 1957 if (!list_empty(&root->orphan_list) ||
1957 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 1958 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
1958 return; 1959 return;
1959 1960
1961 spin_lock(&root->orphan_lock);
1962 if (!list_empty(&root->orphan_list)) {
1963 spin_unlock(&root->orphan_lock);
1964 return;
1965 }
1966
1967 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
1968 spin_unlock(&root->orphan_lock);
1969 return;
1970 }
1971
1972 block_rsv = root->orphan_block_rsv;
1973 root->orphan_block_rsv = NULL;
1974 spin_unlock(&root->orphan_lock);
1975
1960 if (root->orphan_item_inserted && 1976 if (root->orphan_item_inserted &&
1961 btrfs_root_refs(&root->root_item) > 0) { 1977 btrfs_root_refs(&root->root_item) > 0) {
1962 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 1978 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
@@ -1965,10 +1981,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
1965 root->orphan_item_inserted = 0; 1981 root->orphan_item_inserted = 0;
1966 } 1982 }
1967 1983
1968 if (root->orphan_block_rsv) { 1984 if (block_rsv) {
1969 WARN_ON(root->orphan_block_rsv->size > 0); 1985 WARN_ON(block_rsv->size > 0);
1970 btrfs_free_block_rsv(root, root->orphan_block_rsv); 1986 btrfs_free_block_rsv(root, block_rsv);
1971 root->orphan_block_rsv = NULL;
1972 } 1987 }
1973} 1988}
1974 1989
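The rewritten btrfs_orphan_commit_root() above snapshots orphan_block_rsv under orphan_lock, clears the root's pointer while still locked, and frees the reservation only after unlocking, so no racing thread can ever see a pointer to a half-freed block_rsv. The pattern in isolation, as a small userspace model:

    #include <pthread.h>
    #include <stdlib.h>

    struct rsv { long size; };

    static pthread_mutex_t orphan_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct rsv *orphan_block_rsv;

    static void commit_root(void)
    {
    	struct rsv *block_rsv;

    	pthread_mutex_lock(&orphan_lock);
    	block_rsv = orphan_block_rsv;	/* snapshot... */
    	orphan_block_rsv = NULL;	/* ...and detach while locked */
    	pthread_mutex_unlock(&orphan_lock);

    	free(block_rsv);		/* safe: nobody else can reach it */
    }

    int main(void)
    {
    	orphan_block_rsv = calloc(1, sizeof(*orphan_block_rsv));
    	commit_root();
    	return 0;
    }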
@@ -2224,14 +2239,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2224 continue; 2239 continue;
2225 } 2240 }
2226 nr_truncate++; 2241 nr_truncate++;
2227 /*
2228 * Need to hold the imutex for reservation purposes, not
2229 * a huge deal here but I have a WARN_ON in
2230 * btrfs_delalloc_reserve_space to catch offenders.
2231 */
2232 mutex_lock(&inode->i_mutex);
2233 ret = btrfs_truncate(inode); 2242 ret = btrfs_truncate(inode);
2234 mutex_unlock(&inode->i_mutex);
2235 } else { 2243 } else {
2236 nr_unlink++; 2244 nr_unlink++;
2237 } 2245 }
@@ -2845,7 +2853,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2845 BUG_ON(!root->fs_info->enospc_unlink); 2853 BUG_ON(!root->fs_info->enospc_unlink);
2846 root->fs_info->enospc_unlink = 0; 2854 root->fs_info->enospc_unlink = 0;
2847 } 2855 }
2848 btrfs_end_transaction_throttle(trans, root); 2856 btrfs_end_transaction(trans, root);
2849} 2857}
2850 2858
2851static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2859static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3009,7 +3017,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3009 int pending_del_nr = 0; 3017 int pending_del_nr = 0;
3010 int pending_del_slot = 0; 3018 int pending_del_slot = 0;
3011 int extent_type = -1; 3019 int extent_type = -1;
3012 int encoding;
3013 int ret; 3020 int ret;
3014 int err = 0; 3021 int err = 0;
3015 u64 ino = btrfs_ino(inode); 3022 u64 ino = btrfs_ino(inode);
@@ -3059,7 +3066,6 @@ search_again:
3059 leaf = path->nodes[0]; 3066 leaf = path->nodes[0];
3060 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3067 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3061 found_type = btrfs_key_type(&found_key); 3068 found_type = btrfs_key_type(&found_key);
3062 encoding = 0;
3063 3069
3064 if (found_key.objectid != ino) 3070 if (found_key.objectid != ino)
3065 break; 3071 break;
@@ -3072,10 +3078,6 @@ search_again:
3072 fi = btrfs_item_ptr(leaf, path->slots[0], 3078 fi = btrfs_item_ptr(leaf, path->slots[0],
3073 struct btrfs_file_extent_item); 3079 struct btrfs_file_extent_item);
3074 extent_type = btrfs_file_extent_type(leaf, fi); 3080 extent_type = btrfs_file_extent_type(leaf, fi);
3075 encoding = btrfs_file_extent_compression(leaf, fi);
3076 encoding |= btrfs_file_extent_encryption(leaf, fi);
3077 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
3078
3079 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3081 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3080 item_end += 3082 item_end +=
3081 btrfs_file_extent_num_bytes(leaf, fi); 3083 btrfs_file_extent_num_bytes(leaf, fi);
@@ -3103,7 +3105,7 @@ search_again:
3103 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3105 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3104 u64 num_dec; 3106 u64 num_dec;
3105 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3107 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
3106 if (!del_item && !encoding) { 3108 if (!del_item) {
3107 u64 orig_num_bytes = 3109 u64 orig_num_bytes =
3108 btrfs_file_extent_num_bytes(leaf, fi); 3110 btrfs_file_extent_num_bytes(leaf, fi);
3109 extent_num_bytes = new_size - 3111 extent_num_bytes = new_size -
@@ -3179,7 +3181,7 @@ delete:
3179 ret = btrfs_free_extent(trans, root, extent_start, 3181 ret = btrfs_free_extent(trans, root, extent_start,
3180 extent_num_bytes, 0, 3182 extent_num_bytes, 0,
3181 btrfs_header_owner(leaf), 3183 btrfs_header_owner(leaf),
3182 ino, extent_offset); 3184 ino, extent_offset, 0);
3183 BUG_ON(ret); 3185 BUG_ON(ret);
3184 } 3186 }
3185 3187
@@ -3434,7 +3436,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3434 i_size_write(inode, newsize); 3436 i_size_write(inode, newsize);
3435 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 3437 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3436 ret = btrfs_update_inode(trans, root, inode); 3438 ret = btrfs_update_inode(trans, root, inode);
3437 btrfs_end_transaction_throttle(trans, root); 3439 btrfs_end_transaction(trans, root);
3438 } else { 3440 } else {
3439 3441
3440 /* 3442 /*
@@ -4412,8 +4414,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4412 struct btrfs_root *root, 4414 struct btrfs_root *root,
4413 struct inode *dir, 4415 struct inode *dir,
4414 const char *name, int name_len, 4416 const char *name, int name_len,
4415 u64 ref_objectid, u64 objectid, int mode, 4417 u64 ref_objectid, u64 objectid,
4416 u64 *index) 4418 umode_t mode, u64 *index)
4417{ 4419{
4418 struct inode *inode; 4420 struct inode *inode;
4419 struct btrfs_inode_item *inode_item; 4421 struct btrfs_inode_item *inode_item;
@@ -4596,7 +4598,7 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
4596} 4598}
4597 4599
4598static int btrfs_mknod(struct inode *dir, struct dentry *dentry, 4600static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4599 int mode, dev_t rdev) 4601 umode_t mode, dev_t rdev)
4600{ 4602{
4601 struct btrfs_trans_handle *trans; 4603 struct btrfs_trans_handle *trans;
4602 struct btrfs_root *root = BTRFS_I(dir)->root; 4604 struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -4655,7 +4657,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4655 } 4657 }
4656out_unlock: 4658out_unlock:
4657 nr = trans->blocks_used; 4659 nr = trans->blocks_used;
4658 btrfs_end_transaction_throttle(trans, root); 4660 btrfs_end_transaction(trans, root);
4659 btrfs_btree_balance_dirty(root, nr); 4661 btrfs_btree_balance_dirty(root, nr);
4660 if (drop_inode) { 4662 if (drop_inode) {
4661 inode_dec_link_count(inode); 4663 inode_dec_link_count(inode);
@@ -4665,7 +4667,7 @@ out_unlock:
4665} 4667}
4666 4668
4667static int btrfs_create(struct inode *dir, struct dentry *dentry, 4669static int btrfs_create(struct inode *dir, struct dentry *dentry,
4668 int mode, struct nameidata *nd) 4670 umode_t mode, struct nameidata *nd)
4669{ 4671{
4670 struct btrfs_trans_handle *trans; 4672 struct btrfs_trans_handle *trans;
4671 struct btrfs_root *root = BTRFS_I(dir)->root; 4673 struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -4723,7 +4725,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4723 } 4725 }
4724out_unlock: 4726out_unlock:
4725 nr = trans->blocks_used; 4727 nr = trans->blocks_used;
4726 btrfs_end_transaction_throttle(trans, root); 4728 btrfs_end_transaction(trans, root);
4727 if (drop_inode) { 4729 if (drop_inode) {
4728 inode_dec_link_count(inode); 4730 inode_dec_link_count(inode);
4729 iput(inode); 4731 iput(inode);
@@ -4782,7 +4784,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4782 } 4784 }
4783 4785
4784 nr = trans->blocks_used; 4786 nr = trans->blocks_used;
4785 btrfs_end_transaction_throttle(trans, root); 4787 btrfs_end_transaction(trans, root);
4786fail: 4788fail:
4787 if (drop_inode) { 4789 if (drop_inode) {
4788 inode_dec_link_count(inode); 4790 inode_dec_link_count(inode);
@@ -4792,7 +4794,7 @@ fail:
4792 return err; 4794 return err;
4793} 4795}
4794 4796
4795static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 4797static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4796{ 4798{
4797 struct inode *inode = NULL; 4799 struct inode *inode = NULL;
4798 struct btrfs_trans_handle *trans; 4800 struct btrfs_trans_handle *trans;
@@ -4848,7 +4850,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4848 4850
4849out_fail: 4851out_fail:
4850 nr = trans->blocks_used; 4852 nr = trans->blocks_used;
4851 btrfs_end_transaction_throttle(trans, root); 4853 btrfs_end_transaction(trans, root);
4852 if (drop_on_err) 4854 if (drop_on_err)
4853 iput(inode); 4855 iput(inode);
4854 btrfs_btree_balance_dirty(root, nr); 4856 btrfs_btree_balance_dirty(root, nr);
@@ -5121,7 +5123,7 @@ again:
5121 } 5123 }
5122 flush_dcache_page(page); 5124 flush_dcache_page(page);
5123 } else if (create && PageUptodate(page)) { 5125 } else if (create && PageUptodate(page)) {
5124 WARN_ON(1); 5126 BUG();
5125 if (!trans) { 5127 if (!trans) {
5126 kunmap(page); 5128 kunmap(page);
5127 free_extent_map(em); 5129 free_extent_map(em);
@@ -6402,10 +6404,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6402 u64 page_start; 6404 u64 page_start;
6403 u64 page_end; 6405 u64 page_end;
6404 6406
6405 /* Need this to keep space reservations serialized */
6406 mutex_lock(&inode->i_mutex);
6407 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6407 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6408 mutex_unlock(&inode->i_mutex);
6409 if (!ret) 6408 if (!ret)
6410 ret = btrfs_update_time(vma->vm_file); 6409 ret = btrfs_update_time(vma->vm_file);
6411 if (ret) { 6410 if (ret) {
@@ -6494,8 +6493,8 @@ out_unlock:
6494 if (!ret) 6493 if (!ret)
6495 return VM_FAULT_LOCKED; 6494 return VM_FAULT_LOCKED;
6496 unlock_page(page); 6495 unlock_page(page);
6497 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6498out: 6496out:
6497 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6499 return ret; 6498 return ret;
6500} 6499}
6501 6500
@@ -6668,7 +6667,7 @@ end_trans:
6668 err = ret; 6667 err = ret;
6669 6668
6670 nr = trans->blocks_used; 6669 nr = trans->blocks_used;
6671 ret = btrfs_end_transaction_throttle(trans, root); 6670 ret = btrfs_end_transaction(trans, root);
6672 btrfs_btree_balance_dirty(root, nr); 6671 btrfs_btree_balance_dirty(root, nr);
6673 } 6672 }
6674 6673
@@ -6749,6 +6748,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6749 extent_io_tree_init(&ei->io_tree, &inode->i_data); 6748 extent_io_tree_init(&ei->io_tree, &inode->i_data);
6750 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 6749 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
6751 mutex_init(&ei->log_mutex); 6750 mutex_init(&ei->log_mutex);
6751 mutex_init(&ei->delalloc_mutex);
6752 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6752 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
6753 INIT_LIST_HEAD(&ei->i_orphan); 6753 INIT_LIST_HEAD(&ei->i_orphan);
6754 INIT_LIST_HEAD(&ei->delalloc_inodes); 6754 INIT_LIST_HEAD(&ei->delalloc_inodes);
@@ -6761,7 +6761,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6761static void btrfs_i_callback(struct rcu_head *head) 6761static void btrfs_i_callback(struct rcu_head *head)
6762{ 6762{
6763 struct inode *inode = container_of(head, struct inode, i_rcu); 6763 struct inode *inode = container_of(head, struct inode, i_rcu);
6764 INIT_LIST_HEAD(&inode->i_dentry);
6765 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6764 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
6766} 6765}
6767 6766
@@ -7075,7 +7074,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7075 btrfs_end_log_trans(root); 7074 btrfs_end_log_trans(root);
7076 } 7075 }
7077out_fail: 7076out_fail:
7078 btrfs_end_transaction_throttle(trans, root); 7077 btrfs_end_transaction(trans, root);
7079out_notrans: 7078out_notrans:
7080 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 7079 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
7081 up_read(&root->fs_info->subvol_sem); 7080 up_read(&root->fs_info->subvol_sem);
@@ -7247,7 +7246,7 @@ out_unlock:
7247 if (!err) 7246 if (!err)
7248 d_instantiate(dentry, inode); 7247 d_instantiate(dentry, inode);
7249 nr = trans->blocks_used; 7248 nr = trans->blocks_used;
7250 btrfs_end_transaction_throttle(trans, root); 7249 btrfs_end_transaction(trans, root);
7251 if (drop_inode) { 7250 if (drop_inode) {
7252 inode_dec_link_count(inode); 7251 inode_dec_link_count(inode);
7253 iput(inode); 7252 iput(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c04f02c7d5bb..ab620014bcc3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
176 struct btrfs_trans_handle *trans; 176 struct btrfs_trans_handle *trans;
177 unsigned int flags, oldflags; 177 unsigned int flags, oldflags;
178 int ret; 178 int ret;
179 u64 ip_oldflags;
180 unsigned int i_oldflags;
179 181
180 if (btrfs_root_readonly(root)) 182 if (btrfs_root_readonly(root))
181 return -EROFS; 183 return -EROFS;
@@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
192 194
193 mutex_lock(&inode->i_mutex); 195 mutex_lock(&inode->i_mutex);
194 196
197 ip_oldflags = ip->flags;
198 i_oldflags = inode->i_flags;
199
195 flags = btrfs_mask_flags(inode->i_mode, flags); 200 flags = btrfs_mask_flags(inode->i_mode, flags);
196 oldflags = btrfs_flags_to_ioctl(ip->flags); 201 oldflags = btrfs_flags_to_ioctl(ip->flags);
197 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 202 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -201,7 +206,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
201 } 206 }
202 } 207 }
203 208
204 ret = mnt_want_write(file->f_path.mnt); 209 ret = mnt_want_write_file(file);
205 if (ret) 210 if (ret)
206 goto out_unlock; 211 goto out_unlock;
207 212
@@ -249,19 +254,24 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
249 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 254 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
250 } 255 }
251 256
252 trans = btrfs_join_transaction(root); 257 trans = btrfs_start_transaction(root, 1);
253 BUG_ON(IS_ERR(trans)); 258 if (IS_ERR(trans)) {
259 ret = PTR_ERR(trans);
260 goto out_drop;
261 }
254 262
255 btrfs_update_iflags(inode); 263 btrfs_update_iflags(inode);
256 inode->i_ctime = CURRENT_TIME; 264 inode->i_ctime = CURRENT_TIME;
257 ret = btrfs_update_inode(trans, root, inode); 265 ret = btrfs_update_inode(trans, root, inode);
258 BUG_ON(ret);
259 266
260 btrfs_end_transaction(trans, root); 267 btrfs_end_transaction(trans, root);
268 out_drop:
269 if (ret) {
270 ip->flags = ip_oldflags;
271 inode->i_flags = i_oldflags;
272 }
261 273
262 mnt_drop_write(file->f_path.mnt); 274 mnt_drop_write_file(file);
263
264 ret = 0;
265 out_unlock: 275 out_unlock:
266 mutex_unlock(&inode->i_mutex); 276 mutex_unlock(&inode->i_mutex);
267 return ret; 277 return ret;
@@ -276,14 +286,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
276 286
277static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) 287static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
278{ 288{
279 struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; 289 struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
280 struct btrfs_fs_info *fs_info = root->fs_info;
281 struct btrfs_device *device; 290 struct btrfs_device *device;
282 struct request_queue *q; 291 struct request_queue *q;
283 struct fstrim_range range; 292 struct fstrim_range range;
284 u64 minlen = ULLONG_MAX; 293 u64 minlen = ULLONG_MAX;
285 u64 num_devices = 0; 294 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 295 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
287 int ret; 296 int ret;
288 297
289 if (!capable(CAP_SYS_ADMIN)) 298 if (!capable(CAP_SYS_ADMIN))
@@ -312,7 +321,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
312 321
313 range.len = min(range.len, total_bytes - range.start); 322 range.len = min(range.len, total_bytes - range.start);
314 range.minlen = max(range.minlen, minlen); 323 range.minlen = max(range.minlen, minlen);
315 ret = btrfs_trim_fs(root, &range); 324 ret = btrfs_trim_fs(fs_info->tree_root, &range);
316 if (ret < 0) 325 if (ret < 0)
317 return ret; 326 return ret;
318 327
@@ -358,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root,
358 return PTR_ERR(trans); 367 return PTR_ERR(trans);
359 368
360 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 369 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
361 0, objectid, NULL, 0, 0, 0); 370 0, objectid, NULL, 0, 0, 0, 0);
362 if (IS_ERR(leaf)) { 371 if (IS_ERR(leaf)) {
363 ret = PTR_ERR(leaf); 372 ret = PTR_ERR(leaf);
364 goto fail; 373 goto fail;
@@ -858,10 +867,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
858 return 0; 867 return 0;
859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 868 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
860 869
861 mutex_lock(&inode->i_mutex);
862 ret = btrfs_delalloc_reserve_space(inode, 870 ret = btrfs_delalloc_reserve_space(inode,
863 num_pages << PAGE_CACHE_SHIFT); 871 num_pages << PAGE_CACHE_SHIFT);
864 mutex_unlock(&inode->i_mutex);
865 if (ret) 872 if (ret)
866 return ret; 873 return ret;
867again: 874again:
@@ -1203,13 +1210,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1203 if (!capable(CAP_SYS_ADMIN)) 1210 if (!capable(CAP_SYS_ADMIN))
1204 return -EPERM; 1211 return -EPERM;
1205 1212
1213 mutex_lock(&root->fs_info->volume_mutex);
1214 if (root->fs_info->balance_ctl) {
1215 printk(KERN_INFO "btrfs: balance in progress\n");
1216 ret = -EINVAL;
1217 goto out;
1218 }
1219
1206 vol_args = memdup_user(arg, sizeof(*vol_args)); 1220 vol_args = memdup_user(arg, sizeof(*vol_args));
1207 if (IS_ERR(vol_args)) 1221 if (IS_ERR(vol_args)) {
1208 return PTR_ERR(vol_args); 1222 ret = PTR_ERR(vol_args);
1223 goto out;
1224 }
1209 1225
1210 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 1226 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1211 1227
1212 mutex_lock(&root->fs_info->volume_mutex);
1213 sizestr = vol_args->name; 1228 sizestr = vol_args->name;
1214 devstr = strchr(sizestr, ':'); 1229 devstr = strchr(sizestr, ':');
1215 if (devstr) { 1230 if (devstr) {
@@ -1226,7 +1241,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1226 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1241 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1227 (unsigned long long)devid); 1242 (unsigned long long)devid);
1228 ret = -EINVAL; 1243 ret = -EINVAL;
1229 goto out_unlock; 1244 goto out_free;
1230 } 1245 }
1231 if (!strcmp(sizestr, "max")) 1246 if (!strcmp(sizestr, "max"))
1232 new_size = device->bdev->bd_inode->i_size; 1247 new_size = device->bdev->bd_inode->i_size;
@@ -1241,7 +1256,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1241 new_size = memparse(sizestr, NULL); 1256 new_size = memparse(sizestr, NULL);
1242 if (new_size == 0) { 1257 if (new_size == 0) {
1243 ret = -EINVAL; 1258 ret = -EINVAL;
1244 goto out_unlock; 1259 goto out_free;
1245 } 1260 }
1246 } 1261 }
1247 1262
@@ -1250,7 +1265,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1250 if (mod < 0) { 1265 if (mod < 0) {
1251 if (new_size > old_size) { 1266 if (new_size > old_size) {
1252 ret = -EINVAL; 1267 ret = -EINVAL;
1253 goto out_unlock; 1268 goto out_free;
1254 } 1269 }
1255 new_size = old_size - new_size; 1270 new_size = old_size - new_size;
1256 } else if (mod > 0) { 1271 } else if (mod > 0) {
@@ -1259,11 +1274,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1259 1274
1260 if (new_size < 256 * 1024 * 1024) { 1275 if (new_size < 256 * 1024 * 1024) {
1261 ret = -EINVAL; 1276 ret = -EINVAL;
1262 goto out_unlock; 1277 goto out_free;
1263 } 1278 }
1264 if (new_size > device->bdev->bd_inode->i_size) { 1279 if (new_size > device->bdev->bd_inode->i_size) {
1265 ret = -EFBIG; 1280 ret = -EFBIG;
1266 goto out_unlock; 1281 goto out_free;
1267 } 1282 }
1268 1283
1269 do_div(new_size, root->sectorsize); 1284 do_div(new_size, root->sectorsize);
@@ -1276,7 +1291,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1276 trans = btrfs_start_transaction(root, 0); 1291 trans = btrfs_start_transaction(root, 0);
1277 if (IS_ERR(trans)) { 1292 if (IS_ERR(trans)) {
1278 ret = PTR_ERR(trans); 1293 ret = PTR_ERR(trans);
1279 goto out_unlock; 1294 goto out_free;
1280 } 1295 }
1281 ret = btrfs_grow_device(trans, device, new_size); 1296 ret = btrfs_grow_device(trans, device, new_size);
1282 btrfs_commit_transaction(trans, root); 1297 btrfs_commit_transaction(trans, root);
@@ -1284,9 +1299,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1284 ret = btrfs_shrink_device(device, new_size); 1299 ret = btrfs_shrink_device(device, new_size);
1285 } 1300 }
1286 1301
1287out_unlock: 1302out_free:
1288 mutex_unlock(&root->fs_info->volume_mutex);
1289 kfree(vol_args); 1303 kfree(vol_args);
1304out:
1305 mutex_unlock(&root->fs_info->volume_mutex);
1290 return ret; 1306 return ret;
1291} 1307}
1292 1308
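The relabelled error paths above follow the usual unwind idiom: the volume mutex is now taken before anything is allocated, so out_free undoes only the allocation and the final out label drops the mutex, releasing in reverse order of acquisition. A userspace skeleton of the same shape (names invented for the sketch):

    #include <stdlib.h>
    #include <pthread.h>

    static pthread_mutex_t volume_mutex = PTHREAD_MUTEX_INITIALIZER;

    static int resize_like(int fail)
    {
    	char *vol_args;
    	int ret = 0;

    	pthread_mutex_lock(&volume_mutex);	/* first acquire */

    	vol_args = malloc(64);
    	if (!vol_args) {
    		ret = -1;
    		goto out;		/* nothing else to undo */
    	}
    	if (fail) {
    		ret = -1;
    		goto out_free;		/* undo the allocation only */
    	}
    	/* ... real work; success also falls through the cleanup ... */
    out_free:
    	free(vol_args);
    out:
    	pthread_mutex_unlock(&volume_mutex);	/* last release */
    	return ret;
    }

    int main(void)
    {
    	resize_like(0);
    	resize_like(1);
    	return 0;
    }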
@@ -1855,7 +1871,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1855 goto out; 1871 goto out;
1856 } 1872 }
1857 1873
1858 err = mnt_want_write(file->f_path.mnt); 1874 err = mnt_want_write_file(file);
1859 if (err) 1875 if (err)
1860 goto out; 1876 goto out;
1861 1877
@@ -1971,7 +1987,7 @@ out_dput:
1971 dput(dentry); 1987 dput(dentry);
1972out_unlock_dir: 1988out_unlock_dir:
1973 mutex_unlock(&dir->i_mutex); 1989 mutex_unlock(&dir->i_mutex);
1974 mnt_drop_write(file->f_path.mnt); 1990 mnt_drop_write_file(file);
1975out: 1991out:
1976 kfree(vol_args); 1992 kfree(vol_args);
1977 return err; 1993 return err;
@@ -1987,7 +2003,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1987 if (btrfs_root_readonly(root)) 2003 if (btrfs_root_readonly(root))
1988 return -EROFS; 2004 return -EROFS;
1989 2005
1990 ret = mnt_want_write(file->f_path.mnt); 2006 ret = mnt_want_write_file(file);
1991 if (ret) 2007 if (ret)
1992 return ret; 2008 return ret;
1993 2009
@@ -2040,7 +2056,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2040 ret = -EINVAL; 2056 ret = -EINVAL;
2041 } 2057 }
2042out: 2058out:
2043 mnt_drop_write(file->f_path.mnt); 2059 mnt_drop_write_file(file);
2044 return ret; 2060 return ret;
2045} 2061}
2046 2062
@@ -2052,14 +2068,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2052 if (!capable(CAP_SYS_ADMIN)) 2068 if (!capable(CAP_SYS_ADMIN))
2053 return -EPERM; 2069 return -EPERM;
2054 2070
2071 mutex_lock(&root->fs_info->volume_mutex);
2072 if (root->fs_info->balance_ctl) {
2073 printk(KERN_INFO "btrfs: balance in progress\n");
2074 ret = -EINVAL;
2075 goto out;
2076 }
2077
2055 vol_args = memdup_user(arg, sizeof(*vol_args)); 2078 vol_args = memdup_user(arg, sizeof(*vol_args));
2056 if (IS_ERR(vol_args)) 2079 if (IS_ERR(vol_args)) {
2057 return PTR_ERR(vol_args); 2080 ret = PTR_ERR(vol_args);
2081 goto out;
2082 }
2058 2083
2059 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2084 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2060 ret = btrfs_init_new_device(root, vol_args->name); 2085 ret = btrfs_init_new_device(root, vol_args->name);
2061 2086
2062 kfree(vol_args); 2087 kfree(vol_args);
2088out:
2089 mutex_unlock(&root->fs_info->volume_mutex);
2063 return ret; 2090 return ret;
2064} 2091}
2065 2092
@@ -2074,14 +2101,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2074 if (root->fs_info->sb->s_flags & MS_RDONLY) 2101 if (root->fs_info->sb->s_flags & MS_RDONLY)
2075 return -EROFS; 2102 return -EROFS;
2076 2103
2104 mutex_lock(&root->fs_info->volume_mutex);
2105 if (root->fs_info->balance_ctl) {
2106 printk(KERN_INFO "btrfs: balance in progress\n");
2107 ret = -EINVAL;
2108 goto out;
2109 }
2110
2077 vol_args = memdup_user(arg, sizeof(*vol_args)); 2111 vol_args = memdup_user(arg, sizeof(*vol_args));
2078 if (IS_ERR(vol_args)) 2112 if (IS_ERR(vol_args)) {
2079 return PTR_ERR(vol_args); 2113 ret = PTR_ERR(vol_args);
2114 goto out;
2115 }
2080 2116
2081 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2117 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2082 ret = btrfs_rm_device(root, vol_args->name); 2118 ret = btrfs_rm_device(root, vol_args->name);
2083 2119
2084 kfree(vol_args); 2120 kfree(vol_args);
2121out:
2122 mutex_unlock(&root->fs_info->volume_mutex);
2085 return ret; 2123 return ret;
2086} 2124}
2087 2125
@@ -2195,7 +2233,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2195 if (btrfs_root_readonly(root)) 2233 if (btrfs_root_readonly(root))
2196 return -EROFS; 2234 return -EROFS;
2197 2235
2198 ret = mnt_want_write(file->f_path.mnt); 2236 ret = mnt_want_write_file(file);
2199 if (ret) 2237 if (ret)
2200 return ret; 2238 return ret;
2201 2239
@@ -2427,7 +2465,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2427 disko, diskl, 0, 2465 disko, diskl, 0,
2428 root->root_key.objectid, 2466 root->root_key.objectid,
2429 btrfs_ino(inode), 2467 btrfs_ino(inode),
2430 new_key.offset - datao); 2468 new_key.offset - datao,
2469 0);
2431 BUG_ON(ret); 2470 BUG_ON(ret);
2432 } 2471 }
2433 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 2472 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -2510,7 +2549,7 @@ out_unlock:
2510out_fput: 2549out_fput:
2511 fput(src_file); 2550 fput(src_file);
2512out_drop_write: 2551out_drop_write:
2513 mnt_drop_write(file->f_path.mnt); 2552 mnt_drop_write_file(file);
2514 return ret; 2553 return ret;
2515} 2554}
2516 2555
@@ -2549,7 +2588,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
2549 if (btrfs_root_readonly(root)) 2588 if (btrfs_root_readonly(root))
2550 goto out; 2589 goto out;
2551 2590
2552 ret = mnt_want_write(file->f_path.mnt); 2591 ret = mnt_want_write_file(file);
2553 if (ret) 2592 if (ret)
2554 goto out; 2593 goto out;
2555 2594
@@ -2565,7 +2604,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
2565 2604
2566out_drop: 2605out_drop:
2567 atomic_dec(&root->fs_info->open_ioctl_trans); 2606 atomic_dec(&root->fs_info->open_ioctl_trans);
2568 mnt_drop_write(file->f_path.mnt); 2607 mnt_drop_write_file(file);
2569out: 2608out:
2570 return ret; 2609 return ret;
2571} 2610}
@@ -2800,7 +2839,7 @@ long btrfs_ioctl_trans_end(struct file *file)
2800 2839
2801 atomic_dec(&root->fs_info->open_ioctl_trans); 2840 atomic_dec(&root->fs_info->open_ioctl_trans);
2802 2841
2803 mnt_drop_write(file->f_path.mnt); 2842 mnt_drop_write_file(file);
2804 return 0; 2843 return 0;
2805} 2844}
2806 2845
@@ -2977,7 +3016,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2977{ 3016{
2978 int ret = 0; 3017 int ret = 0;
2979 int size; 3018 int size;
2980 u64 extent_offset; 3019 u64 extent_item_pos;
2981 struct btrfs_ioctl_logical_ino_args *loi; 3020 struct btrfs_ioctl_logical_ino_args *loi;
2982 struct btrfs_data_container *inodes = NULL; 3021 struct btrfs_data_container *inodes = NULL;
2983 struct btrfs_path *path = NULL; 3022 struct btrfs_path *path = NULL;
@@ -3008,15 +3047,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3008 } 3047 }
3009 3048
3010 ret = extent_from_logical(root->fs_info, loi->logical, path, &key); 3049 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3050 btrfs_release_path(path);
3011 3051
3012 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) 3052 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3013 ret = -ENOENT; 3053 ret = -ENOENT;
3014 if (ret < 0) 3054 if (ret < 0)
3015 goto out; 3055 goto out;
3016 3056
3017 extent_offset = loi->logical - key.objectid; 3057 extent_item_pos = loi->logical - key.objectid;
3018 ret = iterate_extent_inodes(root->fs_info, path, key.objectid, 3058 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3019 extent_offset, build_ino_list, inodes); 3059 extent_item_pos, build_ino_list,
3060 inodes);
3020 3061
3021 if (ret < 0) 3062 if (ret < 0)
3022 goto out; 3063 goto out;
@@ -3034,6 +3075,163 @@ out:
3034 return ret; 3075 return ret;
3035} 3076}
3036 3077
3078void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
3079 struct btrfs_ioctl_balance_args *bargs)
3080{
3081 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3082
3083 bargs->flags = bctl->flags;
3084
3085 if (atomic_read(&fs_info->balance_running))
3086 bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
3087 if (atomic_read(&fs_info->balance_pause_req))
3088 bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
3089 if (atomic_read(&fs_info->balance_cancel_req))
3090 bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
3091
3092 memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
3093 memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
3094 memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
3095
3096 if (lock) {
3097 spin_lock(&fs_info->balance_lock);
3098 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3099 spin_unlock(&fs_info->balance_lock);
3100 } else {
3101 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3102 }
3103}
3104
3105static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
3106{
3107 struct btrfs_fs_info *fs_info = root->fs_info;
3108 struct btrfs_ioctl_balance_args *bargs;
3109 struct btrfs_balance_control *bctl;
3110 int ret;
3111
3112 if (!capable(CAP_SYS_ADMIN))
3113 return -EPERM;
3114
3115 if (fs_info->sb->s_flags & MS_RDONLY)
3116 return -EROFS;
3117
3118 mutex_lock(&fs_info->volume_mutex);
3119 mutex_lock(&fs_info->balance_mutex);
3120
3121 if (arg) {
3122 bargs = memdup_user(arg, sizeof(*bargs));
3123 if (IS_ERR(bargs)) {
3124 ret = PTR_ERR(bargs);
3125 goto out;
3126 }
3127
3128 if (bargs->flags & BTRFS_BALANCE_RESUME) {
3129 if (!fs_info->balance_ctl) {
3130 ret = -ENOTCONN;
3131 goto out_bargs;
3132 }
3133
3134 bctl = fs_info->balance_ctl;
3135 spin_lock(&fs_info->balance_lock);
3136 bctl->flags |= BTRFS_BALANCE_RESUME;
3137 spin_unlock(&fs_info->balance_lock);
3138
3139 goto do_balance;
3140 }
3141 } else {
3142 bargs = NULL;
3143 }
3144
3145 if (fs_info->balance_ctl) {
3146 ret = -EINPROGRESS;
3147 goto out_bargs;
3148 }
3149
3150 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3151 if (!bctl) {
3152 ret = -ENOMEM;
3153 goto out_bargs;
3154 }
3155
3156 bctl->fs_info = fs_info;
3157 if (arg) {
3158 memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
3159 memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
3160 memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
3161
3162 bctl->flags = bargs->flags;
3163 } else {
3164 /* balance everything - no filters */
3165 bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
3166 }
3167
3168do_balance:
3169 ret = btrfs_balance(bctl, bargs);
3170 /*
3171 * bctl is freed in __cancel_balance or in free_fs_info if
3172 * the restriper was paused all the way until unmount
3173 */
3174 if (arg) {
3175 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3176 ret = -EFAULT;
3177 }
3178
3179out_bargs:
3180 kfree(bargs);
3181out:
3182 mutex_unlock(&fs_info->balance_mutex);
3183 mutex_unlock(&fs_info->volume_mutex);
3184 return ret;
3185}
3186
3187static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
3188{
3189 if (!capable(CAP_SYS_ADMIN))
3190 return -EPERM;
3191
3192 switch (cmd) {
3193 case BTRFS_BALANCE_CTL_PAUSE:
3194 return btrfs_pause_balance(root->fs_info);
3195 case BTRFS_BALANCE_CTL_CANCEL:
3196 return btrfs_cancel_balance(root->fs_info);
3197 }
3198
3199 return -EINVAL;
3200}
3201
3202static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
3203 void __user *arg)
3204{
3205 struct btrfs_fs_info *fs_info = root->fs_info;
3206 struct btrfs_ioctl_balance_args *bargs;
3207 int ret = 0;
3208
3209 if (!capable(CAP_SYS_ADMIN))
3210 return -EPERM;
3211
3212 mutex_lock(&fs_info->balance_mutex);
3213 if (!fs_info->balance_ctl) {
3214 ret = -ENOTCONN;
3215 goto out;
3216 }
3217
3218 bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
3219 if (!bargs) {
3220 ret = -ENOMEM;
3221 goto out;
3222 }
3223
3224 update_ioctl_balance_args(fs_info, 1, bargs);
3225
3226 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3227 ret = -EFAULT;
3228
3229 kfree(bargs);
3230out:
3231 mutex_unlock(&fs_info->balance_mutex);
3232 return ret;
3233}
3234
3037long btrfs_ioctl(struct file *file, unsigned int 3235long btrfs_ioctl(struct file *file, unsigned int
3038 cmd, unsigned long arg) 3236 cmd, unsigned long arg)
3039{ 3237{
@@ -3078,7 +3276,7 @@ long btrfs_ioctl(struct file *file, unsigned int
3078 case BTRFS_IOC_DEV_INFO: 3276 case BTRFS_IOC_DEV_INFO:
3079 return btrfs_ioctl_dev_info(root, argp); 3277 return btrfs_ioctl_dev_info(root, argp);
3080 case BTRFS_IOC_BALANCE: 3278 case BTRFS_IOC_BALANCE:
3081 return btrfs_balance(root->fs_info->dev_root); 3279 return btrfs_ioctl_balance(root, NULL);
3082 case BTRFS_IOC_CLONE: 3280 case BTRFS_IOC_CLONE:
3083 return btrfs_ioctl_clone(file, arg, 0, 0, 0); 3281 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
3084 case BTRFS_IOC_CLONE_RANGE: 3282 case BTRFS_IOC_CLONE_RANGE:
@@ -3110,6 +3308,12 @@ long btrfs_ioctl(struct file *file, unsigned int
3110 return btrfs_ioctl_scrub_cancel(root, argp); 3308 return btrfs_ioctl_scrub_cancel(root, argp);
3111 case BTRFS_IOC_SCRUB_PROGRESS: 3309 case BTRFS_IOC_SCRUB_PROGRESS:
3112 return btrfs_ioctl_scrub_progress(root, argp); 3310 return btrfs_ioctl_scrub_progress(root, argp);
3311 case BTRFS_IOC_BALANCE_V2:
3312 return btrfs_ioctl_balance(root, argp);
3313 case BTRFS_IOC_BALANCE_CTL:
3314 return btrfs_ioctl_balance_ctl(root, arg);
3315 case BTRFS_IOC_BALANCE_PROGRESS:
3316 return btrfs_ioctl_balance_progress(root, argp);
3113 } 3317 }
3114 3318
3115 return -ENOTTY; 3319 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 252ae9915de8..4f69028a68c4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args {
109 __u64 reserved[124]; /* pad to 1k */ 109 __u64 reserved[124]; /* pad to 1k */
110}; 110};
111 111
112/* balance control ioctl modes */
113#define BTRFS_BALANCE_CTL_PAUSE 1
114#define BTRFS_BALANCE_CTL_CANCEL 2
115
116/*
117 * this is packed, because it should be exactly the same as its disk
118 * byte order counterpart (struct btrfs_disk_balance_args)
119 */
120struct btrfs_balance_args {
121 __u64 profiles;
122 __u64 usage;
123 __u64 devid;
124 __u64 pstart;
125 __u64 pend;
126 __u64 vstart;
127 __u64 vend;
128
129 __u64 target;
130
131 __u64 flags;
132
133 __u64 unused[8];
134} __attribute__ ((__packed__));
135
136/* report balance progress to userspace */
137struct btrfs_balance_progress {
138 __u64 expected; /* estimated # of chunks that will be
139 * relocated to fulfill the request */
140 __u64 considered; /* # of chunks we have considered so far */
141 __u64 completed; /* # of chunks relocated so far */
142};
143
144#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
145#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
146#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
147
148struct btrfs_ioctl_balance_args {
149 __u64 flags; /* in/out */
150 __u64 state; /* out */
151
152 struct btrfs_balance_args data; /* in/out */
153 struct btrfs_balance_args meta; /* in/out */
154 struct btrfs_balance_args sys; /* in/out */
155
156 struct btrfs_balance_progress stat; /* out */
157
158 __u64 unused[72]; /* pad to 1k */
159};
160
112#define BTRFS_INO_LOOKUP_PATH_MAX 4080 161#define BTRFS_INO_LOOKUP_PATH_MAX 4080
113struct btrfs_ioctl_ino_lookup_args { 162struct btrfs_ioctl_ino_lookup_args {
114 __u64 treeid; 163 __u64 treeid;
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args {
272 struct btrfs_ioctl_dev_info_args) 321 struct btrfs_ioctl_dev_info_args)
273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 322#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
274 struct btrfs_ioctl_fs_info_args) 323 struct btrfs_ioctl_fs_info_args)
324#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
325 struct btrfs_ioctl_balance_args)
326#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
327#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
328 struct btrfs_ioctl_balance_args)
275#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ 329#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
276 struct btrfs_ioctl_ino_path_args) 330 struct btrfs_ioctl_ino_path_args)
277#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ 331#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
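With the definitions above, userspace can poll a running balance through BTRFS_IOC_BALANCE_PROGRESS. A hedged sketch that re-declares the structures locally, mirroring the header above (BTRFS_IOCTL_MAGIC is 0x94 in this tree), and prints the progress counters:

    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/types.h>

    struct btrfs_balance_args {
    	__u64 profiles, usage, devid, pstart, pend, vstart, vend;
    	__u64 target;
    	__u64 flags;
    	__u64 unused[8];
    } __attribute__ ((__packed__));

    struct btrfs_balance_progress {
    	__u64 expected, considered, completed;
    };

    struct btrfs_ioctl_balance_args {
    	__u64 flags;
    	__u64 state;
    	struct btrfs_balance_args data, meta, sys;
    	struct btrfs_balance_progress stat;
    	__u64 unused[72];			/* pad to 1k */
    };

    #define BTRFS_IOCTL_MAGIC 0x94
    #define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
    					struct btrfs_ioctl_balance_args)

    int main(int argc, char **argv)
    {
    	struct btrfs_ioctl_balance_args ba;
    	int fd;

    	fd = open(argc > 1 ? argv[1] : "/mnt", O_RDONLY);
    	if (fd < 0) {
    		perror("open");
    		return 1;
    	}
    	memset(&ba, 0, sizeof(ba));
    	if (ioctl(fd, BTRFS_IOC_BALANCE_PROGRESS, &ba) < 0) {
    		perror("BTRFS_IOC_BALANCE_PROGRESS");
    		close(fd);
    		return 1;
    	}
    	printf("completed %llu of ~%llu chunks (%llu considered)\n",
    	       (unsigned long long)ba.stat.completed,
    	       (unsigned long long)ba.stat.expected,
    	       (unsigned long long)ba.stat.considered);
    	close(fd);
    	return 0;
    }

A failing ioctl with ENOTCONN means no balance is in progress, matching the -ENOTCONN return in btrfs_ioctl_balance_progress() above.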
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d77b67c4b275..5e178d8f7167 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
33 */ 33 */
34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) 34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
35{ 35{
36 if (eb->lock_nested) {
37 read_lock(&eb->lock);
38 if (eb->lock_nested && current->pid == eb->lock_owner) {
39 read_unlock(&eb->lock);
40 return;
41 }
42 read_unlock(&eb->lock);
43 }
36 if (rw == BTRFS_WRITE_LOCK) { 44 if (rw == BTRFS_WRITE_LOCK) {
37 if (atomic_read(&eb->blocking_writers) == 0) { 45 if (atomic_read(&eb->blocking_writers) == 0) {
38 WARN_ON(atomic_read(&eb->spinning_writers) != 1); 46 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
57 */ 65 */
58void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) 66void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
59{ 67{
68 if (eb->lock_nested) {
69 read_lock(&eb->lock);
70 if (eb->lock_nested && current->pid == eb->lock_owner) {
71 read_unlock(&eb->lock);
72 return;
73 }
74 read_unlock(&eb->lock);
75 }
60 if (rw == BTRFS_WRITE_LOCK_BLOCKING) { 76 if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
61 BUG_ON(atomic_read(&eb->blocking_writers) != 1); 77 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
62 write_lock(&eb->lock); 78 write_lock(&eb->lock);
@@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
81void btrfs_tree_read_lock(struct extent_buffer *eb) 97void btrfs_tree_read_lock(struct extent_buffer *eb)
82{ 98{
83again: 99again:
100 read_lock(&eb->lock);
101 if (atomic_read(&eb->blocking_writers) &&
102 current->pid == eb->lock_owner) {
103 /*
104 * This extent is already write-locked by our thread. We allow
105 * an additional read lock to be added because it's for the same
106 * thread. btrfs_find_all_roots() depends on this as it may be
107 * called on a partly (write-)locked tree.
108 */
109 BUG_ON(eb->lock_nested);
110 eb->lock_nested = 1;
111 read_unlock(&eb->lock);
112 return;
113 }
114 read_unlock(&eb->lock);
84 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); 115 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
85 read_lock(&eb->lock); 116 read_lock(&eb->lock);
86 if (atomic_read(&eb->blocking_writers)) { 117 if (atomic_read(&eb->blocking_writers)) {
87 read_unlock(&eb->lock); 118 read_unlock(&eb->lock);
88 wait_event(eb->write_lock_wq,
89 atomic_read(&eb->blocking_writers) == 0);
90 goto again; 119 goto again;
91 } 120 }
92 atomic_inc(&eb->read_locks); 121 atomic_inc(&eb->read_locks);
@@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
129 } 158 }
130 atomic_inc(&eb->write_locks); 159 atomic_inc(&eb->write_locks);
131 atomic_inc(&eb->spinning_writers); 160 atomic_inc(&eb->spinning_writers);
161 eb->lock_owner = current->pid;
132 return 1; 162 return 1;
133} 163}
134 164
@@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
137 */ 167 */
138void btrfs_tree_read_unlock(struct extent_buffer *eb) 168void btrfs_tree_read_unlock(struct extent_buffer *eb)
139{ 169{
170 if (eb->lock_nested) {
171 read_lock(&eb->lock);
172 if (eb->lock_nested && current->pid == eb->lock_owner) {
173 eb->lock_nested = 0;
174 read_unlock(&eb->lock);
175 return;
176 }
177 read_unlock(&eb->lock);
178 }
140 btrfs_assert_tree_read_locked(eb); 179 btrfs_assert_tree_read_locked(eb);
141 WARN_ON(atomic_read(&eb->spinning_readers) == 0); 180 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
142 atomic_dec(&eb->spinning_readers); 181 atomic_dec(&eb->spinning_readers);
@@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
149 */ 188 */
150void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) 189void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
151{ 190{
191 if (eb->lock_nested) {
192 read_lock(&eb->lock);
193 if (eb->lock_nested && current->pid == eb->lock_owner) {
194 eb->lock_nested = 0;
195 read_unlock(&eb->lock);
196 return;
197 }
198 read_unlock(&eb->lock);
199 }
152 btrfs_assert_tree_read_locked(eb); 200 btrfs_assert_tree_read_locked(eb);
153 WARN_ON(atomic_read(&eb->blocking_readers) == 0); 201 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
154 if (atomic_dec_and_test(&eb->blocking_readers)) 202 if (atomic_dec_and_test(&eb->blocking_readers))
@@ -181,6 +229,7 @@ again:
181 WARN_ON(atomic_read(&eb->spinning_writers)); 229 WARN_ON(atomic_read(&eb->spinning_writers));
182 atomic_inc(&eb->spinning_writers); 230 atomic_inc(&eb->spinning_writers);
183 atomic_inc(&eb->write_locks); 231 atomic_inc(&eb->write_locks);
232 eb->lock_owner = current->pid;
184 return 0; 233 return 0;
185} 234}
186 235
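
The locking.c hunks above teach the tree locks one level of recursion: a write locker records its pid in eb->lock_owner, and a later read-lock attempt by the same thread only sets eb->lock_nested instead of deadlocking on the rwlock (btrfs_find_all_roots needs this on partly write-locked trees). Below is a userspace analog of that owner-check idea, with invented names and none of the kernel's spinlock re-checking; it is kept single-threaded on purpose so the unlocked owner reads are safe here.

/* Sketch of a recursion-aware read lock: a thread that already owns the
 * write lock is allowed one nested read acquisition, tracked by an owner
 * id and a nested flag. Build with -pthread. Not the kernel code.
 */
#include <pthread.h>
#include <stdio.h>

struct nest_lock {
	pthread_rwlock_t lock;
	pthread_t owner;	/* valid only while write-locked */
	int write_locked;
	int nested;
};

static void nest_write_lock(struct nest_lock *nl)
{
	pthread_rwlock_wrlock(&nl->lock);
	nl->owner = pthread_self();
	nl->write_locked = 1;
}

static void nest_read_lock(struct nest_lock *nl)
{
	/* Same thread already holds the write lock: don't deadlock, just
	 * remember that one nested read lock is outstanding. The kernel
	 * version re-checks this under a spinlock; the demo is single-
	 * threaded, so the plain reads are fine here. */
	if (nl->write_locked && pthread_equal(nl->owner, pthread_self())) {
		nl->nested = 1;
		return;
	}
	pthread_rwlock_rdlock(&nl->lock);
}

static void nest_read_unlock(struct nest_lock *nl)
{
	if (nl->nested && pthread_equal(nl->owner, pthread_self())) {
		nl->nested = 0;
		return;
	}
	pthread_rwlock_unlock(&nl->lock);
}

static void nest_write_unlock(struct nest_lock *nl)
{
	nl->write_locked = 0;
	pthread_rwlock_unlock(&nl->lock);
}

int main(void)
{
	struct nest_lock nl = { .lock = PTHREAD_RWLOCK_INITIALIZER };

	nest_write_lock(&nl);
	nest_read_lock(&nl);	/* nested: returns without blocking */
	nest_read_unlock(&nl);
	nest_write_unlock(&nl);
	puts("nested read lock inside write lock did not deadlock");
	return 0;
}
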
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cfb55434a469..8c1aae2c845d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
1604 ret = btrfs_inc_extent_ref(trans, root, new_bytenr, 1604 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1605 num_bytes, parent, 1605 num_bytes, parent,
1606 btrfs_header_owner(leaf), 1606 btrfs_header_owner(leaf),
1607 key.objectid, key.offset); 1607 key.objectid, key.offset, 1);
1608 BUG_ON(ret); 1608 BUG_ON(ret);
1609 1609
1610 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1610 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1611 parent, btrfs_header_owner(leaf), 1611 parent, btrfs_header_owner(leaf),
1612 key.objectid, key.offset); 1612 key.objectid, key.offset, 1);
1613 BUG_ON(ret); 1613 BUG_ON(ret);
1614 } 1614 }
1615 if (dirty) 1615 if (dirty)
@@ -1778,21 +1778,23 @@ again:
1778 1778
1779 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, 1779 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1780 path->nodes[level]->start, 1780 path->nodes[level]->start,
1781 src->root_key.objectid, level - 1, 0); 1781 src->root_key.objectid, level - 1, 0,
1782 1);
1782 BUG_ON(ret); 1783 BUG_ON(ret);
1783 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 1784 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1784 0, dest->root_key.objectid, level - 1, 1785 0, dest->root_key.objectid, level - 1,
1785 0); 1786 0, 1);
1786 BUG_ON(ret); 1787 BUG_ON(ret);
1787 1788
1788 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, 1789 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1789 path->nodes[level]->start, 1790 path->nodes[level]->start,
1790 src->root_key.objectid, level - 1, 0); 1791 src->root_key.objectid, level - 1, 0,
1792 1);
1791 BUG_ON(ret); 1793 BUG_ON(ret);
1792 1794
1793 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 1795 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1794 0, dest->root_key.objectid, level - 1, 1796 0, dest->root_key.objectid, level - 1,
1795 0); 1797 0, 1);
1796 BUG_ON(ret); 1798 BUG_ON(ret);
1797 1799
1798 btrfs_unlock_up_safe(path, 0); 1800 btrfs_unlock_up_safe(path, 0);
@@ -2244,7 +2246,7 @@ again:
2244 } else { 2246 } else {
2245 list_del_init(&reloc_root->root_list); 2247 list_del_init(&reloc_root->root_list);
2246 } 2248 }
2247 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); 2249 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2248 } 2250 }
2249 2251
2250 if (found) { 2252 if (found) {
@@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2558 node->eb->start, blocksize, 2560 node->eb->start, blocksize,
2559 upper->eb->start, 2561 upper->eb->start,
2560 btrfs_header_owner(upper->eb), 2562 btrfs_header_owner(upper->eb),
2561 node->level, 0); 2563 node->level, 0, 1);
2562 BUG_ON(ret); 2564 BUG_ON(ret);
2563 2565
2564 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2566 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2947,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2949 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2950 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2949 while (index <= last_index) { 2951 while (index <= last_index) {
2950 mutex_lock(&inode->i_mutex);
2951 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); 2952 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2952 mutex_unlock(&inode->i_mutex);
2953 if (ret) 2953 if (ret)
2954 goto out; 2954 goto out;
2955 2955
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ddf2c90d3fc0..9770cc5bfb76 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "backref.h" 26#include "backref.h"
27#include "extent_io.h" 27#include "extent_io.h"
28#include "check-integrity.h"
28 29
29/* 30/*
30 * This is only the first step towards a full-features scrub. It reads all 31 * This is only the first step towards a full-features scrub. It reads all
@@ -309,7 +310,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
309 u8 ref_level; 310 u8 ref_level;
310 unsigned long ptr = 0; 311 unsigned long ptr = 0;
311 const int bufsize = 4096; 312 const int bufsize = 4096;
312 u64 extent_offset; 313 u64 extent_item_pos;
313 314
314 path = btrfs_alloc_path(); 315 path = btrfs_alloc_path();
315 316
@@ -329,12 +330,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
329 if (ret < 0) 330 if (ret < 0)
330 goto out; 331 goto out;
331 332
332 extent_offset = swarn.logical - found_key.objectid; 333 extent_item_pos = swarn.logical - found_key.objectid;
333 swarn.extent_item_size = found_key.offset; 334 swarn.extent_item_size = found_key.offset;
334 335
335 eb = path->nodes[0]; 336 eb = path->nodes[0];
336 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 337 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
337 item_size = btrfs_item_size_nr(eb, path->slots[0]); 338 item_size = btrfs_item_size_nr(eb, path->slots[0]);
339 btrfs_release_path(path);
338 340
339 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 341 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
340 do { 342 do {
@@ -351,7 +353,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
351 } else { 353 } else {
352 swarn.path = path; 354 swarn.path = path;
353 iterate_extent_inodes(fs_info, path, found_key.objectid, 355 iterate_extent_inodes(fs_info, path, found_key.objectid,
354 extent_offset, 356 extent_item_pos,
355 scrub_print_warning_inode, &swarn); 357 scrub_print_warning_inode, &swarn);
356 } 358 }
357 359
@@ -732,7 +734,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
732 bio_add_page(bio, page, PAGE_SIZE, 0); 734 bio_add_page(bio, page, PAGE_SIZE, 0);
733 bio->bi_end_io = scrub_fixup_end_io; 735 bio->bi_end_io = scrub_fixup_end_io;
734 bio->bi_private = &complete; 736 bio->bi_private = &complete;
735 submit_bio(rw, bio); 737 btrfsic_submit_bio(rw, bio);
736 738
737 /* this will also unplug the queue */ 739 /* this will also unplug the queue */
738 wait_for_completion(&complete); 740 wait_for_completion(&complete);
@@ -958,7 +960,7 @@ static int scrub_submit(struct scrub_dev *sdev)
958 sdev->curr = -1; 960 sdev->curr = -1;
959 atomic_inc(&sdev->in_flight); 961 atomic_inc(&sdev->in_flight);
960 962
961 submit_bio(READ, sbio->bio); 963 btrfsic_submit_bio(READ, sbio->bio);
962 964
963 return 0; 965 return 0;
964} 966}
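
Scrub's bios now go through btrfsic_submit_bio() so the optional integrity checker can observe every request before it reaches the device; when CONFIG_BTRFS_FS_CHECK_INTEGRITY is off, the wrapper presumably collapses to plain submit_bio(). The userspace sketch below shows only that compile-time pass-through pattern; every name besides the pattern itself is illustrative.

/* Pass-through pattern sketch: with the checker compiled out, the wrapper
 * macro-aliases to the plain submit function, so call sites pay nothing.
 * Compile with -DCHECK_INTEGRITY to take the inspecting path.
 */
#include <stdio.h>

struct bio { int sector; };

static void submit_bio(int rw, struct bio *bio)
{
	printf("submit rw=%d sector=%d\n", rw, bio->sector);
}

#ifdef CHECK_INTEGRITY
static void checked_submit_bio(int rw, struct bio *bio)
{
	printf("integrity check sector=%d\n", bio->sector);	/* inspect first */
	submit_bio(rw, bio);
}
#else
#define checked_submit_bio submit_bio
#endif

int main(void)
{
	struct bio b = { .sector = 42 };

	checked_submit_bio(1, &b);
	return 0;
}
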
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 200f63bc6675..3ce97b217cbe 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,7 +40,6 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h>
44#include <linux/ratelimit.h> 43#include <linux/ratelimit.h>
45#include "compat.h" 44#include "compat.h"
46#include "delayed-inode.h" 45#include "delayed-inode.h"
@@ -148,13 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
148 147
149static void btrfs_put_super(struct super_block *sb) 148static void btrfs_put_super(struct super_block *sb)
150{ 149{
151 struct btrfs_root *root = btrfs_sb(sb); 150 (void)close_ctree(btrfs_sb(sb)->tree_root);
152 int ret; 151 /* FIXME: need to fix VFS to return error? */
153 152 /* AV: return it _where_? ->put_super() can be triggered by any number
154 ret = close_ctree(root); 153 * of async events, up to and including delivery of SIGKILL to the
155 sb->s_fs_info = NULL; 154 * last process that kept it busy. Or segfault in the aforementioned
156 155 * process... Whom would you report that to?
157 (void)ret; /* FIXME: need to fix VFS to return error? */ 156 */
158} 157}
159 158
160enum { 159enum {
@@ -164,8 +163,11 @@ enum {
164 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 163 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
165 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
166 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
167 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
168 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, 167 Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
168 Opt_check_integrity, Opt_check_integrity_including_extent_data,
169 Opt_check_integrity_print_mask,
170 Opt_err,
169}; 171};
170 172
171static match_table_t tokens = { 173static match_table_t tokens = {
@@ -200,6 +202,10 @@ static match_table_t tokens = {
200 {Opt_inode_cache, "inode_cache"}, 202 {Opt_inode_cache, "inode_cache"},
201 {Opt_no_space_cache, "nospace_cache"}, 203 {Opt_no_space_cache, "nospace_cache"},
202 {Opt_recovery, "recovery"}, 204 {Opt_recovery, "recovery"},
205 {Opt_skip_balance, "skip_balance"},
206 {Opt_check_integrity, "check_int"},
207 {Opt_check_integrity_including_extent_data, "check_int_data"},
208 {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
203 {Opt_err, NULL}, 209 {Opt_err, NULL},
204}; 210};
205 211
@@ -398,6 +404,40 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
398 printk(KERN_INFO "btrfs: enabling auto recovery"); 404 printk(KERN_INFO "btrfs: enabling auto recovery");
399 btrfs_set_opt(info->mount_opt, RECOVERY); 405 btrfs_set_opt(info->mount_opt, RECOVERY);
400 break; 406 break;
407 case Opt_skip_balance:
408 btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
409 break;
410#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
411 case Opt_check_integrity_including_extent_data:
412 printk(KERN_INFO "btrfs: enabling check integrity"
413 " including extent data\n");
414 btrfs_set_opt(info->mount_opt,
415 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
416 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
417 break;
418 case Opt_check_integrity:
419 printk(KERN_INFO "btrfs: enabling check integrity\n");
420 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
421 break;
422 case Opt_check_integrity_print_mask:
423 intarg = 0;
424 match_int(&args[0], &intarg);
425 if (intarg) {
426 info->check_integrity_print_mask = intarg;
427 printk(KERN_INFO "btrfs:"
428 " check_integrity_print_mask 0x%x\n",
429 info->check_integrity_print_mask);
430 }
431 break;
432#else
433 case Opt_check_integrity_including_extent_data:
434 case Opt_check_integrity:
435 case Opt_check_integrity_print_mask:
436 printk(KERN_ERR "btrfs: support for check_integrity*"
437 " not compiled in!\n");
438 ret = -EINVAL;
439 goto out;
440#endif
401 case Opt_err: 441 case Opt_err:
402 printk(KERN_INFO "btrfs: unrecognized mount option " 442 printk(KERN_INFO "btrfs: unrecognized mount option "
403 "'%s'\n", p); 443 "'%s'\n", p);
@@ -501,7 +541,8 @@ out:
501static struct dentry *get_default_root(struct super_block *sb, 541static struct dentry *get_default_root(struct super_block *sb,
502 u64 subvol_objectid) 542 u64 subvol_objectid)
503{ 543{
504 struct btrfs_root *root = sb->s_fs_info; 544 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
545 struct btrfs_root *root = fs_info->tree_root;
505 struct btrfs_root *new_root; 546 struct btrfs_root *new_root;
506 struct btrfs_dir_item *di; 547 struct btrfs_dir_item *di;
507 struct btrfs_path *path; 548 struct btrfs_path *path;
@@ -531,7 +572,7 @@ static struct dentry *get_default_root(struct super_block *sb,
531 * will mount by default if we haven't been given a specific subvolume 572 * will mount by default if we haven't been given a specific subvolume
532 * to mount. 573 * to mount.
533 */ 574 */
534 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 575 dir_id = btrfs_super_root_dir(fs_info->super_copy);
535 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 576 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
536 if (IS_ERR(di)) { 577 if (IS_ERR(di)) {
537 btrfs_free_path(path); 578 btrfs_free_path(path);
@@ -545,7 +586,7 @@ static struct dentry *get_default_root(struct super_block *sb,
545 */ 586 */
546 btrfs_free_path(path); 587 btrfs_free_path(path);
547 dir_id = BTRFS_FIRST_FREE_OBJECTID; 588 dir_id = BTRFS_FIRST_FREE_OBJECTID;
548 new_root = root->fs_info->fs_root; 589 new_root = fs_info->fs_root;
549 goto setup_root; 590 goto setup_root;
550 } 591 }
551 592
@@ -553,7 +594,7 @@ static struct dentry *get_default_root(struct super_block *sb,
553 btrfs_free_path(path); 594 btrfs_free_path(path);
554 595
555find_root: 596find_root:
556 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 597 new_root = btrfs_read_fs_root_no_name(fs_info, &location);
557 if (IS_ERR(new_root)) 598 if (IS_ERR(new_root))
558 return ERR_CAST(new_root); 599 return ERR_CAST(new_root);
559 600
@@ -589,7 +630,7 @@ static int btrfs_fill_super(struct super_block *sb,
589{ 630{
590 struct inode *inode; 631 struct inode *inode;
591 struct dentry *root_dentry; 632 struct dentry *root_dentry;
592 struct btrfs_root *tree_root; 633 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
593 struct btrfs_key key; 634 struct btrfs_key key;
594 int err; 635 int err;
595 636
@@ -604,18 +645,16 @@ static int btrfs_fill_super(struct super_block *sb,
604 sb->s_flags |= MS_POSIXACL; 645 sb->s_flags |= MS_POSIXACL;
605#endif 646#endif
606 647
607 tree_root = open_ctree(sb, fs_devices, (char *)data); 648 err = open_ctree(sb, fs_devices, (char *)data);
608 649 if (err) {
609 if (IS_ERR(tree_root)) {
610 printk("btrfs: open_ctree failed\n"); 650 printk("btrfs: open_ctree failed\n");
611 return PTR_ERR(tree_root); 651 return err;
612 } 652 }
613 sb->s_fs_info = tree_root;
614 653
615 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 654 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
616 key.type = BTRFS_INODE_ITEM_KEY; 655 key.type = BTRFS_INODE_ITEM_KEY;
617 key.offset = 0; 656 key.offset = 0;
618 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); 657 inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
619 if (IS_ERR(inode)) { 658 if (IS_ERR(inode)) {
620 err = PTR_ERR(inode); 659 err = PTR_ERR(inode);
621 goto fail_close; 660 goto fail_close;
@@ -632,23 +671,25 @@ static int btrfs_fill_super(struct super_block *sb,
632 671
633 save_mount_options(sb, data); 672 save_mount_options(sb, data);
634 cleancache_init_fs(sb); 673 cleancache_init_fs(sb);
674 sb->s_flags |= MS_ACTIVE;
635 return 0; 675 return 0;
636 676
637fail_close: 677fail_close:
638 close_ctree(tree_root); 678 close_ctree(fs_info->tree_root);
639 return err; 679 return err;
640} 680}
641 681
642int btrfs_sync_fs(struct super_block *sb, int wait) 682int btrfs_sync_fs(struct super_block *sb, int wait)
643{ 683{
644 struct btrfs_trans_handle *trans; 684 struct btrfs_trans_handle *trans;
645 struct btrfs_root *root = btrfs_sb(sb); 685 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
686 struct btrfs_root *root = fs_info->tree_root;
646 int ret; 687 int ret;
647 688
648 trace_btrfs_sync_fs(wait); 689 trace_btrfs_sync_fs(wait);
649 690
650 if (!wait) { 691 if (!wait) {
651 filemap_flush(root->fs_info->btree_inode->i_mapping); 692 filemap_flush(fs_info->btree_inode->i_mapping);
652 return 0; 693 return 0;
653 } 694 }
654 695
@@ -662,10 +703,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
662 return ret; 703 return ret;
663} 704}
664 705
665static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) 706static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
666{ 707{
667 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); 708 struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
668 struct btrfs_fs_info *info = root->fs_info; 709 struct btrfs_root *root = info->tree_root;
669 char *compress_type; 710 char *compress_type;
670 711
671 if (btrfs_test_opt(root, DEGRADED)) 712 if (btrfs_test_opt(root, DEGRADED))
@@ -723,28 +764,25 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
723 seq_puts(seq, ",autodefrag"); 764 seq_puts(seq, ",autodefrag");
724 if (btrfs_test_opt(root, INODE_MAP_CACHE)) 765 if (btrfs_test_opt(root, INODE_MAP_CACHE))
725 seq_puts(seq, ",inode_cache"); 766 seq_puts(seq, ",inode_cache");
767 if (btrfs_test_opt(root, SKIP_BALANCE))
768 seq_puts(seq, ",skip_balance");
726 return 0; 769 return 0;
727} 770}
728 771
729static int btrfs_test_super(struct super_block *s, void *data) 772static int btrfs_test_super(struct super_block *s, void *data)
730{ 773{
731 struct btrfs_root *test_root = data; 774 struct btrfs_fs_info *p = data;
732 struct btrfs_root *root = btrfs_sb(s); 775 struct btrfs_fs_info *fs_info = btrfs_sb(s);
733 776
734 /* 777 return fs_info->fs_devices == p->fs_devices;
735 * If this super block is going away, return false as it
736 * can't match as an existing super block.
737 */
738 if (!atomic_read(&s->s_active))
739 return 0;
740 return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
741} 778}
742 779
743static int btrfs_set_super(struct super_block *s, void *data) 780static int btrfs_set_super(struct super_block *s, void *data)
744{ 781{
745 s->s_fs_info = data; 782 int err = set_anon_super(s, data);
746 783 if (!err)
747 return set_anon_super(s, data); 784 s->s_fs_info = data;
785 return err;
748} 786}
749 787
750/* 788/*
@@ -904,12 +942,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
904 if (!fs_info) 942 if (!fs_info)
905 return ERR_PTR(-ENOMEM); 943 return ERR_PTR(-ENOMEM);
906 944
907 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
908 if (!fs_info->tree_root) {
909 error = -ENOMEM;
910 goto error_fs_info;
911 }
912 fs_info->tree_root->fs_info = fs_info;
913 fs_info->fs_devices = fs_devices; 945 fs_info->fs_devices = fs_devices;
914 946
915 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 947 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
@@ -929,43 +961,30 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
929 } 961 }
930 962
931 bdev = fs_devices->latest_bdev; 963 bdev = fs_devices->latest_bdev;
932 s = sget(fs_type, btrfs_test_super, btrfs_set_super, 964 s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
933 fs_info->tree_root);
934 if (IS_ERR(s)) { 965 if (IS_ERR(s)) {
935 error = PTR_ERR(s); 966 error = PTR_ERR(s);
936 goto error_close_devices; 967 goto error_close_devices;
937 } 968 }
938 969
939 if (s->s_root) { 970 if (s->s_root) {
940 if ((flags ^ s->s_flags) & MS_RDONLY) {
941 deactivate_locked_super(s);
942 error = -EBUSY;
943 goto error_close_devices;
944 }
945
946 btrfs_close_devices(fs_devices); 971 btrfs_close_devices(fs_devices);
947 free_fs_info(fs_info); 972 free_fs_info(fs_info);
973 if ((flags ^ s->s_flags) & MS_RDONLY)
974 error = -EBUSY;
948 } else { 975 } else {
949 char b[BDEVNAME_SIZE]; 976 char b[BDEVNAME_SIZE];
950 977
951 s->s_flags = flags | MS_NOSEC; 978 s->s_flags = flags | MS_NOSEC;
952 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 979 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
953 btrfs_sb(s)->fs_info->bdev_holder = fs_type; 980 btrfs_sb(s)->bdev_holder = fs_type;
954 error = btrfs_fill_super(s, fs_devices, data, 981 error = btrfs_fill_super(s, fs_devices, data,
955 flags & MS_SILENT ? 1 : 0); 982 flags & MS_SILENT ? 1 : 0);
956 if (error) {
957 deactivate_locked_super(s);
958 return ERR_PTR(error);
959 }
960
961 s->s_flags |= MS_ACTIVE;
962 } 983 }
963 984
964 root = get_default_root(s, subvol_objectid); 985 root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
965 if (IS_ERR(root)) { 986 if (IS_ERR(root))
966 deactivate_locked_super(s); 987 deactivate_locked_super(s);
967 return root;
968 }
969 988
970 return root; 989 return root;
971 990
@@ -978,7 +997,8 @@ error_fs_info:
978 997
979static int btrfs_remount(struct super_block *sb, int *flags, char *data) 998static int btrfs_remount(struct super_block *sb, int *flags, char *data)
980{ 999{
981 struct btrfs_root *root = btrfs_sb(sb); 1000 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1001 struct btrfs_root *root = fs_info->tree_root;
982 int ret; 1002 int ret;
983 1003
984 ret = btrfs_parse_options(root, data); 1004 ret = btrfs_parse_options(root, data);
@@ -994,13 +1014,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
994 ret = btrfs_commit_super(root); 1014 ret = btrfs_commit_super(root);
995 WARN_ON(ret); 1015 WARN_ON(ret);
996 } else { 1016 } else {
997 if (root->fs_info->fs_devices->rw_devices == 0) 1017 if (fs_info->fs_devices->rw_devices == 0)
998 return -EACCES; 1018 return -EACCES;
999 1019
1000 if (btrfs_super_log_root(root->fs_info->super_copy) != 0) 1020 if (btrfs_super_log_root(fs_info->super_copy) != 0)
1001 return -EINVAL; 1021 return -EINVAL;
1002 1022
1003 ret = btrfs_cleanup_fs_roots(root->fs_info); 1023 ret = btrfs_cleanup_fs_roots(fs_info);
1004 WARN_ON(ret); 1024 WARN_ON(ret);
1005 1025
1006 /* recover relocation */ 1026 /* recover relocation */
@@ -1169,18 +1189,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1169 1189
1170static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1190static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1171{ 1191{
1172 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1192 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
1173 struct btrfs_super_block *disk_super = root->fs_info->super_copy; 1193 struct btrfs_super_block *disk_super = fs_info->super_copy;
1174 struct list_head *head = &root->fs_info->space_info; 1194 struct list_head *head = &fs_info->space_info;
1175 struct btrfs_space_info *found; 1195 struct btrfs_space_info *found;
1176 u64 total_used = 0; 1196 u64 total_used = 0;
1177 u64 total_free_data = 0; 1197 u64 total_free_data = 0;
1178 int bits = dentry->d_sb->s_blocksize_bits; 1198 int bits = dentry->d_sb->s_blocksize_bits;
1179 __be32 *fsid = (__be32 *)root->fs_info->fsid; 1199 __be32 *fsid = (__be32 *)fs_info->fsid;
1180 int ret; 1200 int ret;
1181 1201
 1182 /* holding chunk_mutex to avoid allocating new chunks */ 1202
1183 mutex_lock(&root->fs_info->chunk_mutex); 1203 mutex_lock(&fs_info->chunk_mutex);
1184 rcu_read_lock(); 1204 rcu_read_lock();
1185 list_for_each_entry_rcu(found, head, list) { 1205 list_for_each_entry_rcu(found, head, list) {
1186 if (found->flags & BTRFS_BLOCK_GROUP_DATA) { 1206 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1199,14 +1219,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1199 buf->f_bsize = dentry->d_sb->s_blocksize; 1219 buf->f_bsize = dentry->d_sb->s_blocksize;
1200 buf->f_type = BTRFS_SUPER_MAGIC; 1220 buf->f_type = BTRFS_SUPER_MAGIC;
1201 buf->f_bavail = total_free_data; 1221 buf->f_bavail = total_free_data;
1202 ret = btrfs_calc_avail_data_space(root, &total_free_data); 1222 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1203 if (ret) { 1223 if (ret) {
1204 mutex_unlock(&root->fs_info->chunk_mutex); 1224 mutex_unlock(&fs_info->chunk_mutex);
1205 return ret; 1225 return ret;
1206 } 1226 }
1207 buf->f_bavail += total_free_data; 1227 buf->f_bavail += total_free_data;
1208 buf->f_bavail = buf->f_bavail >> bits; 1228 buf->f_bavail = buf->f_bavail >> bits;
1209 mutex_unlock(&root->fs_info->chunk_mutex); 1229 mutex_unlock(&fs_info->chunk_mutex);
1210 1230
1211 /* We treat it as constant endianness (it doesn't matter _which_) 1231 /* We treat it as constant endianness (it doesn't matter _which_)
1212 because we want the fsid to come out the same whether mounted 1232 because we want the fsid to come out the same whether mounted
@@ -1220,11 +1240,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1220 return 0; 1240 return 0;
1221} 1241}
1222 1242
1243static void btrfs_kill_super(struct super_block *sb)
1244{
1245 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1246 kill_anon_super(sb);
1247 free_fs_info(fs_info);
1248}
1249
1223static struct file_system_type btrfs_fs_type = { 1250static struct file_system_type btrfs_fs_type = {
1224 .owner = THIS_MODULE, 1251 .owner = THIS_MODULE,
1225 .name = "btrfs", 1252 .name = "btrfs",
1226 .mount = btrfs_mount, 1253 .mount = btrfs_mount,
1227 .kill_sb = kill_anon_super, 1254 .kill_sb = btrfs_kill_super,
1228 .fs_flags = FS_REQUIRES_DEV, 1255 .fs_flags = FS_REQUIRES_DEV,
1229}; 1256};
1230 1257
@@ -1258,17 +1285,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1258 1285
1259static int btrfs_freeze(struct super_block *sb) 1286static int btrfs_freeze(struct super_block *sb)
1260{ 1287{
1261 struct btrfs_root *root = btrfs_sb(sb); 1288 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1262 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1289 mutex_lock(&fs_info->transaction_kthread_mutex);
1263 mutex_lock(&root->fs_info->cleaner_mutex); 1290 mutex_lock(&fs_info->cleaner_mutex);
1264 return 0; 1291 return 0;
1265} 1292}
1266 1293
1267static int btrfs_unfreeze(struct super_block *sb) 1294static int btrfs_unfreeze(struct super_block *sb)
1268{ 1295{
1269 struct btrfs_root *root = btrfs_sb(sb); 1296 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1270 mutex_unlock(&root->fs_info->cleaner_mutex); 1297 mutex_unlock(&fs_info->cleaner_mutex);
1271 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1298 mutex_unlock(&fs_info->transaction_kthread_mutex);
1272 return 0; 1299 return 0;
1273} 1300}
1274 1301
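
The parser additions above map each new mount-option token to an enum, and check_int_print_mask=%d carries an integer argument extracted with match_int(). A trimmed userspace analog of that token-table flow, covering three of the options, is sketched below; it stands in for the kernel's match_table_t machinery and uses sscanf's %i (which also accepts 0x-prefixed values) where the kernel's match_int parses plain decimal.

/* Simplified option-table parsing in the spirit of btrfs_parse_options().
 * Option names match the patch; everything else is illustrative.
 */
#include <stdio.h>
#include <string.h>

enum { OPT_SKIP_BALANCE, OPT_CHECK_INT, OPT_CHECK_INT_PRINT_MASK, OPT_ERR };

static int match_option(const char *p, int *intarg)
{
	if (strcmp(p, "skip_balance") == 0)
		return OPT_SKIP_BALANCE;
	if (strcmp(p, "check_int") == 0)
		return OPT_CHECK_INT;
	if (sscanf(p, "check_int_print_mask=%i", intarg) == 1)
		return OPT_CHECK_INT_PRINT_MASK;
	return OPT_ERR;
}

int main(void)
{
	char opts[] = "skip_balance,check_int_print_mask=0x3";
	char *p = strtok(opts, ",");

	while (p) {
		int intarg = 0;

		switch (match_option(p, &intarg)) {
		case OPT_SKIP_BALANCE:
			puts("skipping balance resume at mount");
			break;
		case OPT_CHECK_INT:
			puts("enabling integrity checker");
			break;
		case OPT_CHECK_INT_PRINT_MASK:
			printf("check_integrity_print_mask 0x%x\n", intarg);
			break;
		default:
			printf("unrecognized mount option '%s'\n", p);
		}
		p = strtok(NULL, ",");
	}
	return 0;
}
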
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 81376d94cd3c..287a6728b1ad 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
36 WARN_ON(atomic_read(&transaction->use_count) == 0); 36 WARN_ON(atomic_read(&transaction->use_count) == 0);
37 if (atomic_dec_and_test(&transaction->use_count)) { 37 if (atomic_dec_and_test(&transaction->use_count)) {
38 BUG_ON(!list_empty(&transaction->list)); 38 BUG_ON(!list_empty(&transaction->list));
39 WARN_ON(transaction->delayed_refs.root.rb_node);
40 WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
39 memset(transaction, 0, sizeof(*transaction)); 41 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction); 42 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 } 43 }
@@ -108,8 +110,11 @@ loop:
108 cur_trans->delayed_refs.num_heads = 0; 110 cur_trans->delayed_refs.num_heads = 0;
109 cur_trans->delayed_refs.flushing = 0; 111 cur_trans->delayed_refs.flushing = 0;
110 cur_trans->delayed_refs.run_delayed_start = 0; 112 cur_trans->delayed_refs.run_delayed_start = 0;
113 cur_trans->delayed_refs.seq = 1;
114 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
111 spin_lock_init(&cur_trans->commit_lock); 115 spin_lock_init(&cur_trans->commit_lock);
112 spin_lock_init(&cur_trans->delayed_refs.lock); 116 spin_lock_init(&cur_trans->delayed_refs.lock);
117 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
113 118
114 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 119 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
115 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 120 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
@@ -321,6 +326,8 @@ again:
321 } 326 }
322 327
323 if (num_bytes) { 328 if (num_bytes) {
329 trace_btrfs_space_reservation(root->fs_info, "transaction",
330 (u64)h, num_bytes, 1);
324 h->block_rsv = &root->fs_info->trans_block_rsv; 331 h->block_rsv = &root->fs_info->trans_block_rsv;
325 h->bytes_reserved = num_bytes; 332 h->bytes_reserved = num_bytes;
326 } 333 }
@@ -467,19 +474,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
467 474
468 btrfs_trans_release_metadata(trans, root); 475 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL; 476 trans->block_rsv = NULL;
470 while (count < 4) { 477 while (count < 2) {
471 unsigned long cur = trans->delayed_ref_updates; 478 unsigned long cur = trans->delayed_ref_updates;
472 trans->delayed_ref_updates = 0; 479 trans->delayed_ref_updates = 0;
473 if (cur && 480 if (cur &&
474 trans->transaction->delayed_refs.num_heads_ready > 64) { 481 trans->transaction->delayed_refs.num_heads_ready > 64) {
475 trans->delayed_ref_updates = 0; 482 trans->delayed_ref_updates = 0;
476
477 /*
478 * do a full flush if the transaction is trying
479 * to close
480 */
481 if (trans->transaction->delayed_refs.flushing)
482 cur = 0;
483 btrfs_run_delayed_refs(trans, root, cur); 483 btrfs_run_delayed_refs(trans, root, cur);
484 } else { 484 } else {
485 break; 485 break;
@@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1393 1393
1394 if (btrfs_header_backref_rev(root->node) < 1394 if (btrfs_header_backref_rev(root->node) <
1395 BTRFS_MIXED_BACKREF_REV) 1395 BTRFS_MIXED_BACKREF_REV)
1396 btrfs_drop_snapshot(root, NULL, 0); 1396 btrfs_drop_snapshot(root, NULL, 0, 0);
1397 else 1397 else
1398 btrfs_drop_snapshot(root, NULL, 1); 1398 btrfs_drop_snapshot(root, NULL, 1, 0);
1399 } 1399 }
1400 return 0; 1400 return 0;
1401} 1401}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3568374d419d..cb877e0886a7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
589 ret = btrfs_inc_extent_ref(trans, root, 589 ret = btrfs_inc_extent_ref(trans, root,
590 ins.objectid, ins.offset, 590 ins.objectid, ins.offset,
591 0, root->root_key.objectid, 591 0, root->root_key.objectid,
592 key->objectid, offset); 592 key->objectid, offset, 0);
593 BUG_ON(ret); 593 BUG_ON(ret);
594 } else { 594 } else {
595 /* 595 /*
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644
index 000000000000..12f5147bd2b1
--- /dev/null
+++ b/fs/btrfs/ulist.c
@@ -0,0 +1,220 @@
1/*
2 * Copyright (C) 2011 STRATO AG
3 * written by Arne Jansen <sensille@gmx.net>
4 * Distributed under the GNU GPL license version 2.
5 */
6
7#include <linux/slab.h>
8#include <linux/module.h>
9#include "ulist.h"
10
11/*
12 * ulist is a generic data structure to hold a collection of unique u64
 13 * values. The only operations it supports are adding to the list and
14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key.
16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 *
21 * A sample usage for ulists is the enumeration of directed graphs without
22 * visiting a node twice. The pseudo-code could look like this:
23 *
24 * ulist = ulist_alloc();
25 * ulist_add(ulist, root);
26 * elem = NULL;
27 *
 28 * while ((elem = ulist_next(ulist, elem))) {
29 * for (all child nodes n in elem)
30 * ulist_add(ulist, n);
31 * do something useful with the node;
32 * }
33 * ulist_free(ulist);
34 *
 35 * This assumes the graph nodes are addressable by u64. This stems from the
36 * usage for tree enumeration in btrfs, where the logical addresses are
37 * 64 bit.
38 *
 39 * It is also useful for tree enumeration, which could be done elegantly
40 * recursively, but is not possible due to kernel stack limitations. The
41 * loop would be similar to the above.
42 */
43
44/**
45 * ulist_init - freshly initialize a ulist
46 * @ulist: the ulist to initialize
47 *
 48 * Note: don't use this function to init an already used ulist; use
49 * ulist_reinit instead.
50 */
51void ulist_init(struct ulist *ulist)
52{
53 ulist->nnodes = 0;
54 ulist->nodes = ulist->int_nodes;
55 ulist->nodes_alloced = ULIST_SIZE;
56}
57EXPORT_SYMBOL(ulist_init);
58
59/**
60 * ulist_fini - free up additionally allocated memory for the ulist
61 * @ulist: the ulist from which to free the additional memory
62 *
63 * This is useful in cases where the base 'struct ulist' has been statically
64 * allocated.
65 */
66void ulist_fini(struct ulist *ulist)
67{
68 /*
69 * The first ULIST_SIZE elements are stored inline in struct ulist.
 70 * They need to be freed only if more elements were allocated.
71 */
72 if (ulist->nodes_alloced > ULIST_SIZE)
73 kfree(ulist->nodes);
74 ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */
75}
76EXPORT_SYMBOL(ulist_fini);
77
78/**
79 * ulist_reinit - prepare a ulist for reuse
80 * @ulist: ulist to be reused
81 *
82 * Free up all additional memory allocated for the list elements and reinit
83 * the ulist.
84 */
85void ulist_reinit(struct ulist *ulist)
86{
87 ulist_fini(ulist);
88 ulist_init(ulist);
89}
90EXPORT_SYMBOL(ulist_reinit);
91
92/**
93 * ulist_alloc - dynamically allocate a ulist
 94 * @gfp_mask: allocation flags to use for the base allocation
95 *
96 * The allocated ulist will be returned in an initialized state.
97 */
98struct ulist *ulist_alloc(unsigned long gfp_mask)
99{
100 struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
101
102 if (!ulist)
103 return NULL;
104
105 ulist_init(ulist);
106
107 return ulist;
108}
109EXPORT_SYMBOL(ulist_alloc);
110
111/**
112 * ulist_free - free dynamically allocated ulist
113 * @ulist: ulist to free
114 *
 115 * It is not necessary to call ulist_fini beforehand.
116 */
117void ulist_free(struct ulist *ulist)
118{
119 if (!ulist)
120 return;
121 ulist_fini(ulist);
122 kfree(ulist);
123}
124EXPORT_SYMBOL(ulist_free);
125
126/**
127 * ulist_add - add an element to the ulist
128 * @ulist: ulist to add the element to
129 * @val: value to add to ulist
130 * @aux: auxiliary value to store along with val
131 * @gfp_mask: flags to use for allocation
132 *
 133 * Note: locking must be provided by the caller. In case of rwlocks, write
 134 * locking is needed.
135 *
136 * Add an element to a ulist. The @val will only be added if it doesn't
137 * already exist. If it is added, the auxiliary value @aux is stored along with
138 * it. In case @val already exists in the ulist, @aux is ignored, even if
139 * it differs from the already stored value.
140 *
141 * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
142 * inserted.
143 * In case of allocation failure -ENOMEM is returned and the ulist stays
144 * unaltered.
145 */
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
147 unsigned long gfp_mask)
148{
149 int i;
150
151 for (i = 0; i < ulist->nnodes; ++i) {
152 if (ulist->nodes[i].val == val)
153 return 0;
154 }
155
156 if (ulist->nnodes >= ulist->nodes_alloced) {
157 u64 new_alloced = ulist->nodes_alloced + 128;
158 struct ulist_node *new_nodes;
159 void *old = NULL;
160
161 /*
162 * if nodes_alloced == ULIST_SIZE no memory has been allocated
163 * yet, so pass NULL to krealloc
164 */
165 if (ulist->nodes_alloced > ULIST_SIZE)
166 old = ulist->nodes;
167
168 new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced,
169 gfp_mask);
170 if (!new_nodes)
171 return -ENOMEM;
172
173 if (!old)
174 memcpy(new_nodes, ulist->int_nodes,
175 sizeof(ulist->int_nodes));
176
177 ulist->nodes = new_nodes;
178 ulist->nodes_alloced = new_alloced;
179 }
180 ulist->nodes[ulist->nnodes].val = val;
181 ulist->nodes[ulist->nnodes].aux = aux;
182 ++ulist->nnodes;
183
184 return 1;
185}
186EXPORT_SYMBOL(ulist_add);
187
188/**
189 * ulist_next - iterate ulist
190 * @ulist: ulist to iterate
191 * @prev: previously returned element or %NULL to start iteration
192 *
 193 * Note: locking must be provided by the caller. In case of rwlocks, only read
 194 * locking is needed.
195 *
 196 * This function is used to iterate a ulist. The iteration is started with
197 * @prev = %NULL. It returns the next element from the ulist or %NULL when the
198 * end is reached. No guarantee is made with respect to the order in which
 199 * the elements are returned. They might be returned neither in order
 200 * of addition nor in ascending order.
201 * It is allowed to call ulist_add during an enumeration. Newly added items
202 * are guaranteed to show up in the running enumeration.
203 */
204struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
205{
206 int next;
207
208 if (ulist->nnodes == 0)
209 return NULL;
210
211 if (!prev)
212 return &ulist->nodes[0];
213
214 next = (prev - ulist->nodes) + 1;
215 if (next < 0 || next >= ulist->nnodes)
216 return NULL;
217
218 return &ulist->nodes[next];
219}
220EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644
index 000000000000..2e25dec58ec0
--- /dev/null
+++ b/fs/btrfs/ulist.h
@@ -0,0 +1,68 @@
1/*
2 * Copyright (C) 2011 STRATO AG
3 * written by Arne Jansen <sensille@gmx.net>
4 * Distributed under the GNU GPL license version 2.
5 *
6 */
7
8#ifndef __ULIST__
9#define __ULIST__
10
11/*
12 * ulist is a generic data structure to hold a collection of unique u64
 13 * values. The only operations it supports are adding to the list and
14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key.
16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 */
21
22/*
23 * number of elements statically allocated inside struct ulist
24 */
25#define ULIST_SIZE 16
26
27/*
28 * element of the list
29 */
30struct ulist_node {
31 u64 val; /* value to store */
32 unsigned long aux; /* auxiliary value saved along with the val */
33};
34
35struct ulist {
36 /*
37 * number of elements stored in list
38 */
39 unsigned long nnodes;
40
41 /*
42 * number of nodes we already have room for
43 */
44 unsigned long nodes_alloced;
45
46 /*
47 * pointer to the array storing the elements. The first ULIST_SIZE
 48 * elements are stored inline. In this case it points to int_nodes.
49 * After exceeding ULIST_SIZE, dynamic memory is allocated.
50 */
51 struct ulist_node *nodes;
52
53 /*
54 * inline storage space for the first ULIST_SIZE entries
55 */
56 struct ulist_node int_nodes[ULIST_SIZE];
57};
58
59void ulist_init(struct ulist *ulist);
60void ulist_fini(struct ulist *ulist);
61void ulist_reinit(struct ulist *ulist);
62struct ulist *ulist_alloc(unsigned long gfp_mask);
63void ulist_free(struct ulist *ulist);
64int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
65 unsigned long gfp_mask);
66struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
67
68#endif
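
Since both new files are shown in full, the ulist API can be tried outside the kernel. Below is a self-contained userspace analog trimmed to the inline-array case (no krealloc growth path), mirroring the documented ulist_add/ulist_next semantics: duplicate values are ignored, the aux of a duplicate is discarded, and iteration starts from NULL.

/* Userspace analog of the ulist API above, inline storage only. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define ULIST_SIZE 16

struct ulist_node { uint64_t val; unsigned long aux; };

struct ulist {
	unsigned long nnodes;
	struct ulist_node nodes[ULIST_SIZE];
};

static int ulist_add(struct ulist *ul, uint64_t val, unsigned long aux)
{
	unsigned long i;

	for (i = 0; i < ul->nnodes; i++)
		if (ul->nodes[i].val == val)
			return 0;	/* already present, aux ignored */
	if (ul->nnodes >= ULIST_SIZE)
		return -1;		/* kernel version grows via krealloc */
	ul->nodes[ul->nnodes].val = val;
	ul->nodes[ul->nnodes].aux = aux;
	ul->nnodes++;
	return 1;
}

static struct ulist_node *ulist_next(struct ulist *ul, struct ulist_node *prev)
{
	size_t next;

	if (ul->nnodes == 0)
		return NULL;
	if (!prev)
		return &ul->nodes[0];
	next = (prev - ul->nodes) + 1;
	return next < ul->nnodes ? &ul->nodes[next] : NULL;
}

int main(void)
{
	struct ulist ul = { 0 };
	struct ulist_node *n = NULL;

	ulist_add(&ul, 100, 0);
	ulist_add(&ul, 200, 7);
	ulist_add(&ul, 100, 9);	/* duplicate: returns 0, aux 9 is dropped */

	while ((n = ulist_next(&ul, n)))
		printf("val=%llu aux=%lu\n", (unsigned long long)n->val, n->aux);
	return 0;
}
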
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4b839fd3c9d..0b4e2af7954d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/kthread.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27#include "compat.h" 28#include "compat.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -32,6 +33,7 @@
32#include "print-tree.h" 33#include "print-tree.h"
33#include "volumes.h" 34#include "volumes.h"
34#include "async-thread.h" 35#include "async-thread.h"
36#include "check-integrity.h"
35 37
36static int init_first_rw_device(struct btrfs_trans_handle *trans, 38static int init_first_rw_device(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 39 struct btrfs_root *root,
@@ -246,7 +248,7 @@ loop_lock:
246 sync_pending = 0; 248 sync_pending = 0;
247 } 249 }
248 250
249 submit_bio(cur->bi_rw, cur); 251 btrfsic_submit_bio(cur->bi_rw, cur);
250 num_run++; 252 num_run++;
251 batch_run++; 253 batch_run++;
252 if (need_resched()) 254 if (need_resched())
@@ -706,8 +708,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
706 u64 devid; 708 u64 devid;
707 u64 transid; 709 u64 transid;
708 710
709 mutex_lock(&uuid_mutex);
710
711 flags |= FMODE_EXCL; 711 flags |= FMODE_EXCL;
712 bdev = blkdev_get_by_path(path, flags, holder); 712 bdev = blkdev_get_by_path(path, flags, holder);
713 713
@@ -716,6 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
716 goto error; 716 goto error;
717 } 717 }
718 718
719 mutex_lock(&uuid_mutex);
719 ret = set_blocksize(bdev, 4096); 720 ret = set_blocksize(bdev, 4096);
720 if (ret) 721 if (ret)
721 goto error_close; 722 goto error_close;
@@ -737,9 +738,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
737 738
738 brelse(bh); 739 brelse(bh);
739error_close: 740error_close:
741 mutex_unlock(&uuid_mutex);
740 blkdev_put(bdev, flags); 742 blkdev_put(bdev, flags);
741error: 743error:
742 mutex_unlock(&uuid_mutex);
743 return ret; 744 return ret;
744} 745}
745 746
@@ -829,7 +830,6 @@ out:
829 830
830/* 831/*
831 * find_free_dev_extent - find free space in the specified device 832 * find_free_dev_extent - find free space in the specified device
832 * @trans: transaction handler
833 * @device: the device which we search the free space in 833 * @device: the device which we search the free space in
834 * @num_bytes: the size of the free space that we need 834 * @num_bytes: the size of the free space that we need
835 * @start: store the start of the free space. 835 * @start: store the start of the free space.
@@ -848,8 +848,7 @@ out:
848 * But if we don't find suitable free space, it is used to store the size of 848 * But if we don't find suitable free space, it is used to store the size of
849 * the max free space. 849 * the max free space.
850 */ 850 */
851int find_free_dev_extent(struct btrfs_trans_handle *trans, 851int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
852 struct btrfs_device *device, u64 num_bytes,
853 u64 *start, u64 *len) 852 u64 *start, u64 *len)
854{ 853{
855 struct btrfs_key key; 854 struct btrfs_key key;
@@ -893,7 +892,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
893 key.offset = search_start; 892 key.offset = search_start;
894 key.type = BTRFS_DEV_EXTENT_KEY; 893 key.type = BTRFS_DEV_EXTENT_KEY;
895 894
896 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 895 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
897 if (ret < 0) 896 if (ret < 0)
898 goto out; 897 goto out;
899 if (ret > 0) { 898 if (ret > 0) {
@@ -1282,7 +1281,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1282 bool clear_super = false; 1281 bool clear_super = false;
1283 1282
1284 mutex_lock(&uuid_mutex); 1283 mutex_lock(&uuid_mutex);
1285 mutex_lock(&root->fs_info->volume_mutex);
1286 1284
1287 all_avail = root->fs_info->avail_data_alloc_bits | 1285 all_avail = root->fs_info->avail_data_alloc_bits |
1288 root->fs_info->avail_system_alloc_bits | 1286 root->fs_info->avail_system_alloc_bits |
@@ -1452,7 +1450,6 @@ error_close:
1452 if (bdev) 1450 if (bdev)
1453 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1451 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1454out: 1452out:
1455 mutex_unlock(&root->fs_info->volume_mutex);
1456 mutex_unlock(&uuid_mutex); 1453 mutex_unlock(&uuid_mutex);
1457 return ret; 1454 return ret;
1458error_undo: 1455error_undo:
@@ -1469,8 +1466,7 @@ error_undo:
1469/* 1466/*
1470 * does all the dirty work required for changing file system's UUID. 1467 * does all the dirty work required for changing file system's UUID.
1471 */ 1468 */
1472static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, 1469static int btrfs_prepare_sprout(struct btrfs_root *root)
1473 struct btrfs_root *root)
1474{ 1470{
1475 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1471 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1476 struct btrfs_fs_devices *old_devices; 1472 struct btrfs_fs_devices *old_devices;
@@ -1629,7 +1625,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1629 } 1625 }
1630 1626
1631 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1627 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1632 mutex_lock(&root->fs_info->volume_mutex);
1633 1628
1634 devices = &root->fs_info->fs_devices->devices; 1629 devices = &root->fs_info->fs_devices->devices;
1635 /* 1630 /*
@@ -1695,7 +1690,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1695 1690
1696 if (seeding_dev) { 1691 if (seeding_dev) {
1697 sb->s_flags &= ~MS_RDONLY; 1692 sb->s_flags &= ~MS_RDONLY;
1698 ret = btrfs_prepare_sprout(trans, root); 1693 ret = btrfs_prepare_sprout(root);
1699 BUG_ON(ret); 1694 BUG_ON(ret);
1700 } 1695 }
1701 1696
@@ -1757,8 +1752,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1757 ret = btrfs_relocate_sys_chunks(root); 1752 ret = btrfs_relocate_sys_chunks(root);
1758 BUG_ON(ret); 1753 BUG_ON(ret);
1759 } 1754 }
1760out: 1755
1761 mutex_unlock(&root->fs_info->volume_mutex);
1762 return ret; 1756 return ret;
1763error: 1757error:
1764 blkdev_put(bdev, FMODE_EXCL); 1758 blkdev_put(bdev, FMODE_EXCL);
@@ -1766,7 +1760,7 @@ error:
1766 mutex_unlock(&uuid_mutex); 1760 mutex_unlock(&uuid_mutex);
1767 up_write(&sb->s_umount); 1761 up_write(&sb->s_umount);
1768 } 1762 }
1769 goto out; 1763 return ret;
1770} 1764}
1771 1765
1772static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1766static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2077,6 +2071,362 @@ error:
2077 return ret; 2071 return ret;
2078} 2072}
2079 2073
2074static int insert_balance_item(struct btrfs_root *root,
2075 struct btrfs_balance_control *bctl)
2076{
2077 struct btrfs_trans_handle *trans;
2078 struct btrfs_balance_item *item;
2079 struct btrfs_disk_balance_args disk_bargs;
2080 struct btrfs_path *path;
2081 struct extent_buffer *leaf;
2082 struct btrfs_key key;
2083 int ret, err;
2084
2085 path = btrfs_alloc_path();
2086 if (!path)
2087 return -ENOMEM;
2088
2089 trans = btrfs_start_transaction(root, 0);
2090 if (IS_ERR(trans)) {
2091 btrfs_free_path(path);
2092 return PTR_ERR(trans);
2093 }
2094
2095 key.objectid = BTRFS_BALANCE_OBJECTID;
2096 key.type = BTRFS_BALANCE_ITEM_KEY;
2097 key.offset = 0;
2098
2099 ret = btrfs_insert_empty_item(trans, root, path, &key,
2100 sizeof(*item));
2101 if (ret)
2102 goto out;
2103
2104 leaf = path->nodes[0];
2105 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2106
2107 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2108
2109 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2110 btrfs_set_balance_data(leaf, item, &disk_bargs);
2111 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2112 btrfs_set_balance_meta(leaf, item, &disk_bargs);
2113 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2114 btrfs_set_balance_sys(leaf, item, &disk_bargs);
2115
2116 btrfs_set_balance_flags(leaf, item, bctl->flags);
2117
2118 btrfs_mark_buffer_dirty(leaf);
2119out:
2120 btrfs_free_path(path);
2121 err = btrfs_commit_transaction(trans, root);
2122 if (err && !ret)
2123 ret = err;
2124 return ret;
2125}
2126
2127static int del_balance_item(struct btrfs_root *root)
2128{
2129 struct btrfs_trans_handle *trans;
2130 struct btrfs_path *path;
2131 struct btrfs_key key;
2132 int ret, err;
2133
2134 path = btrfs_alloc_path();
2135 if (!path)
2136 return -ENOMEM;
2137
2138 trans = btrfs_start_transaction(root, 0);
2139 if (IS_ERR(trans)) {
2140 btrfs_free_path(path);
2141 return PTR_ERR(trans);
2142 }
2143
2144 key.objectid = BTRFS_BALANCE_OBJECTID;
2145 key.type = BTRFS_BALANCE_ITEM_KEY;
2146 key.offset = 0;
2147
2148 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2149 if (ret < 0)
2150 goto out;
2151 if (ret > 0) {
2152 ret = -ENOENT;
2153 goto out;
2154 }
2155
2156 ret = btrfs_del_item(trans, root, path);
2157out:
2158 btrfs_free_path(path);
2159 err = btrfs_commit_transaction(trans, root);
2160 if (err && !ret)
2161 ret = err;
2162 return ret;
2163}
2164
2165/*
2166 * This is a heuristic used to reduce the number of chunks balanced on
2167 * resume after balance was interrupted.
2168 */
2169static void update_balance_args(struct btrfs_balance_control *bctl)
2170{
2171 /*
2172 * Turn on soft mode for chunk types that were being converted.
2173 */
2174 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2175 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2176 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2177 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2178 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2179 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2180
2181 /*
 2182 * Turn on usage filter if it is not already in use. The idea is
2183 * that chunks that we have already balanced should be
2184 * reasonably full. Don't do it for chunks that are being
2185 * converted - that will keep us from relocating unconverted
2186 * (albeit full) chunks.
2187 */
2188 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2189 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2190 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2191 bctl->data.usage = 90;
2192 }
2193 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2194 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2195 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2196 bctl->sys.usage = 90;
2197 }
2198 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2199 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2200 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2201 bctl->meta.usage = 90;
2202 }
2203}
2204
2205/*
2206 * Should be called with both balance and volume mutexes held to
2207 * serialize other volume operations (add_dev/rm_dev/resize) with
2208 * restriper. Same goes for unset_balance_control.
2209 */
2210static void set_balance_control(struct btrfs_balance_control *bctl)
2211{
2212 struct btrfs_fs_info *fs_info = bctl->fs_info;
2213
2214 BUG_ON(fs_info->balance_ctl);
2215
2216 spin_lock(&fs_info->balance_lock);
2217 fs_info->balance_ctl = bctl;
2218 spin_unlock(&fs_info->balance_lock);
2219}
2220
2221static void unset_balance_control(struct btrfs_fs_info *fs_info)
2222{
2223 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2224
2225 BUG_ON(!fs_info->balance_ctl);
2226
2227 spin_lock(&fs_info->balance_lock);
2228 fs_info->balance_ctl = NULL;
2229 spin_unlock(&fs_info->balance_lock);
2230
2231 kfree(bctl);
2232}
2233
2234/*
2235 * Balance filters. Return 1 if chunk should be filtered out
2236 * (should not be balanced).
2237 */
2238static int chunk_profiles_filter(u64 chunk_profile,
2239 struct btrfs_balance_args *bargs)
2240{
2241 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2242
2243 if (chunk_profile == 0)
2244 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2245
2246 if (bargs->profiles & chunk_profile)
2247 return 0;
2248
2249 return 1;
2250}
2251
2252static u64 div_factor_fine(u64 num, int factor)
2253{
2254 if (factor <= 0)
2255 return 0;
2256 if (factor >= 100)
2257 return num;
2258
2259 num *= factor;
2260 do_div(num, 100);
2261 return num;
2262}
2263
2264static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2265 struct btrfs_balance_args *bargs)
2266{
2267 struct btrfs_block_group_cache *cache;
2268 u64 chunk_used, user_thresh;
2269 int ret = 1;
2270
2271 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2272 chunk_used = btrfs_block_group_used(&cache->item);
2273
2274 user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2275 if (chunk_used < user_thresh)
2276 ret = 0;
2277
2278 btrfs_put_block_group(cache);
2279 return ret;
2280}
2281
2282static int chunk_devid_filter(struct extent_buffer *leaf,
2283 struct btrfs_chunk *chunk,
2284 struct btrfs_balance_args *bargs)
2285{
2286 struct btrfs_stripe *stripe;
2287 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2288 int i;
2289
2290 for (i = 0; i < num_stripes; i++) {
2291 stripe = btrfs_stripe_nr(chunk, i);
2292 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2293 return 0;
2294 }
2295
2296 return 1;
2297}
2298
2299/* [pstart, pend) */
2300static int chunk_drange_filter(struct extent_buffer *leaf,
2301 struct btrfs_chunk *chunk,
2302 u64 chunk_offset,
2303 struct btrfs_balance_args *bargs)
2304{
2305 struct btrfs_stripe *stripe;
2306 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2307 u64 stripe_offset;
2308 u64 stripe_length;
2309 int factor;
2310 int i;
2311
2312 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2313 return 0;
2314
2315 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2316 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2317 factor = 2;
2318 else
2319 factor = 1;
2320 factor = num_stripes / factor;
2321
2322 for (i = 0; i < num_stripes; i++) {
2323 stripe = btrfs_stripe_nr(chunk, i);
2324 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2325 continue;
2326
2327 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2328 stripe_length = btrfs_chunk_length(leaf, chunk);
2329 do_div(stripe_length, factor);
2330
2331 if (stripe_offset < bargs->pend &&
2332 stripe_offset + stripe_length > bargs->pstart)
2333 return 0;
2334 }
2335
2336 return 1;
2337}
2338
2339/* [vstart, vend) */
2340static int chunk_vrange_filter(struct extent_buffer *leaf,
2341 struct btrfs_chunk *chunk,
2342 u64 chunk_offset,
2343 struct btrfs_balance_args *bargs)
2344{
2345 if (chunk_offset < bargs->vend &&
2346 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2347 /* at least part of the chunk is inside this vrange */
2348 return 0;
2349
2350 return 1;
2351}
2352
2353static int chunk_soft_convert_filter(u64 chunk_profile,
2354 struct btrfs_balance_args *bargs)
2355{
2356 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2357 return 0;
2358
2359 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2360
2361 if (chunk_profile == 0)
2362 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2363
2364 if (bargs->target & chunk_profile)
2365 return 1;
2366
2367 return 0;
2368}
2369
2370static int should_balance_chunk(struct btrfs_root *root,
2371 struct extent_buffer *leaf,
2372 struct btrfs_chunk *chunk, u64 chunk_offset)
2373{
2374 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2375 struct btrfs_balance_args *bargs = NULL;
2376 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2377
2378 /* type filter */
2379 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2380 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2381 return 0;
2382 }
2383
2384 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2385 bargs = &bctl->data;
2386 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2387 bargs = &bctl->sys;
2388 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2389 bargs = &bctl->meta;
2390
2391 /* profiles filter */
2392 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2393 chunk_profiles_filter(chunk_type, bargs)) {
2394 return 0;
2395 }
2396
2397 /* usage filter */
2398 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2399 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2400 return 0;
2401 }
2402
2403 /* devid filter */
2404 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2405 chunk_devid_filter(leaf, chunk, bargs)) {
2406 return 0;
2407 }
2408
2409 /* drange filter, makes sense only with devid filter */
2410 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2411 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2412 return 0;
2413 }
2414
2415 /* vrange filter */
2416 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2417 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2418 return 0;
2419 }
2420
2421 /* soft profile changing mode */
2422 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2423 chunk_soft_convert_filter(chunk_type, bargs)) {
2424 return 0;
2425 }
2426
2427 return 1;
2428}
2429
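Every helper returns 1 to veto, so should_balance_chunk() is a short-circuiting AND over whichever filters were enabled: a chunk is relocated only when no enabled filter objects. The same control flow condensed into a compilable toy with stub filters:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define ARGS_PROFILES	(1ULL << 0)	/* values mirror volumes.h */
	#define ARGS_USAGE	(1ULL << 1)

	static bool profiles_veto(void) { return false; }	/* stub */
	static bool usage_veto(void)    { return true;  }	/* stub */

	struct filter {
		uint64_t flag;
		bool (*veto)(void);
	};

	static bool should_balance(uint64_t enabled)
	{
		static const struct filter filters[] = {
			{ ARGS_PROFILES, profiles_veto },
			{ ARGS_USAGE,    usage_veto },
		};
		unsigned int i;

		for (i = 0; i < sizeof(filters) / sizeof(filters[0]); i++)
			if ((enabled & filters[i].flag) && filters[i].veto())
				return false;	/* any enabled filter can veto */
		return true;
	}

	int main(void)
	{
		printf("%d\n", should_balance(ARGS_PROFILES));			/* 1 */
		printf("%d\n", should_balance(ARGS_PROFILES | ARGS_USAGE));	/* 0 */
		return 0;
	}
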
2080static u64 div_factor(u64 num, int factor) 2430static u64 div_factor(u64 num, int factor)
2081{ 2431{
2082 if (factor == 10) 2432 if (factor == 10)
@@ -2086,29 +2436,28 @@ static u64 div_factor(u64 num, int factor)
2086 return num; 2436 return num;
2087} 2437}
2088 2438
2089int btrfs_balance(struct btrfs_root *dev_root) 2439static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2090{ 2440{
2091 int ret; 2441 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2092 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 2442 struct btrfs_root *chunk_root = fs_info->chunk_root;
2443 struct btrfs_root *dev_root = fs_info->dev_root;
2444 struct list_head *devices;
2093 struct btrfs_device *device; 2445 struct btrfs_device *device;
2094 u64 old_size; 2446 u64 old_size;
2095 u64 size_to_free; 2447 u64 size_to_free;
2448 struct btrfs_chunk *chunk;
2096 struct btrfs_path *path; 2449 struct btrfs_path *path;
2097 struct btrfs_key key; 2450 struct btrfs_key key;
2098 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
2099 struct btrfs_trans_handle *trans;
2100 struct btrfs_key found_key; 2451 struct btrfs_key found_key;
2101 2452 struct btrfs_trans_handle *trans;
2102 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2453 struct extent_buffer *leaf;
2103 return -EROFS; 2454 int slot;
2104 2455 int ret;
2105 if (!capable(CAP_SYS_ADMIN)) 2456 int enospc_errors = 0;
2106 return -EPERM; 2457 bool counting = true;
2107
2108 mutex_lock(&dev_root->fs_info->volume_mutex);
2109 dev_root = dev_root->fs_info->dev_root;
2110 2458
2111 /* step one, make some room on all the devices */ 2459
2460 devices = &fs_info->fs_devices->devices;
2112 list_for_each_entry(device, devices, dev_list) { 2461 list_for_each_entry(device, devices, dev_list) {
2113 old_size = device->total_bytes; 2462 old_size = device->total_bytes;
2114 size_to_free = div_factor(old_size, 1); 2463 size_to_free = div_factor(old_size, 1);
@@ -2137,11 +2486,23 @@ int btrfs_balance(struct btrfs_root *dev_root)
2137 ret = -ENOMEM; 2486 ret = -ENOMEM;
2138 goto error; 2487 goto error;
2139 } 2488 }
2489
2490 /* zero out stat counters */
2491 spin_lock(&fs_info->balance_lock);
2492 memset(&bctl->stat, 0, sizeof(bctl->stat));
2493 spin_unlock(&fs_info->balance_lock);
2494again:
2140 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2495 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2141 key.offset = (u64)-1; 2496 key.offset = (u64)-1;
2142 key.type = BTRFS_CHUNK_ITEM_KEY; 2497 key.type = BTRFS_CHUNK_ITEM_KEY;
2143 2498
2144 while (1) { 2499 while (1) {
2500 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2501 atomic_read(&fs_info->balance_cancel_req)) {
2502 ret = -ECANCELED;
2503 goto error;
2504 }
2505
2145 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2506 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2146 if (ret < 0) 2507 if (ret < 0)
2147 goto error; 2508 goto error;
@@ -2151,15 +2512,19 @@ int btrfs_balance(struct btrfs_root *dev_root)
2151 * failed 2512 * failed
2152 */ 2513 */
2153 if (ret == 0) 2514 if (ret == 0)
2154 break; 2515 BUG(); /* FIXME break ? */
2155 2516
2156 ret = btrfs_previous_item(chunk_root, path, 0, 2517 ret = btrfs_previous_item(chunk_root, path, 0,
2157 BTRFS_CHUNK_ITEM_KEY); 2518 BTRFS_CHUNK_ITEM_KEY);
2158 if (ret) 2519 if (ret) {
2520 ret = 0;
2159 break; 2521 break;
2522 }
2523
2524 leaf = path->nodes[0];
2525 slot = path->slots[0];
2526 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2160 2527
2161 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2162 path->slots[0]);
2163 if (found_key.objectid != key.objectid) 2528 if (found_key.objectid != key.objectid)
2164 break; 2529 break;
2165 2530
@@ -2167,22 +2532,375 @@ int btrfs_balance(struct btrfs_root *dev_root)
2167 if (found_key.offset == 0) 2532 if (found_key.offset == 0)
2168 break; 2533 break;
2169 2534
2535 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2536
2537 if (!counting) {
2538 spin_lock(&fs_info->balance_lock);
2539 bctl->stat.considered++;
2540 spin_unlock(&fs_info->balance_lock);
2541 }
2542
2543 ret = should_balance_chunk(chunk_root, leaf, chunk,
2544 found_key.offset);
2170 btrfs_release_path(path); 2545 btrfs_release_path(path);
2546 if (!ret)
2547 goto loop;
2548
2549 if (counting) {
2550 spin_lock(&fs_info->balance_lock);
2551 bctl->stat.expected++;
2552 spin_unlock(&fs_info->balance_lock);
2553 goto loop;
2554 }
2555
2171 ret = btrfs_relocate_chunk(chunk_root, 2556 ret = btrfs_relocate_chunk(chunk_root,
2172 chunk_root->root_key.objectid, 2557 chunk_root->root_key.objectid,
2173 found_key.objectid, 2558 found_key.objectid,
2174 found_key.offset); 2559 found_key.offset);
2175 if (ret && ret != -ENOSPC) 2560 if (ret && ret != -ENOSPC)
2176 goto error; 2561 goto error;
2562 if (ret == -ENOSPC) {
2563 enospc_errors++;
2564 } else {
2565 spin_lock(&fs_info->balance_lock);
2566 bctl->stat.completed++;
2567 spin_unlock(&fs_info->balance_lock);
2568 }
2569loop:
2177 key.offset = found_key.offset - 1; 2570 key.offset = found_key.offset - 1;
2178 } 2571 }
2179 ret = 0; 2572
2573 if (counting) {
2574 btrfs_release_path(path);
2575 counting = false;
2576 goto again;
2577 }
2180error: 2578error:
2181 btrfs_free_path(path); 2579 btrfs_free_path(path);
2182 mutex_unlock(&dev_root->fs_info->volume_mutex); 2580 if (enospc_errors) {
2581 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2582 enospc_errors);
2583 if (!ret)
2584 ret = -ENOSPC;
2585 }
2586
2183 return ret; 2587 return ret;
2184} 2588}
2185 2589
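The counting flag turns __btrfs_balance() into a two-pass walk over the chunk tree: pass one only tallies bctl->stat.expected so progress can be reported, pass two relocates. Note the interruption policy in the (!counting && pause_req) || cancel_req test above: pause is honored only during the relocation pass, cancel during either. The loop shape as a runnable toy, with hypothetical stand-ins for the tree walk:

	#include <stdbool.h>
	#include <stdio.h>

	static bool cancel_requested(void)    { return false; }	/* stand-in */
	static bool should_balance(int chunk) { return chunk % 2 == 0; }

	int main(void)
	{
		int chunk, expected = 0, completed = 0;
		bool counting = true;	/* pass 1 counts, pass 2 relocates */
	again:
		for (chunk = 9; chunk >= 0; chunk--) {	/* keys walked high to low */
			if (cancel_requested())
				return 1;	/* pause would also test !counting */
			if (!should_balance(chunk))
				continue;
			if (counting)
				expected++;	/* pass 1: statistics only */
			else
				completed++;	/* pass 2: btrfs_relocate_chunk() */
		}
		if (counting) {
			counting = false;
			goto again;
		}
		printf("expected=%d completed=%d\n", expected, completed);
		return 0;
	}
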
2590static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2591{
2592 /* cancel requested || normal exit path */
2593 return atomic_read(&fs_info->balance_cancel_req) ||
2594 (atomic_read(&fs_info->balance_pause_req) == 0 &&
2595 atomic_read(&fs_info->balance_cancel_req) == 0);
2596}
2597
2598static void __cancel_balance(struct btrfs_fs_info *fs_info)
2599{
2600 int ret;
2601
2602 unset_balance_control(fs_info);
2603 ret = del_balance_item(fs_info->tree_root);
2604 BUG_ON(ret);
2605}
2606
2607void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2608 struct btrfs_ioctl_balance_args *bargs);
2609
2610/*
2611 * Should be called with both balance and volume mutexes held
2612 */
2613int btrfs_balance(struct btrfs_balance_control *bctl,
2614 struct btrfs_ioctl_balance_args *bargs)
2615{
2616 struct btrfs_fs_info *fs_info = bctl->fs_info;
2617 u64 allowed;
2618 int ret;
2619
2620 if (btrfs_fs_closing(fs_info) ||
2621 atomic_read(&fs_info->balance_pause_req) ||
2622 atomic_read(&fs_info->balance_cancel_req)) {
2623 ret = -EINVAL;
2624 goto out;
2625 }
2626
2627 /*
2628 * In case of mixed groups, both data and meta should be picked,
2629 * and identical options should be given for both of them.
2630 */
2631 allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2632 if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2633 (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
2634 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2635 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2636 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2637 printk(KERN_ERR "btrfs: with mixed groups data and "
2638 "metadata balance options must be the same\n");
2639 ret = -EINVAL;
2640 goto out;
2641 }
2642 }
2643
2644 /*
2645 * Profile changing sanity checks. Skip them if a simple
2646 * balance is requested.
2647 */
2648 if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
2649 BTRFS_BALANCE_ARGS_CONVERT))
2650 goto do_balance;
2651
2652 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2653 if (fs_info->fs_devices->num_devices == 1)
2654 allowed |= BTRFS_BLOCK_GROUP_DUP;
2655 else if (fs_info->fs_devices->num_devices < 4)
2656 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2657 else
2658 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2659 BTRFS_BLOCK_GROUP_RAID10);
2660
2661 if (!profile_is_valid(bctl->data.target, 1) ||
2662 bctl->data.target & ~allowed) {
2663 printk(KERN_ERR "btrfs: unable to start balance with target "
2664 "data profile %llu\n",
2665 (unsigned long long)bctl->data.target);
2666 ret = -EINVAL;
2667 goto out;
2668 }
2669 if (!profile_is_valid(bctl->meta.target, 1) ||
2670 bctl->meta.target & ~allowed) {
2671 printk(KERN_ERR "btrfs: unable to start balance with target "
2672 "metadata profile %llu\n",
2673 (unsigned long long)bctl->meta.target);
2674 ret = -EINVAL;
2675 goto out;
2676 }
2677 if (!profile_is_valid(bctl->sys.target, 1) ||
2678 bctl->sys.target & ~allowed) {
2679 printk(KERN_ERR "btrfs: unable to start balance with target "
2680 "system profile %llu\n",
2681 (unsigned long long)bctl->sys.target);
2682 ret = -EINVAL;
2683 goto out;
2684 }
2685
2686 if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
2687 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2688 ret = -EINVAL;
2689 goto out;
2690 }
2691
2692 /* allow to reduce meta or sys integrity only if force set */
2693 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2694 BTRFS_BLOCK_GROUP_RAID10;
2695 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2696 (fs_info->avail_system_alloc_bits & allowed) &&
2697 !(bctl->sys.target & allowed)) ||
2698 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2699 (fs_info->avail_metadata_alloc_bits & allowed) &&
2700 !(bctl->meta.target & allowed))) {
2701 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2702 printk(KERN_INFO "btrfs: force reducing metadata "
2703 "integrity\n");
2704 } else {
2705 printk(KERN_ERR "btrfs: balance will reduce metadata "
2706 "integrity, use force if you want this\n");
2707 ret = -EINVAL;
2708 goto out;
2709 }
2710 }
2711
2712do_balance:
2713 ret = insert_balance_item(fs_info->tree_root, bctl);
2714 if (ret && ret != -EEXIST)
2715 goto out;
2716
2717 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2718 BUG_ON(ret == -EEXIST);
2719 set_balance_control(bctl);
2720 } else {
2721 BUG_ON(ret != -EEXIST);
2722 spin_lock(&fs_info->balance_lock);
2723 update_balance_args(bctl);
2724 spin_unlock(&fs_info->balance_lock);
2725 }
2726
2727 atomic_inc(&fs_info->balance_running);
2728 mutex_unlock(&fs_info->balance_mutex);
2729
2730 ret = __btrfs_balance(fs_info);
2731
2732 mutex_lock(&fs_info->balance_mutex);
2733 atomic_dec(&fs_info->balance_running);
2734
2735 if (bargs) {
2736 memset(bargs, 0, sizeof(*bargs));
2737 update_ioctl_balance_args(fs_info, 0, bargs);
2738 }
2739
2740 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2741 balance_need_close(fs_info)) {
2742 __cancel_balance(fs_info);
2743 }
2744
2745 wake_up(&fs_info->balance_wait_q);
2746
2747 return ret;
2748out:
2749 if (bctl->flags & BTRFS_BALANCE_RESUME)
2750 __cancel_balance(fs_info);
2751 else
2752 kfree(bctl);
2753 return ret;
2754}
2755
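The force check above is a pure bitmask predicate: refuse when system or metadata chunks currently carry a redundant profile (DUP/RAID1/RAID10) and the conversion target carries none. A compilable distillation, with the block-group bit values assumed from ctree.h:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* local stand-ins; bit positions assumed from ctree.h */
	#define BLOCK_GROUP_RAID0	(1ULL << 3)
	#define BLOCK_GROUP_RAID1	(1ULL << 4)
	#define BLOCK_GROUP_DUP		(1ULL << 5)
	#define BLOCK_GROUP_RAID10	(1ULL << 6)

	static bool reduces_integrity(uint64_t avail_bits, uint64_t target)
	{
		const uint64_t redundant = BLOCK_GROUP_DUP |
					   BLOCK_GROUP_RAID1 |
					   BLOCK_GROUP_RAID10;

		/* redundant today, not redundant after: demand FORCE */
		return (avail_bits & redundant) && !(target & redundant);
	}

	int main(void)
	{
		/* metadata is RAID1; converting it to RAID0 drops redundancy */
		printf("%d\n", reduces_integrity(BLOCK_GROUP_RAID1,
						 BLOCK_GROUP_RAID0));	/* 1 */
		return 0;
	}
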
2756static int balance_kthread(void *data)
2757{
2758 struct btrfs_balance_control *bctl =
2759 (struct btrfs_balance_control *)data;
2760 struct btrfs_fs_info *fs_info = bctl->fs_info;
2761 int ret = 0;
2762
2763 mutex_lock(&fs_info->volume_mutex);
2764 mutex_lock(&fs_info->balance_mutex);
2765
2766 set_balance_control(bctl);
2767
2768 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2769 printk(KERN_INFO "btrfs: force skipping balance\n");
2770 } else {
2771 printk(KERN_INFO "btrfs: continuing balance\n");
2772 ret = btrfs_balance(bctl, NULL);
2773 }
2774
2775 mutex_unlock(&fs_info->balance_mutex);
2776 mutex_unlock(&fs_info->volume_mutex);
2777 return ret;
2778}
2779
2780int btrfs_recover_balance(struct btrfs_root *tree_root)
2781{
2782 struct task_struct *tsk;
2783 struct btrfs_balance_control *bctl;
2784 struct btrfs_balance_item *item;
2785 struct btrfs_disk_balance_args disk_bargs;
2786 struct btrfs_path *path;
2787 struct extent_buffer *leaf;
2788 struct btrfs_key key;
2789 int ret;
2790
2791 path = btrfs_alloc_path();
2792 if (!path)
2793 return -ENOMEM;
2794
2795 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2796 if (!bctl) {
2797 ret = -ENOMEM;
2798 goto out;
2799 }
2800
2801 key.objectid = BTRFS_BALANCE_OBJECTID;
2802 key.type = BTRFS_BALANCE_ITEM_KEY;
2803 key.offset = 0;
2804
2805 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2806 if (ret < 0)
2807 goto out_bctl;
2808 if (ret > 0) { /* ret = -ENOENT; */
2809 ret = 0;
2810 goto out_bctl;
2811 }
2812
2813 leaf = path->nodes[0];
2814 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2815
2816 bctl->fs_info = tree_root->fs_info;
2817 bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
2818
2819 btrfs_balance_data(leaf, item, &disk_bargs);
2820 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2821 btrfs_balance_meta(leaf, item, &disk_bargs);
2822 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2823 btrfs_balance_sys(leaf, item, &disk_bargs);
2824 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2825
2826 tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
2827 if (IS_ERR(tsk))
2828 ret = PTR_ERR(tsk);
2829 else
2830 goto out;
2831
2832out_bctl:
2833 kfree(bctl);
2834out:
2835 btrfs_free_path(path);
2836 return ret;
2837}
2838
2839int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2840{
2841 int ret = 0;
2842
2843 mutex_lock(&fs_info->balance_mutex);
2844 if (!fs_info->balance_ctl) {
2845 mutex_unlock(&fs_info->balance_mutex);
2846 return -ENOTCONN;
2847 }
2848
2849 if (atomic_read(&fs_info->balance_running)) {
2850 atomic_inc(&fs_info->balance_pause_req);
2851 mutex_unlock(&fs_info->balance_mutex);
2852
2853 wait_event(fs_info->balance_wait_q,
2854 atomic_read(&fs_info->balance_running) == 0);
2855
2856 mutex_lock(&fs_info->balance_mutex);
2857 /* we are good with balance_ctl ripped off from under us */
2858 BUG_ON(atomic_read(&fs_info->balance_running));
2859 atomic_dec(&fs_info->balance_pause_req);
2860 } else {
2861 ret = -ENOTCONN;
2862 }
2863
2864 mutex_unlock(&fs_info->balance_mutex);
2865 return ret;
2866}
2867
2868int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2869{
2870 mutex_lock(&fs_info->balance_mutex);
2871 if (!fs_info->balance_ctl) {
2872 mutex_unlock(&fs_info->balance_mutex);
2873 return -ENOTCONN;
2874 }
2875
2876 atomic_inc(&fs_info->balance_cancel_req);
2877 /*
2878 * if we are running, just wait and return; the balance item is
2879 * deleted in btrfs_balance() in this case
2880 */
2881 if (atomic_read(&fs_info->balance_running)) {
2882 mutex_unlock(&fs_info->balance_mutex);
2883 wait_event(fs_info->balance_wait_q,
2884 atomic_read(&fs_info->balance_running) == 0);
2885 mutex_lock(&fs_info->balance_mutex);
2886 } else {
2887 /* __cancel_balance needs volume_mutex */
2888 mutex_unlock(&fs_info->balance_mutex);
2889 mutex_lock(&fs_info->volume_mutex);
2890 mutex_lock(&fs_info->balance_mutex);
2891
2892 if (fs_info->balance_ctl)
2893 __cancel_balance(fs_info);
2894
2895 mutex_unlock(&fs_info->volume_mutex);
2896 }
2897
2898 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
2899 atomic_dec(&fs_info->balance_cancel_req);
2900 mutex_unlock(&fs_info->balance_mutex);
2901 return 0;
2902}
2903
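btrfs_pause_balance() and btrfs_cancel_balance() share one handshake with __btrfs_balance(): bump a request counter, drop balance_mutex, and sleep on balance_wait_q until balance_running reaches zero; the worker polls the counters once per chunk and wakes the queue when it stops. A userspace analogue of the requester/worker pair, using pthreads in place of the kernel waitqueue (the mapping in the comments is an approximation):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int pause_req, running = 1;
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t wait_q = PTHREAD_COND_INITIALIZER;

	static void *worker(void *arg)
	{
		int chunk;

		(void)arg;
		for (chunk = 0; chunk < 1000000; chunk++)
			if (atomic_load(&pause_req))
				break;		/* honored between chunks only */
		pthread_mutex_lock(&lock);
		atomic_store(&running, 0);	/* ~ atomic_dec(&balance_running) */
		pthread_cond_broadcast(&wait_q);/* ~ wake_up(&balance_wait_q) */
		pthread_mutex_unlock(&lock);
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, worker, NULL);
		atomic_fetch_add(&pause_req, 1);/* ~ atomic_inc(&balance_pause_req) */
		pthread_mutex_lock(&lock);
		while (atomic_load(&running))	/* ~ wait_event(balance_wait_q, ...) */
			pthread_cond_wait(&wait_q, &lock);
		pthread_mutex_unlock(&lock);
		atomic_fetch_sub(&pause_req, 1);
		pthread_join(t, NULL);
		puts("worker parked");
		return 0;
	}
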
2186/* 2904/*
2187 * shrinking a device means finding all of the device extents past 2905 * shrinking a device means finding all of the device extents past
2188 * the new size, and then following the back refs to the chunks. 2906 * the new size, and then following the back refs to the chunks.
@@ -2323,8 +3041,7 @@ done:
2323 return ret; 3041 return ret;
2324} 3042}
2325 3043
2326static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 3044static int btrfs_add_system_chunk(struct btrfs_root *root,
2327 struct btrfs_root *root,
2328 struct btrfs_key *key, 3045 struct btrfs_key *key,
2329 struct btrfs_chunk *chunk, int item_size) 3046 struct btrfs_chunk *chunk, int item_size)
2330{ 3047{
@@ -2441,10 +3158,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2441 max_stripe_size = 1024 * 1024 * 1024; 3158 max_stripe_size = 1024 * 1024 * 1024;
2442 max_chunk_size = 10 * max_stripe_size; 3159 max_chunk_size = 10 * max_stripe_size;
2443 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 3160 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2444 max_stripe_size = 256 * 1024 * 1024; 3161 /* for larger filesystems, use larger metadata chunks */
3162 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3163 max_stripe_size = 1024 * 1024 * 1024;
3164 else
3165 max_stripe_size = 256 * 1024 * 1024;
2445 max_chunk_size = max_stripe_size; 3166 max_chunk_size = max_stripe_size;
2446 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 3167 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2447 max_stripe_size = 8 * 1024 * 1024; 3168 max_stripe_size = 32 * 1024 * 1024;
2448 max_chunk_size = 2 * max_stripe_size; 3169 max_chunk_size = 2 * max_stripe_size;
2449 } else { 3170 } else {
2450 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 3171 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
@@ -2496,7 +3217,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2496 if (total_avail == 0) 3217 if (total_avail == 0)
2497 continue; 3218 continue;
2498 3219
2499 ret = find_free_dev_extent(trans, device, 3220 ret = find_free_dev_extent(device,
2500 max_stripe_size * dev_stripes, 3221 max_stripe_size * dev_stripes,
2501 &dev_offset, &max_avail); 3222 &dev_offset, &max_avail);
2502 if (ret && ret != -ENOSPC) 3223 if (ret && ret != -ENOSPC)
@@ -2687,7 +3408,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2687 BUG_ON(ret); 3408 BUG_ON(ret);
2688 3409
2689 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3410 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2690 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, 3411 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
2691 item_size); 3412 item_size);
2692 BUG_ON(ret); 3413 BUG_ON(ret);
2693 } 3414 }
@@ -2752,8 +3473,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2752 return ret; 3473 return ret;
2753 3474
2754 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 3475 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2755 (fs_info->metadata_alloc_profile & 3476 fs_info->avail_metadata_alloc_bits;
2756 fs_info->avail_metadata_alloc_bits);
2757 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3477 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2758 3478
2759 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 3479 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
@@ -2763,8 +3483,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2763 sys_chunk_offset = chunk_offset + chunk_size; 3483 sys_chunk_offset = chunk_offset + chunk_size;
2764 3484
2765 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 3485 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2766 (fs_info->system_alloc_profile & 3486 fs_info->avail_system_alloc_bits;
2767 fs_info->avail_system_alloc_bits);
2768 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3487 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2769 3488
2770 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3489 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@ -2901,26 +3620,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2901 u64 stripe_nr; 3620 u64 stripe_nr;
2902 u64 stripe_nr_orig; 3621 u64 stripe_nr_orig;
2903 u64 stripe_nr_end; 3622 u64 stripe_nr_end;
2904 int stripes_allocated = 8;
2905 int stripes_required = 1;
2906 int stripe_index; 3623 int stripe_index;
2907 int i; 3624 int i;
3625 int ret = 0;
2908 int num_stripes; 3626 int num_stripes;
2909 int max_errors = 0; 3627 int max_errors = 0;
2910 struct btrfs_bio *bbio = NULL; 3628 struct btrfs_bio *bbio = NULL;
2911 3629
2912 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2913 stripes_allocated = 1;
2914again:
2915 if (bbio_ret) {
2916 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2917 GFP_NOFS);
2918 if (!bbio)
2919 return -ENOMEM;
2920
2921 atomic_set(&bbio->error, 0);
2922 }
2923
2924 read_lock(&em_tree->lock); 3630 read_lock(&em_tree->lock);
2925 em = lookup_extent_mapping(em_tree, logical, *length); 3631 em = lookup_extent_mapping(em_tree, logical, *length);
2926 read_unlock(&em_tree->lock); 3632 read_unlock(&em_tree->lock);
@@ -2939,32 +3645,6 @@ again:
2939 if (mirror_num > map->num_stripes) 3645 if (mirror_num > map->num_stripes)
2940 mirror_num = 0; 3646 mirror_num = 0;
2941 3647
2942 /* if our btrfs_bio struct is too small, back off and try again */
2943 if (rw & REQ_WRITE) {
2944 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2945 BTRFS_BLOCK_GROUP_DUP)) {
2946 stripes_required = map->num_stripes;
2947 max_errors = 1;
2948 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2949 stripes_required = map->sub_stripes;
2950 max_errors = 1;
2951 }
2952 }
2953 if (rw & REQ_DISCARD) {
2954 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2955 BTRFS_BLOCK_GROUP_RAID1 |
2956 BTRFS_BLOCK_GROUP_DUP |
2957 BTRFS_BLOCK_GROUP_RAID10)) {
2958 stripes_required = map->num_stripes;
2959 }
2960 }
2961 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2962 stripes_allocated < stripes_required) {
2963 stripes_allocated = map->num_stripes;
2964 free_extent_map(em);
2965 kfree(bbio);
2966 goto again;
2967 }
2968 stripe_nr = offset; 3648 stripe_nr = offset;
2969 /* 3649 /*
2970 * stripe_nr counts the total number of stripes we have to stride 3650 * stripe_nr counts the total number of stripes we have to stride
@@ -2980,10 +3660,7 @@ again:
2980 3660
2981 if (rw & REQ_DISCARD) 3661 if (rw & REQ_DISCARD)
2982 *length = min_t(u64, em->len - offset, *length); 3662 *length = min_t(u64, em->len - offset, *length);
2983 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3663 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
2984 BTRFS_BLOCK_GROUP_RAID1 |
2985 BTRFS_BLOCK_GROUP_RAID10 |
2986 BTRFS_BLOCK_GROUP_DUP)) {
2987 /* we limit the length of each bio to what fits in a stripe */ 3664 /* we limit the length of each bio to what fits in a stripe */
2988 *length = min_t(u64, em->len - offset, 3665 *length = min_t(u64, em->len - offset,
2989 map->stripe_len - stripe_offset); 3666 map->stripe_len - stripe_offset);
@@ -3059,81 +3736,55 @@ again:
3059 } 3736 }
3060 BUG_ON(stripe_index >= map->num_stripes); 3737 BUG_ON(stripe_index >= map->num_stripes);
3061 3738
3739 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
3740 if (!bbio) {
3741 ret = -ENOMEM;
3742 goto out;
3743 }
3744 atomic_set(&bbio->error, 0);
3745
3062 if (rw & REQ_DISCARD) { 3746 if (rw & REQ_DISCARD) {
3747 int factor = 0;
3748 int sub_stripes = 0;
3749 u64 stripes_per_dev = 0;
3750 u32 remaining_stripes = 0;
3751
3752 if (map->type &
3753 (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3754 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3755 sub_stripes = 1;
3756 else
3757 sub_stripes = map->sub_stripes;
3758
3759 factor = map->num_stripes / sub_stripes;
3760 stripes_per_dev = div_u64_rem(stripe_nr_end -
3761 stripe_nr_orig,
3762 factor,
3763 &remaining_stripes);
3764 }
3765
3063 for (i = 0; i < num_stripes; i++) { 3766 for (i = 0; i < num_stripes; i++) {
3064 bbio->stripes[i].physical = 3767 bbio->stripes[i].physical =
3065 map->stripes[stripe_index].physical + 3768 map->stripes[stripe_index].physical +
3066 stripe_offset + stripe_nr * map->stripe_len; 3769 stripe_offset + stripe_nr * map->stripe_len;
3067 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 3770 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3068 3771
3069 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3772 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3070 u64 stripes; 3773 BTRFS_BLOCK_GROUP_RAID10)) {
3071 u32 last_stripe = 0; 3774 bbio->stripes[i].length = stripes_per_dev *
3072 int j; 3775 map->stripe_len;
3073 3776 if (i / sub_stripes < remaining_stripes)
3074 div_u64_rem(stripe_nr_end - 1, 3777 bbio->stripes[i].length +=
3075 map->num_stripes, 3778 map->stripe_len;
3076 &last_stripe); 3779 if (i < sub_stripes)
3077
3078 for (j = 0; j < map->num_stripes; j++) {
3079 u32 test;
3080
3081 div_u64_rem(stripe_nr_end - 1 - j,
3082 map->num_stripes, &test);
3083 if (test == stripe_index)
3084 break;
3085 }
3086 stripes = stripe_nr_end - 1 - j;
3087 do_div(stripes, map->num_stripes);
3088 bbio->stripes[i].length = map->stripe_len *
3089 (stripes - stripe_nr + 1);
3090
3091 if (i == 0) {
3092 bbio->stripes[i].length -=
3093 stripe_offset;
3094 stripe_offset = 0;
3095 }
3096 if (stripe_index == last_stripe)
3097 bbio->stripes[i].length -=
3098 stripe_end_offset;
3099 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3100 u64 stripes;
3101 int j;
3102 int factor = map->num_stripes /
3103 map->sub_stripes;
3104 u32 last_stripe = 0;
3105
3106 div_u64_rem(stripe_nr_end - 1,
3107 factor, &last_stripe);
3108 last_stripe *= map->sub_stripes;
3109
3110 for (j = 0; j < factor; j++) {
3111 u32 test;
3112
3113 div_u64_rem(stripe_nr_end - 1 - j,
3114 factor, &test);
3115
3116 if (test ==
3117 stripe_index / map->sub_stripes)
3118 break;
3119 }
3120 stripes = stripe_nr_end - 1 - j;
3121 do_div(stripes, factor);
3122 bbio->stripes[i].length = map->stripe_len *
3123 (stripes - stripe_nr + 1);
3124
3125 if (i < map->sub_stripes) {
3126 bbio->stripes[i].length -= 3780 bbio->stripes[i].length -=
3127 stripe_offset; 3781 stripe_offset;
3128 if (i == map->sub_stripes - 1) 3782 if ((i / sub_stripes + 1) %
3129 stripe_offset = 0; 3783 sub_stripes == remaining_stripes)
3130 }
3131 if (stripe_index >= last_stripe &&
3132 stripe_index <= (last_stripe +
3133 map->sub_stripes - 1)) {
3134 bbio->stripes[i].length -= 3784 bbio->stripes[i].length -=
3135 stripe_end_offset; 3785 stripe_end_offset;
3136 } 3786 if (i == sub_stripes - 1)
3787 stripe_offset = 0;
3137 } else 3788 } else
3138 bbio->stripes[i].length = *length; 3789 bbio->stripes[i].length = *length;
3139 3790
@@ -3155,15 +3806,22 @@ again:
3155 stripe_index++; 3806 stripe_index++;
3156 } 3807 }
3157 } 3808 }
3158 if (bbio_ret) { 3809
3159 *bbio_ret = bbio; 3810 if (rw & REQ_WRITE) {
3160 bbio->num_stripes = num_stripes; 3811 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3161 bbio->max_errors = max_errors; 3812 BTRFS_BLOCK_GROUP_RAID10 |
3162 bbio->mirror_num = mirror_num; 3813 BTRFS_BLOCK_GROUP_DUP)) {
3814 max_errors = 1;
3815 }
3163 } 3816 }
3817
3818 *bbio_ret = bbio;
3819 bbio->num_stripes = num_stripes;
3820 bbio->max_errors = max_errors;
3821 bbio->mirror_num = mirror_num;
3164out: 3822out:
3165 free_extent_map(em); 3823 free_extent_map(em);
3166 return 0; 3824 return ret;
3167} 3825}
3168 3826
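The rewritten REQ_DISCARD striping above replaces the old per-device search loops with a single division: across factor = num_stripes / sub_stripes rotation groups, div_u64_rem() gives stripes_per_dev whole stripes for every device plus one extra for the first remaining_stripes groups. A toy reproduction of the split for a 4-device RAID0 discarding 10 stripe lengths (it ignores the stripe_offset/stripe_end_offset trims and the rotation of the starting device):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		int i, num_stripes = 4, sub_stripes = 1;	/* RAID0, 4 devices */
		uint64_t stripe_nr_orig = 3, stripe_nr_end = 13;/* 10 stripes */
		uint64_t stripe_len = 64 * 1024;

		int factor = num_stripes / sub_stripes;
		uint64_t span = stripe_nr_end - stripe_nr_orig;
		uint64_t stripes_per_dev = span / factor;	/* div_u64_rem quotient */
		uint32_t remaining = span % factor;		/* ...and remainder */

		for (i = 0; i < num_stripes; i++) {
			uint64_t len = stripes_per_dev * stripe_len;

			if ((uint32_t)(i / sub_stripes) < remaining)
				len += stripe_len;	/* early groups take the extra */
			printf("dev %d: %llu bytes\n", i, (unsigned long long)len);
		}
		return 0;	/* prints 3,3,2,2 stripes' worth */
	}
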
3169int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3827int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
@@ -3304,7 +3962,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
3304 /* don't bother with additional async steps for reads, right now */ 3962 /* don't bother with additional async steps for reads, right now */
3305 if (!(rw & REQ_WRITE)) { 3963 if (!(rw & REQ_WRITE)) {
3306 bio_get(bio); 3964 bio_get(bio);
3307 submit_bio(rw, bio); 3965 btrfsic_submit_bio(rw, bio);
3308 bio_put(bio); 3966 bio_put(bio);
3309 return 0; 3967 return 0;
3310 } 3968 }
@@ -3399,7 +4057,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3399 if (async_submit) 4057 if (async_submit)
3400 schedule_bio(root, dev, rw, bio); 4058 schedule_bio(root, dev, rw, bio);
3401 else 4059 else
3402 submit_bio(rw, bio); 4060 btrfsic_submit_bio(rw, bio);
3403 } else { 4061 } else {
3404 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 4062 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
3405 bio->bi_sector = logical >> 9; 4063 bio->bi_sector = logical >> 9;
@@ -3568,7 +4226,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3568 struct btrfs_fs_devices *fs_devices; 4226 struct btrfs_fs_devices *fs_devices;
3569 int ret; 4227 int ret;
3570 4228
3571 mutex_lock(&uuid_mutex); 4229 BUG_ON(!mutex_is_locked(&uuid_mutex));
3572 4230
3573 fs_devices = root->fs_info->fs_devices->seed; 4231 fs_devices = root->fs_info->fs_devices->seed;
3574 while (fs_devices) { 4232 while (fs_devices) {
@@ -3606,7 +4264,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3606 fs_devices->seed = root->fs_info->fs_devices->seed; 4264 fs_devices->seed = root->fs_info->fs_devices->seed;
3607 root->fs_info->fs_devices->seed = fs_devices; 4265 root->fs_info->fs_devices->seed = fs_devices;
3608out: 4266out:
3609 mutex_unlock(&uuid_mutex);
3610 return ret; 4267 return ret;
3611} 4268}
3612 4269
@@ -3749,6 +4406,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
3749 if (!path) 4406 if (!path)
3750 return -ENOMEM; 4407 return -ENOMEM;
3751 4408
4409 mutex_lock(&uuid_mutex);
4410 lock_chunks(root);
4411
3752 /* first we search for all of the device items, and then we 4412 /* first we search for all of the device items, and then we
3753 * read in all of the chunk items. This way we can create chunk 4413 * read in all of the chunk items. This way we can create chunk
3754 * mappings that reference all of the devices that are found 4414
@@ -3799,6 +4459,9 @@ again:
3799 } 4459 }
3800 ret = 0; 4460 ret = 0;
3801error: 4461error:
4462 unlock_chunks(root);
4463 mutex_unlock(&uuid_mutex);
4464
3802 btrfs_free_path(path); 4465 btrfs_free_path(path);
3803 return ret; 4466 return ret;
3804} 4467}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 78f2d4d4f37f..19ac95048b88 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -186,6 +186,51 @@ struct map_lookup {
186#define map_lookup_size(n) (sizeof(struct map_lookup) + \ 186#define map_lookup_size(n) (sizeof(struct map_lookup) + \
187 (sizeof(struct btrfs_bio_stripe) * (n))) 187 (sizeof(struct btrfs_bio_stripe) * (n)))
188 188
189/*
190 * Restriper's general type filter
191 */
192#define BTRFS_BALANCE_DATA (1ULL << 0)
193#define BTRFS_BALANCE_SYSTEM (1ULL << 1)
194#define BTRFS_BALANCE_METADATA (1ULL << 2)
195
196#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \
197 BTRFS_BALANCE_SYSTEM | \
198 BTRFS_BALANCE_METADATA)
199
200#define BTRFS_BALANCE_FORCE (1ULL << 3)
201#define BTRFS_BALANCE_RESUME (1ULL << 4)
202
203/*
204 * Balance filters
205 */
206#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
207#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1)
208#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
209#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
210#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
211
212/*
213 * Profile changing flags. When SOFT is set we won't relocate chunk if
214 * it already has the target profile (even though it may be
215 * half-filled).
216 */
217#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
218#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
219
220struct btrfs_balance_args;
221struct btrfs_balance_progress;
222struct btrfs_balance_control {
223 struct btrfs_fs_info *fs_info;
224
225 struct btrfs_balance_args data;
226 struct btrfs_balance_args meta;
227 struct btrfs_balance_args sys;
228
229 u64 flags;
230
231 struct btrfs_balance_progress stat;
232};
233
189int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 234int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
190 u64 end, u64 *length); 235 u64 end, u64 *length);
191 236
@@ -228,9 +273,12 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
228 u8 *uuid, u8 *fsid); 273 u8 *uuid, u8 *fsid);
229int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 274int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
230int btrfs_init_new_device(struct btrfs_root *root, char *path); 275int btrfs_init_new_device(struct btrfs_root *root, char *path);
231int btrfs_balance(struct btrfs_root *dev_root); 276int btrfs_balance(struct btrfs_balance_control *bctl,
277 struct btrfs_ioctl_balance_args *bargs);
278int btrfs_recover_balance(struct btrfs_root *tree_root);
279int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
280int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
232int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 281int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
233int find_free_dev_extent(struct btrfs_trans_handle *trans, 282int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
234 struct btrfs_device *device, u64 num_bytes,
235 u64 *start, u64 *max_avail); 283 u64 *start, u64 *max_avail);
236#endif 284#endif
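Tying the new declarations together: a caller builds a btrfs_balance_control and passes it to btrfs_balance() with volume_mutex and balance_mutex held, after which the structure is owned by the balance machinery. A sketch (not the exact ioctl.c wiring) of a usage-filtered data balance combined with a soft metadata conversion to RAID1:

	struct btrfs_balance_control *bctl;
	int ret;

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl)
		return -ENOMEM;

	bctl->fs_info = fs_info;
	bctl->flags = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;

	/* data: only relocate chunks that are less than 30% full */
	bctl->data.usage = 30;
	bctl->data.flags = BTRFS_BALANCE_ARGS_USAGE;

	/* metadata: convert to RAID1, skipping chunks that already are */
	bctl->meta.target = BTRFS_BLOCK_GROUP_RAID1;	/* bit from ctree.h */
	bctl->meta.flags = BTRFS_BALANCE_ARGS_CONVERT | BTRFS_BALANCE_ARGS_SOFT;

	ret = btrfs_balance(bctl, NULL);	/* owns bctl from here on */
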
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3848b04e310e..e7a5659087e6 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
200 ret = btrfs_update_inode(trans, root, inode); 200 ret = btrfs_update_inode(trans, root, inode);
201 BUG_ON(ret); 201 BUG_ON(ret);
202out: 202out:
203 btrfs_end_transaction_throttle(trans, root); 203 btrfs_end_transaction(trans, root);
204 return ret; 204 return ret;
205} 205}
206 206