Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/Kconfig                19
-rw-r--r--  fs/btrfs/Makefile                3
-rw-r--r--  fs/btrfs/backref.c            1131
-rw-r--r--  fs/btrfs/backref.h               5
-rw-r--r--  fs/btrfs/btrfs_inode.h           3
-rw-r--r--  fs/btrfs/check-integrity.c    3068
-rw-r--r--  fs/btrfs/check-integrity.h      36
-rw-r--r--  fs/btrfs/ctree.c                42
-rw-r--r--  fs/btrfs/ctree.h               239
-rw-r--r--  fs/btrfs/delayed-inode.c        45
-rw-r--r--  fs/btrfs/delayed-ref.c         153
-rw-r--r--  fs/btrfs/delayed-ref.h         104
-rw-r--r--  fs/btrfs/disk-io.c             119
-rw-r--r--  fs/btrfs/disk-io.h               6
-rw-r--r--  fs/btrfs/export.c                2
-rw-r--r--  fs/btrfs/extent-tree.c         465
-rw-r--r--  fs/btrfs/extent_io.c             6
-rw-r--r--  fs/btrfs/extent_io.h             2
-rw-r--r--  fs/btrfs/file.c                 11
-rw-r--r--  fs/btrfs/free-space-cache.c    417
-rw-r--r--  fs/btrfs/inode-map.c             4
-rw-r--r--  fs/btrfs/inode.c                66
-rw-r--r--  fs/btrfs/ioctl.c               268
-rw-r--r--  fs/btrfs/ioctl.h                54
-rw-r--r--  fs/btrfs/locking.c              53
-rw-r--r--  fs/btrfs/relocation.c           20
-rw-r--r--  fs/btrfs/scrub.c                12
-rw-r--r--  fs/btrfs/super.c               190
-rw-r--r--  fs/btrfs/transaction.c          20
-rw-r--r--  fs/btrfs/tree-log.c              2
-rw-r--r--  fs/btrfs/ulist.c               220
-rw-r--r--  fs/btrfs/ulist.h                68
-rw-r--r--  fs/btrfs/volumes.c             993
-rw-r--r--  fs/btrfs/volumes.h              54
-rw-r--r--  fs/btrfs/xattr.c                 2
-rw-r--r--  fs/namei.c                      28
-rw-r--r--  fs/proc/base.c                 150
-rw-r--r--  fs/xfs/xfs_aops.c               29
-rw-r--r--  fs/xfs/xfs_attr.c                4
-rw-r--r--  fs/xfs/xfs_attr_leaf.c           9
-rw-r--r--  fs/xfs/xfs_bmap.c              116
-rw-r--r--  fs/xfs/xfs_dfrag.c              43
-rw-r--r--  fs/xfs/xfs_file.c              184
-rw-r--r--  fs/xfs/xfs_fs_subr.c             2
-rw-r--r--  fs/xfs/xfs_iget.c               24
-rw-r--r--  fs/xfs/xfs_inode.c             193
-rw-r--r--  fs/xfs/xfs_inode.h             114
-rw-r--r--  fs/xfs/xfs_inode_item.c          8
-rw-r--r--  fs/xfs/xfs_iomap.c              46
-rw-r--r--  fs/xfs/xfs_iops.c               46
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c         8
-rw-r--r--  fs/xfs/xfs_super.c               8
-rw-r--r--  fs/xfs/xfs_sync.c                9
-rw-r--r--  fs/xfs/xfs_trace.h              29
-rw-r--r--  fs/xfs/xfs_vnodeops.c           44
55 files changed, 7283 insertions, 1713 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ecb9fd3be143..d33f01c08b60 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL
 	  Linux website <http://acl.bestbits.at/>.
 
 	  If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+	bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+	depends on BTRFS_FS
+	help
+	  Adds code that examines all block write requests (including
+	  writes of the super block). The goal is to verify that the
+	  state of the filesystem on disk is always consistent, i.e.,
+	  after a power-loss or kernel panic event the filesystem is
+	  in a consistent state.
+
+	  If the integrity check tool is included and activated in
+	  the mount options, plenty of kernel memory is used, and
+	  plenty of additional CPU cycles are spent. Enabling this
+	  functionality is not intended for normal use.
+
+	  In most cases, unless you are a btrfs developer who needs
+	  to verify the integrity of (super)-block write requests
+	  during the run of a regression test, say N
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index c0ddfd29c5e5..0c4fa2befae7 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o
+	   reada.o backref.o ulist.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 22c64fff1bd5..b9a843226de8 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -19,18 +19,789 @@
19#include "ctree.h" 19#include "ctree.h"
20#include "disk-io.h" 20#include "disk-io.h"
21#include "backref.h" 21#include "backref.h"
22#include "ulist.h"
23#include "transaction.h"
24#include "delayed-ref.h"
22 25
23struct __data_ref { 26/*
27 * this structure records all encountered refs on the way up to the root
28 */
29struct __prelim_ref {
24 struct list_head list; 30 struct list_head list;
25 u64 inum; 31 u64 root_id;
26 u64 root; 32 struct btrfs_key key;
27 u64 extent_data_item_offset; 33 int level;
34 int count;
35 u64 parent;
36 u64 wanted_disk_byte;
28}; 37};
29 38
30struct __shared_ref { 39static int __add_prelim_ref(struct list_head *head, u64 root_id,
31 struct list_head list; 40 struct btrfs_key *key, int level, u64 parent,
41 u64 wanted_disk_byte, int count)
42{
43 struct __prelim_ref *ref;
44
45 /* in case we're adding delayed refs, we're holding the refs spinlock */
46 ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
47 if (!ref)
48 return -ENOMEM;
49
50 ref->root_id = root_id;
51 if (key)
52 ref->key = *key;
53 else
54 memset(&ref->key, 0, sizeof(ref->key));
55
56 ref->level = level;
57 ref->count = count;
58 ref->parent = parent;
59 ref->wanted_disk_byte = wanted_disk_byte;
60 list_add_tail(&ref->list, head);
61
62 return 0;
63}
64
65static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
66 struct ulist *parents,
67 struct extent_buffer *eb, int level,
68 u64 wanted_objectid, u64 wanted_disk_byte)
69{
70 int ret;
71 int slot;
72 struct btrfs_file_extent_item *fi;
73 struct btrfs_key key;
32 u64 disk_byte; 74 u64 disk_byte;
33}; 75
76add_parent:
77 ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
78 if (ret < 0)
79 return ret;
80
81 if (level != 0)
82 return 0;
83
84 /*
85 * if the current leaf is full with EXTENT_DATA items, we must
86 * check the next one if that holds a reference as well.
87 * ref->count cannot be used to skip this check.
88 * repeat this until we don't find any additional EXTENT_DATA items.
89 */
90 while (1) {
91 ret = btrfs_next_leaf(root, path);
92 if (ret < 0)
93 return ret;
94 if (ret)
95 return 0;
96
97 eb = path->nodes[0];
98 for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
99 btrfs_item_key_to_cpu(eb, &key, slot);
100 if (key.objectid != wanted_objectid ||
101 key.type != BTRFS_EXTENT_DATA_KEY)
102 return 0;
103 fi = btrfs_item_ptr(eb, slot,
104 struct btrfs_file_extent_item);
105 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
106 if (disk_byte == wanted_disk_byte)
107 goto add_parent;
108 }
109 }
110
111 return 0;
112}
113
114/*
115 * resolve an indirect backref in the form (root_id, key, level)
116 * to a logical address
117 */
118static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
119 struct __prelim_ref *ref,
120 struct ulist *parents)
121{
122 struct btrfs_path *path;
123 struct btrfs_root *root;
124 struct btrfs_key root_key;
125 struct btrfs_key key = {0};
126 struct extent_buffer *eb;
127 int ret = 0;
128 int root_level;
129 int level = ref->level;
130
131 path = btrfs_alloc_path();
132 if (!path)
133 return -ENOMEM;
134
135 root_key.objectid = ref->root_id;
136 root_key.type = BTRFS_ROOT_ITEM_KEY;
137 root_key.offset = (u64)-1;
138 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
139 if (IS_ERR(root)) {
140 ret = PTR_ERR(root);
141 goto out;
142 }
143
144 rcu_read_lock();
145 root_level = btrfs_header_level(root->node);
146 rcu_read_unlock();
147
148 if (root_level + 1 == level)
149 goto out;
150
151 path->lowest_level = level;
152 ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
153 pr_debug("search slot in root %llu (level %d, ref count %d) returned "
154 "%d for key (%llu %u %llu)\n",
155 (unsigned long long)ref->root_id, level, ref->count, ret,
156 (unsigned long long)ref->key.objectid, ref->key.type,
157 (unsigned long long)ref->key.offset);
158 if (ret < 0)
159 goto out;
160
161 eb = path->nodes[level];
162 if (!eb) {
163 WARN_ON(1);
164 ret = 1;
165 goto out;
166 }
167
168 if (level == 0) {
169 if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
170 ret = btrfs_next_leaf(root, path);
171 if (ret)
172 goto out;
173 eb = path->nodes[0];
174 }
175
176 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
177 }
178
179 /* the last two parameters will only be used for level == 0 */
180 ret = add_all_parents(root, path, parents, eb, level, key.objectid,
181 ref->wanted_disk_byte);
182out:
183 btrfs_free_path(path);
184 return ret;
185}
186
187/*
188 * resolve all indirect backrefs from the list
189 */
190static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
191 struct list_head *head)
192{
193 int err;
194 int ret = 0;
195 struct __prelim_ref *ref;
196 struct __prelim_ref *ref_safe;
197 struct __prelim_ref *new_ref;
198 struct ulist *parents;
199 struct ulist_node *node;
200
201 parents = ulist_alloc(GFP_NOFS);
202 if (!parents)
203 return -ENOMEM;
204
205 /*
206 * _safe allows us to insert directly after the current item without
207 * iterating over the newly inserted items.
208 * we're also allowed to re-assign ref during iteration.
209 */
210 list_for_each_entry_safe(ref, ref_safe, head, list) {
211 if (ref->parent) /* already direct */
212 continue;
213 if (ref->count == 0)
214 continue;
215 err = __resolve_indirect_ref(fs_info, ref, parents);
216 if (err) {
217 if (ret == 0)
218 ret = err;
219 continue;
220 }
221
222 /* we put the first parent into the ref at hand */
223 node = ulist_next(parents, NULL);
224 ref->parent = node ? node->val : 0;
225
226 /* additional parents require new refs being added here */
227 while ((node = ulist_next(parents, node))) {
228 new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
229 if (!new_ref) {
230 ret = -ENOMEM;
231 break;
232 }
233 memcpy(new_ref, ref, sizeof(*ref));
234 new_ref->parent = node->val;
235 list_add(&new_ref->list, &ref->list);
236 }
237 ulist_reinit(parents);
238 }
239
240 ulist_free(parents);
241 return ret;
242}
243
244/*
245 * merge two lists of backrefs and adjust counts accordingly
246 *
247 * mode = 1: merge identical keys, if key is set
248 * mode = 2: merge identical parents
249 */
250static int __merge_refs(struct list_head *head, int mode)
251{
252 struct list_head *pos1;
253
254 list_for_each(pos1, head) {
255 struct list_head *n2;
256 struct list_head *pos2;
257 struct __prelim_ref *ref1;
258
259 ref1 = list_entry(pos1, struct __prelim_ref, list);
260
261 if (mode == 1 && ref1->key.type == 0)
262 continue;
263 for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
264 pos2 = n2, n2 = pos2->next) {
265 struct __prelim_ref *ref2;
266
267 ref2 = list_entry(pos2, struct __prelim_ref, list);
268
269 if (mode == 1) {
270 if (memcmp(&ref1->key, &ref2->key,
271 sizeof(ref1->key)) ||
272 ref1->level != ref2->level ||
273 ref1->root_id != ref2->root_id)
274 continue;
275 ref1->count += ref2->count;
276 } else {
277 if (ref1->parent != ref2->parent)
278 continue;
279 ref1->count += ref2->count;
280 }
281 list_del(&ref2->list);
282 kfree(ref2);
283 }
284
285 }
286 return 0;
287}
288
289/*
290 * add all currently queued delayed refs from this head whose seq nr is
291 * smaller or equal that seq to the list
292 */
293static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
294 struct btrfs_key *info_key,
295 struct list_head *prefs)
296{
297 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
298 struct rb_node *n = &head->node.rb_node;
299 int sgn;
300 int ret;
301
302 if (extent_op && extent_op->update_key)
303 btrfs_disk_key_to_cpu(info_key, &extent_op->key);
304
305 while ((n = rb_prev(n))) {
306 struct btrfs_delayed_ref_node *node;
307 node = rb_entry(n, struct btrfs_delayed_ref_node,
308 rb_node);
309 if (node->bytenr != head->node.bytenr)
310 break;
311 WARN_ON(node->is_head);
312
313 if (node->seq > seq)
314 continue;
315
316 switch (node->action) {
317 case BTRFS_ADD_DELAYED_EXTENT:
318 case BTRFS_UPDATE_DELAYED_HEAD:
319 WARN_ON(1);
320 continue;
321 case BTRFS_ADD_DELAYED_REF:
322 sgn = 1;
323 break;
324 case BTRFS_DROP_DELAYED_REF:
325 sgn = -1;
326 break;
327 default:
328 BUG_ON(1);
329 }
330 switch (node->type) {
331 case BTRFS_TREE_BLOCK_REF_KEY: {
332 struct btrfs_delayed_tree_ref *ref;
333
334 ref = btrfs_delayed_node_to_tree_ref(node);
335 ret = __add_prelim_ref(prefs, ref->root, info_key,
336 ref->level + 1, 0, node->bytenr,
337 node->ref_mod * sgn);
338 break;
339 }
340 case BTRFS_SHARED_BLOCK_REF_KEY: {
341 struct btrfs_delayed_tree_ref *ref;
342
343 ref = btrfs_delayed_node_to_tree_ref(node);
344 ret = __add_prelim_ref(prefs, ref->root, info_key,
345 ref->level + 1, ref->parent,
346 node->bytenr,
347 node->ref_mod * sgn);
348 break;
349 }
350 case BTRFS_EXTENT_DATA_REF_KEY: {
351 struct btrfs_delayed_data_ref *ref;
352 struct btrfs_key key;
353
354 ref = btrfs_delayed_node_to_data_ref(node);
355
356 key.objectid = ref->objectid;
357 key.type = BTRFS_EXTENT_DATA_KEY;
358 key.offset = ref->offset;
359 ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
360 node->bytenr,
361 node->ref_mod * sgn);
362 break;
363 }
364 case BTRFS_SHARED_DATA_REF_KEY: {
365 struct btrfs_delayed_data_ref *ref;
366 struct btrfs_key key;
367
368 ref = btrfs_delayed_node_to_data_ref(node);
369
370 key.objectid = ref->objectid;
371 key.type = BTRFS_EXTENT_DATA_KEY;
372 key.offset = ref->offset;
373 ret = __add_prelim_ref(prefs, ref->root, &key, 0,
374 ref->parent, node->bytenr,
375 node->ref_mod * sgn);
376 break;
377 }
378 default:
379 WARN_ON(1);
380 }
381 BUG_ON(ret);
382 }
383
384 return 0;
385}
386
387/*
388 * add all inline backrefs for bytenr to the list
389 */
390static int __add_inline_refs(struct btrfs_fs_info *fs_info,
391 struct btrfs_path *path, u64 bytenr,
392 struct btrfs_key *info_key, int *info_level,
393 struct list_head *prefs)
394{
395 int ret;
396 int slot;
397 struct extent_buffer *leaf;
398 struct btrfs_key key;
399 unsigned long ptr;
400 unsigned long end;
401 struct btrfs_extent_item *ei;
402 u64 flags;
403 u64 item_size;
404
405 /*
406 * enumerate all inline refs
407 */
408 leaf = path->nodes[0];
409 slot = path->slots[0] - 1;
410
411 item_size = btrfs_item_size_nr(leaf, slot);
412 BUG_ON(item_size < sizeof(*ei));
413
414 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
415 flags = btrfs_extent_flags(leaf, ei);
416
417 ptr = (unsigned long)(ei + 1);
418 end = (unsigned long)ei + item_size;
419
420 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
421 struct btrfs_tree_block_info *info;
422 struct btrfs_disk_key disk_key;
423
424 info = (struct btrfs_tree_block_info *)ptr;
425 *info_level = btrfs_tree_block_level(leaf, info);
426 btrfs_tree_block_key(leaf, info, &disk_key);
427 btrfs_disk_key_to_cpu(info_key, &disk_key);
428 ptr += sizeof(struct btrfs_tree_block_info);
429 BUG_ON(ptr > end);
430 } else {
431 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
432 }
433
434 while (ptr < end) {
435 struct btrfs_extent_inline_ref *iref;
436 u64 offset;
437 int type;
438
439 iref = (struct btrfs_extent_inline_ref *)ptr;
440 type = btrfs_extent_inline_ref_type(leaf, iref);
441 offset = btrfs_extent_inline_ref_offset(leaf, iref);
442
443 switch (type) {
444 case BTRFS_SHARED_BLOCK_REF_KEY:
445 ret = __add_prelim_ref(prefs, 0, info_key,
446 *info_level + 1, offset,
447 bytenr, 1);
448 break;
449 case BTRFS_SHARED_DATA_REF_KEY: {
450 struct btrfs_shared_data_ref *sdref;
451 int count;
452
453 sdref = (struct btrfs_shared_data_ref *)(iref + 1);
454 count = btrfs_shared_data_ref_count(leaf, sdref);
455 ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
456 bytenr, count);
457 break;
458 }
459 case BTRFS_TREE_BLOCK_REF_KEY:
460 ret = __add_prelim_ref(prefs, offset, info_key,
461 *info_level + 1, 0, bytenr, 1);
462 break;
463 case BTRFS_EXTENT_DATA_REF_KEY: {
464 struct btrfs_extent_data_ref *dref;
465 int count;
466 u64 root;
467
468 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
469 count = btrfs_extent_data_ref_count(leaf, dref);
470 key.objectid = btrfs_extent_data_ref_objectid(leaf,
471 dref);
472 key.type = BTRFS_EXTENT_DATA_KEY;
473 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
474 root = btrfs_extent_data_ref_root(leaf, dref);
475 ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
476 count);
477 break;
478 }
479 default:
480 WARN_ON(1);
481 }
482 BUG_ON(ret);
483 ptr += btrfs_extent_inline_ref_size(type);
484 }
485
486 return 0;
487}
488
489/*
490 * add all non-inline backrefs for bytenr to the list
491 */
492static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
493 struct btrfs_path *path, u64 bytenr,
494 struct btrfs_key *info_key, int info_level,
495 struct list_head *prefs)
496{
497 struct btrfs_root *extent_root = fs_info->extent_root;
498 int ret;
499 int slot;
500 struct extent_buffer *leaf;
501 struct btrfs_key key;
502
503 while (1) {
504 ret = btrfs_next_item(extent_root, path);
505 if (ret < 0)
506 break;
507 if (ret) {
508 ret = 0;
509 break;
510 }
511
512 slot = path->slots[0];
513 leaf = path->nodes[0];
514 btrfs_item_key_to_cpu(leaf, &key, slot);
515
516 if (key.objectid != bytenr)
517 break;
518 if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
519 continue;
520 if (key.type > BTRFS_SHARED_DATA_REF_KEY)
521 break;
522
523 switch (key.type) {
524 case BTRFS_SHARED_BLOCK_REF_KEY:
525 ret = __add_prelim_ref(prefs, 0, info_key,
526 info_level + 1, key.offset,
527 bytenr, 1);
528 break;
529 case BTRFS_SHARED_DATA_REF_KEY: {
530 struct btrfs_shared_data_ref *sdref;
531 int count;
532
533 sdref = btrfs_item_ptr(leaf, slot,
534 struct btrfs_shared_data_ref);
535 count = btrfs_shared_data_ref_count(leaf, sdref);
536 ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
537 bytenr, count);
538 break;
539 }
540 case BTRFS_TREE_BLOCK_REF_KEY:
541 ret = __add_prelim_ref(prefs, key.offset, info_key,
542 info_level + 1, 0, bytenr, 1);
543 break;
544 case BTRFS_EXTENT_DATA_REF_KEY: {
545 struct btrfs_extent_data_ref *dref;
546 int count;
547 u64 root;
548
549 dref = btrfs_item_ptr(leaf, slot,
550 struct btrfs_extent_data_ref);
551 count = btrfs_extent_data_ref_count(leaf, dref);
552 key.objectid = btrfs_extent_data_ref_objectid(leaf,
553 dref);
554 key.type = BTRFS_EXTENT_DATA_KEY;
555 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
556 root = btrfs_extent_data_ref_root(leaf, dref);
557 ret = __add_prelim_ref(prefs, root, &key, 0, 0,
558 bytenr, count);
559 break;
560 }
561 default:
562 WARN_ON(1);
563 }
564 BUG_ON(ret);
565 }
566
567 return ret;
568}
569
570/*
571 * this adds all existing backrefs (inline backrefs, backrefs and delayed
572 * refs) for the given bytenr to the refs list, merges duplicates and resolves
573 * indirect refs to their parent bytenr.
574 * When roots are found, they're added to the roots list
575 *
576 * FIXME some caching might speed things up
577 */
578static int find_parent_nodes(struct btrfs_trans_handle *trans,
579 struct btrfs_fs_info *fs_info, u64 bytenr,
580 u64 seq, struct ulist *refs, struct ulist *roots)
581{
582 struct btrfs_key key;
583 struct btrfs_path *path;
584 struct btrfs_key info_key = { 0 };
585 struct btrfs_delayed_ref_root *delayed_refs = NULL;
586 struct btrfs_delayed_ref_head *head = NULL;
587 int info_level = 0;
588 int ret;
589 struct list_head prefs_delayed;
590 struct list_head prefs;
591 struct __prelim_ref *ref;
592
593 INIT_LIST_HEAD(&prefs);
594 INIT_LIST_HEAD(&prefs_delayed);
595
596 key.objectid = bytenr;
597 key.type = BTRFS_EXTENT_ITEM_KEY;
598 key.offset = (u64)-1;
599
600 path = btrfs_alloc_path();
601 if (!path)
602 return -ENOMEM;
603
604 /*
605 * grab both a lock on the path and a lock on the delayed ref head.
606 * We need both to get a consistent picture of how the refs look
607 * at a specified point in time
608 */
609again:
610 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
611 if (ret < 0)
612 goto out;
613 BUG_ON(ret == 0);
614
615 /*
616 * look if there are updates for this ref queued and lock the head
617 */
618 delayed_refs = &trans->transaction->delayed_refs;
619 spin_lock(&delayed_refs->lock);
620 head = btrfs_find_delayed_ref_head(trans, bytenr);
621 if (head) {
622 if (!mutex_trylock(&head->mutex)) {
623 atomic_inc(&head->node.refs);
624 spin_unlock(&delayed_refs->lock);
625
626 btrfs_release_path(path);
627
628 /*
629 * Mutex was contended, block until it's
630 * released and try again
631 */
632 mutex_lock(&head->mutex);
633 mutex_unlock(&head->mutex);
634 btrfs_put_delayed_ref(&head->node);
635 goto again;
636 }
637 ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed);
638 if (ret)
639 goto out;
640 }
641 spin_unlock(&delayed_refs->lock);
642
643 if (path->slots[0]) {
644 struct extent_buffer *leaf;
645 int slot;
646
647 leaf = path->nodes[0];
648 slot = path->slots[0] - 1;
649 btrfs_item_key_to_cpu(leaf, &key, slot);
650 if (key.objectid == bytenr &&
651 key.type == BTRFS_EXTENT_ITEM_KEY) {
652 ret = __add_inline_refs(fs_info, path, bytenr,
653 &info_key, &info_level, &prefs);
654 if (ret)
655 goto out;
656 ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
657 info_level, &prefs);
658 if (ret)
659 goto out;
660 }
661 }
662 btrfs_release_path(path);
663
664 /*
665 * when adding the delayed refs above, the info_key might not have
666 * been known yet. Go over the list and replace the missing keys
667 */
668 list_for_each_entry(ref, &prefs_delayed, list) {
669 if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
670 memcpy(&ref->key, &info_key, sizeof(ref->key));
671 }
672 list_splice_init(&prefs_delayed, &prefs);
673
674 ret = __merge_refs(&prefs, 1);
675 if (ret)
676 goto out;
677
678 ret = __resolve_indirect_refs(fs_info, &prefs);
679 if (ret)
680 goto out;
681
682 ret = __merge_refs(&prefs, 2);
683 if (ret)
684 goto out;
685
686 while (!list_empty(&prefs)) {
687 ref = list_first_entry(&prefs, struct __prelim_ref, list);
688 list_del(&ref->list);
689 if (ref->count < 0)
690 WARN_ON(1);
691 if (ref->count && ref->root_id && ref->parent == 0) {
692 /* no parent == root of tree */
693 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
694 BUG_ON(ret < 0);
695 }
696 if (ref->count && ref->parent) {
697 ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
698 BUG_ON(ret < 0);
699 }
700 kfree(ref);
701 }
702
703out:
704 if (head)
705 mutex_unlock(&head->mutex);
706 btrfs_free_path(path);
707 while (!list_empty(&prefs)) {
708 ref = list_first_entry(&prefs, struct __prelim_ref, list);
709 list_del(&ref->list);
710 kfree(ref);
711 }
712 while (!list_empty(&prefs_delayed)) {
713 ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
714 list);
715 list_del(&ref->list);
716 kfree(ref);
717 }
718
719 return ret;
720}
721
722/*
723 * Finds all leafs with a reference to the specified combination of bytenr and
724 * offset. key_list_head will point to a list of corresponding keys (caller must
725 * free each list element). The leafs will be stored in the leafs ulist, which
726 * must be freed with ulist_free.
727 *
728 * returns 0 on success, <0 on error
729 */
730static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
731 struct btrfs_fs_info *fs_info, u64 bytenr,
732 u64 num_bytes, u64 seq, struct ulist **leafs)
733{
734 struct ulist *tmp;
735 int ret;
736
737 tmp = ulist_alloc(GFP_NOFS);
738 if (!tmp)
739 return -ENOMEM;
740 *leafs = ulist_alloc(GFP_NOFS);
741 if (!*leafs) {
742 ulist_free(tmp);
743 return -ENOMEM;
744 }
745
746 ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
747 ulist_free(tmp);
748
749 if (ret < 0 && ret != -ENOENT) {
750 ulist_free(*leafs);
751 return ret;
752 }
753
754 return 0;
755}
756
757/*
758 * walk all backrefs for a given extent to find all roots that reference this
759 * extent. Walking a backref means finding all extents that reference this
760 * extent and in turn walk the backrefs of those, too. Naturally this is a
761 * recursive process, but here it is implemented in an iterative fashion: We
762 * find all referencing extents for the extent in question and put them on a
763 * list. In turn, we find all referencing extents for those, further appending
764 * to the list. The way we iterate the list allows adding more elements after
765 * the current while iterating. The process stops when we reach the end of the
766 * list. Found roots are added to the roots list.
767 *
768 * returns 0 on success, < 0 on error.
769 */
770int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
771 struct btrfs_fs_info *fs_info, u64 bytenr,
772 u64 num_bytes, u64 seq, struct ulist **roots)
773{
774 struct ulist *tmp;
775 struct ulist_node *node = NULL;
776 int ret;
777
778 tmp = ulist_alloc(GFP_NOFS);
779 if (!tmp)
780 return -ENOMEM;
781 *roots = ulist_alloc(GFP_NOFS);
782 if (!*roots) {
783 ulist_free(tmp);
784 return -ENOMEM;
785 }
786
787 while (1) {
788 ret = find_parent_nodes(trans, fs_info, bytenr, seq,
789 tmp, *roots);
790 if (ret < 0 && ret != -ENOENT) {
791 ulist_free(tmp);
792 ulist_free(*roots);
793 return ret;
794 }
795 node = ulist_next(tmp, node);
796 if (!node)
797 break;
798 bytenr = node->val;
799 }
800
801 ulist_free(tmp);
802 return 0;
803}
804
34 805
35static int __inode_info(u64 inum, u64 ioff, u8 key_type, 806static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path, 807 struct btrfs_root *fs_root, struct btrfs_path *path,
@@ -181,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
 	if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
 	    found_key->objectid > logical ||
-	    found_key->objectid + found_key->offset <= logical)
+	    found_key->objectid + found_key->offset <= logical) {
+		pr_debug("logical %llu is not within any extent\n",
+			 (unsigned long long)logical);
 		return -ENOENT;
+	}
 
 	eb = path->nodes[0];
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -191,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 	flags = btrfs_extent_flags(eb, ei);
 
+	pr_debug("logical %llu is at position %llu within the extent (%llu "
+		 "EXTENT_ITEM %llu) flags %#llx size %u\n",
+		 (unsigned long long)logical,
+		 (unsigned long long)(logical - found_key->objectid),
+		 (unsigned long long)found_key->objectid,
+		 (unsigned long long)found_key->offset,
+		 (unsigned long long)flags, item_size);
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		return BTRFS_EXTENT_FLAG_TREE_BLOCK;
 	if (flags & BTRFS_EXTENT_FLAG_DATA)
@@ -287,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
287 return 0; 1068 return 0;
288} 1069}
289 1070
290static int __data_list_add(struct list_head *head, u64 inum, 1071static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
291 u64 extent_data_item_offset, u64 root) 1072 struct btrfs_path *path, u64 logical,
292{ 1073 u64 orig_extent_item_objectid,
293 struct __data_ref *ref; 1074 u64 extent_item_pos, u64 root,
294 1075 iterate_extent_inodes_t *iterate, void *ctx)
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{ 1076{
413 u64 disk_byte; 1077 u64 disk_byte;
414 struct btrfs_key key; 1078 struct btrfs_key key;
@@ -416,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
416 struct extent_buffer *eb; 1080 struct extent_buffer *eb;
417 int slot; 1081 int slot;
418 int nritems; 1082 int nritems;
419 int ret; 1083 int ret = 0;
420 int found = 0; 1084 int extent_type;
1085 u64 data_offset;
1086 u64 data_len;
421 1087
422 eb = read_tree_block(fs_info->tree_root, logical, 1088 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0); 1089 fs_info->tree_root->leafsize, 0);
@@ -435,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
435 if (key.type != BTRFS_EXTENT_DATA_KEY) 1101 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue; 1102 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 1103 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) { 1104 extent_type = btrfs_file_extent_type(eb, fi);
439 free_extent_buffer(eb); 1105 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
440 return -EIO; 1106 continue;
441 } 1107 /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); 1108 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) { 1109 if (disk_byte != orig_extent_item_objectid)
444 if (found) 1110 continue;
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459 1111
460 if (!found) { 1112 data_offset = btrfs_file_extent_offset(eb, fi);
461 printk(KERN_ERR "btrfs: failed to follow shared data backref " 1113 data_len = btrfs_file_extent_num_bytes(eb, fi);
462 "to parent %llu\n", logical); 1114
463 WARN_ON(1); 1115 if (extent_item_pos < data_offset ||
464 ret = -EIO; 1116 extent_item_pos >= data_offset + data_len)
1117 continue;
1118
1119 pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
1120 "root %llu\n", orig_extent_item_objectid,
1121 key.objectid, key.offset, root);
1122 ret = iterate(key.objectid,
1123 key.offset + (extent_item_pos - data_offset),
1124 root, ctx);
1125 if (ret) {
1126 pr_debug("stopping iteration because ret=%d\n", ret);
1127 break;
1128 }
465 } 1129 }
466 1130
467 free_extent_buffer(eb); 1131 free_extent_buffer(eb);
1132
468 return ret; 1133 return ret;
469} 1134}
470 1135
471/* 1136/*
472 * calls iterate() for every inode that references the extent identified by 1137 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it 1138 * the given parameters.
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops. 1139 * when the iterator function returns a non-zero value, iteration stops.
1140 * path is guaranteed to be in released state when iterate() is called.
476 */ 1141 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info, 1142int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path, 1143 struct btrfs_path *path,
479 u64 extent_item_objectid, 1144 u64 extent_item_objectid, u64 extent_item_pos,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx) 1145 iterate_extent_inodes_t *iterate, void *ctx)
482{ 1146{
483 unsigned long ptr = 0;
484 int last;
485 int ret; 1147 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs); 1148 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); 1149 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d; 1150 struct btrfs_trans_handle *trans;
497 struct __shared_ref *ref_s; 1151 struct ulist *refs;
498 1152 struct ulist *roots;
499 eb = path->nodes[0]; 1153 struct ulist_node *ref_node = NULL;
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 1154 struct ulist_node *root_node = NULL;
501 item_size = btrfs_item_size_nr(eb, path->slots[0]); 1155 struct seq_list seq_elem;
502 1156 struct btrfs_delayed_ref_root *delayed_refs;
503 /* first we iterate the inline refs, ... */ 1157
504 do { 1158 trans = btrfs_join_transaction(fs_info->extent_root);
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size, 1159 if (IS_ERR(trans))
506 &eiref, &type); 1160 return PTR_ERR(trans);
507 if (last == -ENOENT) { 1161
508 ret = 0; 1162 pr_debug("resolving all inodes for extent %llu\n",
509 break; 1163 extent_item_objectid);
510 } 1164
511 if (last < 0) { 1165 delayed_refs = &trans->transaction->delayed_refs;
512 ret = last; 1166 spin_lock(&delayed_refs->lock);
513 break; 1167 btrfs_get_delayed_seq(delayed_refs, &seq_elem);
514 } 1168 spin_unlock(&delayed_refs->lock);
1169
1170 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
1171 extent_item_pos, seq_elem.seq,
1172 &refs);
515 1173
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1174 if (ret)
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset); 1175 goto out;
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524 1176
525 /* ... then we proceed to in-tree references and ... */ 1177 while (!ret && (ref_node = ulist_next(refs, ref_node))) {
526 while (!ret) { 1178 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
527 ++path->slots[0]; 1179 seq_elem.seq, &roots);
528 if (path->slots[0] > btrfs_header_nritems(eb)) { 1180 if (ret)
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break; 1181 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1182 while (!ret && (root_node = ulist_next(roots, root_node))) {
541 dref = btrfs_item_ptr(eb, path->slots[0], 1183 pr_debug("root %llu references leaf %llu\n",
542 struct btrfs_extent_data_ref); 1184 root_node->val, ref_node->val);
543 ret = __data_list_add_eb(&data_refs, eb, dref); 1185 ret = iterate_leaf_refs(fs_info, path, ref_node->val,
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1186 extent_item_objectid,
545 ret = __shared_list_add(&shared_refs, key.offset); 1187 extent_item_pos, root_node->val,
1188 iterate, ctx);
546 } 1189 }
547 } 1190 }
548 1191
549 btrfs_release_path(path); 1192 ulist_free(refs);
550 1193 ulist_free(roots);
551 /* 1194out:
552 * ... only at the very end we can process the refs we found. this is 1195 btrfs_put_delayed_seq(delayed_refs, &seq_elem);
553 * because the iterator function we call is allowed to make tree lookups 1196 btrfs_end_transaction(trans, fs_info->extent_root);
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret; 1197 return ret;
582} 1198}
583 1199
@@ -586,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
586 iterate_extent_inodes_t *iterate, void *ctx) 1202 iterate_extent_inodes_t *iterate, void *ctx)
587{ 1203{
588 int ret; 1204 int ret;
589 u64 offset; 1205 u64 extent_item_pos;
590 struct btrfs_key found_key; 1206 struct btrfs_key found_key;
591 1207
592 ret = extent_from_logical(fs_info, logical, path, 1208 ret = extent_from_logical(fs_info, logical, path,
593 &found_key); 1209 &found_key);
1210 btrfs_release_path(path);
594 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1211 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
595 ret = -EINVAL; 1212 ret = -EINVAL;
596 if (ret < 0) 1213 if (ret < 0)
597 return ret; 1214 return ret;
598 1215
599 offset = logical - found_key.objectid; 1216 extent_item_pos = logical - found_key.objectid;
600 ret = iterate_extent_inodes(fs_info, path, found_key.objectid, 1217 ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
601 offset, iterate, ctx); 1218 extent_item_pos, iterate, ctx);
602 1219
603 return ret; 1220 return ret;
604} 1221}
@@ -643,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
643 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { 1260 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
644 name_len = btrfs_inode_ref_name_len(eb, iref); 1261 name_len = btrfs_inode_ref_name_len(eb, iref);
645 /* path must be released before calling iterate()! */ 1262 /* path must be released before calling iterate()! */
1263 pr_debug("following ref at offset %u for inode %llu in "
1264 "tree %llu\n", cur,
1265 (unsigned long long)found_key.objectid,
1266 (unsigned long long)fs_root->objectid);
646 ret = iterate(parent, iref, eb, ctx); 1267 ret = iterate(parent, iref, eb, ctx);
647 if (ret) { 1268 if (ret) {
648 free_extent_buffer(eb); 1269 free_extent_buffer(eb);
@@ -683,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
683 return PTR_ERR(fspath); 1304 return PTR_ERR(fspath);
684 1305
685 if (fspath > fspath_min) { 1306 if (fspath > fspath_min) {
1307 pr_debug("path resolved: %s\n", fspath);
686 ipath->fspath->val[i] = (u64)(unsigned long)fspath; 1308 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
687 ++ipath->fspath->elem_cnt; 1309 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min; 1310 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else { 1311 } else {
1312 pr_debug("missed path, not enough space. missing bytes: %lu, "
1313 "constructed so far: %s\n",
1314 (unsigned long)(fspath_min - fspath), fspath_min);
690 ++ipath->fspath->elem_missed; 1315 ++ipath->fspath->elem_missed;
691 ipath->fspath->bytes_missing += fspath_min - fspath; 1316 ipath->fspath->bytes_missing += fspath_min - fspath;
692 ipath->fspath->bytes_left = 0; 1317 ipath->fspath->bytes_left = 0;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 92618837cb8f..d00dfa9ca934 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -20,6 +20,7 @@
 #define __BTRFS_BACKREF__
 
 #include "ioctl.h"
+#include "ulist.h"
 
 struct inode_fs_paths {
 	struct btrfs_path *btrfs_path;
@@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 bytenr,
+			 u64 num_bytes, u64 seq, struct ulist **roots);
+
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 				  struct btrfs_path *path);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 634608d2a6d0..9b9b15fd5204 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -51,6 +51,9 @@ struct btrfs_inode {
 	/* held while logging the inode in tree-log.c */
 	struct mutex log_mutex;
 
+	/* held while doing delalloc reservations */
+	struct mutex delalloc_mutex;
+
 	/* used to order data wrt metadata */
 	struct btrfs_ordered_inode_tree ordered_tree;
 
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644
index 000000000000..ad0b3ba735b7
--- /dev/null
+++ b/fs/btrfs/check-integrity.c
@@ -0,0 +1,3068 @@
1/*
2 * Copyright (C) STRATO AG 2011. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19/*
20 * This module can be used to catch cases when the btrfs kernel
21 * code executes write requests to the disk that bring the file
22 * system in an inconsistent state. In such a state, a power-loss
23 * or kernel panic event would cause that the data on disk is
24 * lost or at least damaged.
25 *
26 * Code is added that examines all block write requests during
27 * runtime (including writes of the super block). Three rules
28 * are verified and an error is printed on violation of the
29 * rules:
30 * 1. It is not allowed to write a disk block which is
31 * currently referenced by the super block (either directly
32 * or indirectly).
33 * 2. When a super block is written, it is verified that all
34 * referenced (directly or indirectly) blocks fulfill the
35 * following requirements:
36 * 2a. All referenced blocks have either been present when
37 * the file system was mounted, (i.e., they have been
38 * referenced by the super block) or they have been
39 * written since then and the write completion callback
40 * was called and a FLUSH request to the device where
41 * these blocks are located was received and completed.
42 * 2b. All referenced blocks need to have a generation
43 * number which is equal to the parent's number.
44 *
45 * One issue that was found using this module was that the log
46 * tree on disk became temporarily corrupted because disk blocks
47 * that had been in use for the log tree had been freed and
48 * reused too early, while being referenced by the written super
49 * block.
50 *
51 * The search term in the kernel log that can be used to filter
52 * on the existence of detected integrity issues is
53 * "btrfs: attempt".
54 *
55 * The integrity check is enabled via mount options. These
56 * mount options are only supported if the integrity check
57 * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
58 *
59 * Example #1, apply integrity checks to all metadata:
60 * mount /dev/sdb1 /mnt -o check_int
61 *
62 * Example #2, apply integrity checks to all metadata and
63 * to data extents:
64 * mount /dev/sdb1 /mnt -o check_int_data
65 *
66 * Example #3, apply integrity checks to all metadata and dump
67 * the tree that the super block references to kernel messages
68 * each time after a super block was written:
69 * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
70 *
71 * If the integrity check tool is included and activated in
72 * the mount options, plenty of kernel memory is used, and
73 * plenty of additional CPU cycles are spent. Enabling this
74 * functionality is not intended for normal use. In most
75 * cases, unless you are a btrfs developer who needs to verify
76 * the integrity of (super)-block write requests, do not
77 * enable the config option BTRFS_FS_CHECK_INTEGRITY to
78 * include and compile the integrity check tool.
79 */
80
81#include <linux/sched.h>
82#include <linux/slab.h>
83#include <linux/buffer_head.h>
84#include <linux/mutex.h>
85#include <linux/crc32c.h>
86#include <linux/genhd.h>
87#include <linux/blkdev.h>
88#include "ctree.h"
89#include "disk-io.h"
90#include "transaction.h"
91#include "extent_io.h"
92#include "disk-io.h"
93#include "volumes.h"
94#include "print-tree.h"
95#include "locking.h"
96#include "check-integrity.h"
97
98#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
99#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
100#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
101#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
102#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
103#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
104#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
105#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
106 * excluding " [...]" */
107#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
108
109#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
110
111/*
112 * The definition of the bitmask fields for the print_mask.
113 * They are specified with the mount option check_integrity_print_mask.
114 */
115#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
116#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
117#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
118#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
119#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
120#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
121#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
122#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
123#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
124#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
125#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
126#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
127#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
128
129struct btrfsic_dev_state;
130struct btrfsic_state;
131
132struct btrfsic_block {
133 u32 magic_num; /* only used for debug purposes */
134 unsigned int is_metadata:1; /* if it is meta-data, not data-data */
135 unsigned int is_superblock:1; /* if it is one of the superblocks */
136 unsigned int is_iodone:1; /* if is done by lower subsystem */
137 unsigned int iodone_w_error:1; /* error was indicated to endio */
138 unsigned int never_written:1; /* block was added because it was
139 * referenced, not because it was
140 * written */
141 unsigned int mirror_num:2; /* large enough to hold
142 * BTRFS_SUPER_MIRROR_MAX */
143 struct btrfsic_dev_state *dev_state;
144 u64 dev_bytenr; /* key, physical byte num on disk */
145 u64 logical_bytenr; /* logical byte num on disk */
146 u64 generation;
147 struct btrfs_disk_key disk_key; /* extra info to print in case of
148 * issues, will not always be correct */
149 struct list_head collision_resolving_node; /* list node */
150 struct list_head all_blocks_node; /* list node */
151
152 /* the following two lists contain block_link items */
153 struct list_head ref_to_list; /* list */
154 struct list_head ref_from_list; /* list */
155 struct btrfsic_block *next_in_same_bio;
156 void *orig_bio_bh_private;
157 union {
158 bio_end_io_t *bio;
159 bh_end_io_t *bh;
160 } orig_bio_bh_end_io;
161 int submit_bio_bh_rw;
162 u64 flush_gen; /* only valid if !never_written */
163};
164
165/*
166 * Elements of this type are allocated dynamically and required because
 167 * each block object can refer to and can be referenced from multiple blocks.
 168 * The key to look them up in the hashtable is the dev_bytenr of
 169 * the block referred to plus the one from the block referring to it.
170 * The fact that they are searchable via a hashtable and that a
171 * ref_cnt is maintained is not required for the btrfs integrity
172 * check algorithm itself, it is only used to make the output more
173 * beautiful in case that an error is detected (an error is defined
174 * as a write operation to a block while that block is still referenced).
175 */
176struct btrfsic_block_link {
177 u32 magic_num; /* only used for debug purposes */
178 u32 ref_cnt;
179 struct list_head node_ref_to; /* list node */
180 struct list_head node_ref_from; /* list node */
181 struct list_head collision_resolving_node; /* list node */
182 struct btrfsic_block *block_ref_to;
183 struct btrfsic_block *block_ref_from;
184 u64 parent_generation;
185};
186
187struct btrfsic_dev_state {
188 u32 magic_num; /* only used for debug purposes */
189 struct block_device *bdev;
190 struct btrfsic_state *state;
191 struct list_head collision_resolving_node; /* list node */
192 struct btrfsic_block dummy_block_for_bio_bh_flush;
193 u64 last_flush_gen;
194 char name[BDEVNAME_SIZE];
195};
196
197struct btrfsic_block_hashtable {
198 struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
199};
200
201struct btrfsic_block_link_hashtable {
202 struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
203};
204
205struct btrfsic_dev_state_hashtable {
206 struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
207};
208
209struct btrfsic_block_data_ctx {
210 u64 start; /* virtual bytenr */
211 u64 dev_bytenr; /* physical bytenr on device */
212 u32 len;
213 struct btrfsic_dev_state *dev;
214 char *data;
215 struct buffer_head *bh; /* do not use if set to NULL */
216};
217
218/* This structure is used to implement recursion without occupying
219 * any stack space, refer to btrfsic_process_metablock() */
220struct btrfsic_stack_frame {
221 u32 magic;
222 u32 nr;
223 int error;
224 int i;
225 int limit_nesting;
226 int num_copies;
227 int mirror_num;
228 struct btrfsic_block *block;
229 struct btrfsic_block_data_ctx *block_ctx;
230 struct btrfsic_block *next_block;
231 struct btrfsic_block_data_ctx next_block_ctx;
232 struct btrfs_header *hdr;
233 struct btrfsic_stack_frame *prev;
234};
235
236/* Some state per mounted filesystem */
237struct btrfsic_state {
238 u32 print_mask;
239 int include_extent_data;
240 int csum_size;
241 struct list_head all_blocks_list;
242 struct btrfsic_block_hashtable block_hashtable;
243 struct btrfsic_block_link_hashtable block_link_hashtable;
244 struct btrfs_root *root;
245 u64 max_superblock_generation;
246 struct btrfsic_block *latest_superblock;
247};
248
249static void btrfsic_block_init(struct btrfsic_block *b);
250static struct btrfsic_block *btrfsic_block_alloc(void);
251static void btrfsic_block_free(struct btrfsic_block *b);
252static void btrfsic_block_link_init(struct btrfsic_block_link *n);
253static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
254static void btrfsic_block_link_free(struct btrfsic_block_link *n);
255static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
256static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
257static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
258static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
259static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
260 struct btrfsic_block_hashtable *h);
261static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
262static struct btrfsic_block *btrfsic_block_hashtable_lookup(
263 struct block_device *bdev,
264 u64 dev_bytenr,
265 struct btrfsic_block_hashtable *h);
266static void btrfsic_block_link_hashtable_init(
267 struct btrfsic_block_link_hashtable *h);
268static void btrfsic_block_link_hashtable_add(
269 struct btrfsic_block_link *l,
270 struct btrfsic_block_link_hashtable *h);
271static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
272static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
273 struct block_device *bdev_ref_to,
274 u64 dev_bytenr_ref_to,
275 struct block_device *bdev_ref_from,
276 u64 dev_bytenr_ref_from,
277 struct btrfsic_block_link_hashtable *h);
278static void btrfsic_dev_state_hashtable_init(
279 struct btrfsic_dev_state_hashtable *h);
280static void btrfsic_dev_state_hashtable_add(
281 struct btrfsic_dev_state *ds,
282 struct btrfsic_dev_state_hashtable *h);
283static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
284static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
285 struct block_device *bdev,
286 struct btrfsic_dev_state_hashtable *h);
287static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
288static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
289static int btrfsic_process_superblock(struct btrfsic_state *state,
290 struct btrfs_fs_devices *fs_devices);
291static int btrfsic_process_metablock(struct btrfsic_state *state,
292 struct btrfsic_block *block,
293 struct btrfsic_block_data_ctx *block_ctx,
294 struct btrfs_header *hdr,
295 int limit_nesting, int force_iodone_flag);
296static int btrfsic_create_link_to_next_block(
297 struct btrfsic_state *state,
298 struct btrfsic_block *block,
299 struct btrfsic_block_data_ctx
300 *block_ctx, u64 next_bytenr,
301 int limit_nesting,
302 struct btrfsic_block_data_ctx *next_block_ctx,
303 struct btrfsic_block **next_blockp,
304 int force_iodone_flag,
305 int *num_copiesp, int *mirror_nump,
306 struct btrfs_disk_key *disk_key,
307 u64 parent_generation);
308static int btrfsic_handle_extent_data(struct btrfsic_state *state,
309 struct btrfsic_block *block,
310 struct btrfsic_block_data_ctx *block_ctx,
311 u32 item_offset, int force_iodone_flag);
312static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
313 struct btrfsic_block_data_ctx *block_ctx_out,
314 int mirror_num);
315static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
316 u32 len, struct block_device *bdev,
317 struct btrfsic_block_data_ctx *block_ctx_out);
318static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
319static int btrfsic_read_block(struct btrfsic_state *state,
320 struct btrfsic_block_data_ctx *block_ctx);
321static void btrfsic_dump_database(struct btrfsic_state *state);
322static int btrfsic_test_for_metadata(struct btrfsic_state *state,
323 const u8 *data, unsigned int size);
324static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
325 u64 dev_bytenr, u8 *mapped_data,
326 unsigned int len, struct bio *bio,
327 int *bio_is_patched,
328 struct buffer_head *bh,
329 int submit_bio_bh_rw);
330static int btrfsic_process_written_superblock(
331 struct btrfsic_state *state,
332 struct btrfsic_block *const block,
333 struct btrfs_super_block *const super_hdr);
334static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
335static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
336static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
337 const struct btrfsic_block *block,
338 int recursion_level);
339static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
340 struct btrfsic_block *const block,
341 int recursion_level);
342static void btrfsic_print_add_link(const struct btrfsic_state *state,
343 const struct btrfsic_block_link *l);
344static void btrfsic_print_rem_link(const struct btrfsic_state *state,
345 const struct btrfsic_block_link *l);
346static char btrfsic_get_block_type(const struct btrfsic_state *state,
347 const struct btrfsic_block *block);
348static void btrfsic_dump_tree(const struct btrfsic_state *state);
349static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
350 const struct btrfsic_block *block,
351 int indent_level);
352static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
353 struct btrfsic_state *state,
354 struct btrfsic_block_data_ctx *next_block_ctx,
355 struct btrfsic_block *next_block,
356 struct btrfsic_block *from_block,
357 u64 parent_generation);
358static struct btrfsic_block *btrfsic_block_lookup_or_add(
359 struct btrfsic_state *state,
360 struct btrfsic_block_data_ctx *block_ctx,
361 const char *additional_string,
362 int is_metadata,
363 int is_iodone,
364 int never_written,
365 int mirror_num,
366 int *was_created);
367static int btrfsic_process_superblock_dev_mirror(
368 struct btrfsic_state *state,
369 struct btrfsic_dev_state *dev_state,
370 struct btrfs_device *device,
371 int superblock_mirror_num,
372 struct btrfsic_dev_state **selected_dev_state,
373 struct btrfs_super_block *selected_super);
374static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
375 struct block_device *bdev);
376static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
377 u64 bytenr,
378 struct btrfsic_dev_state *dev_state,
379 u64 dev_bytenr, char *data);
380
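/*
 * All state of the integrity checker is meant to be serialized by
 * btrfsic_mutex, which is taken on the block submission paths; the
 * end_io handlers further below run without it. The global device
 * state hash table maps a struct block_device to the btrfsic_dev_state
 * that was set up for it when the checker was activated at mount time.
 */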
381static struct mutex btrfsic_mutex;
382static int btrfsic_is_initialized;
383static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
384
385
386static void btrfsic_block_init(struct btrfsic_block *b)
387{
388 b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
389 b->dev_state = NULL;
390 b->dev_bytenr = 0;
391 b->logical_bytenr = 0;
392 b->generation = BTRFSIC_GENERATION_UNKNOWN;
393 b->disk_key.objectid = 0;
394 b->disk_key.type = 0;
395 b->disk_key.offset = 0;
396 b->is_metadata = 0;
397 b->is_superblock = 0;
398 b->is_iodone = 0;
399 b->iodone_w_error = 0;
400 b->never_written = 0;
401 b->mirror_num = 0;
402 b->next_in_same_bio = NULL;
403 b->orig_bio_bh_private = NULL;
404 b->orig_bio_bh_end_io.bio = NULL;
405 INIT_LIST_HEAD(&b->collision_resolving_node);
406 INIT_LIST_HEAD(&b->all_blocks_node);
407 INIT_LIST_HEAD(&b->ref_to_list);
408 INIT_LIST_HEAD(&b->ref_from_list);
409 b->submit_bio_bh_rw = 0;
410 b->flush_gen = 0;
411}
412
413static struct btrfsic_block *btrfsic_block_alloc(void)
414{
415 struct btrfsic_block *b;
416
417 b = kzalloc(sizeof(*b), GFP_NOFS);
418 if (NULL != b)
419 btrfsic_block_init(b);
420
421 return b;
422}
423
424static void btrfsic_block_free(struct btrfsic_block *b)
425{
426 BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
427 kfree(b);
428}
429
430static void btrfsic_block_link_init(struct btrfsic_block_link *l)
431{
432 l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
433 l->ref_cnt = 1;
434 INIT_LIST_HEAD(&l->node_ref_to);
435 INIT_LIST_HEAD(&l->node_ref_from);
436 INIT_LIST_HEAD(&l->collision_resolving_node);
437 l->block_ref_to = NULL;
438 l->block_ref_from = NULL;
439}
440
441static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
442{
443 struct btrfsic_block_link *l;
444
445 l = kzalloc(sizeof(*l), GFP_NOFS);
446 if (NULL != l)
447 btrfsic_block_link_init(l);
448
449 return l;
450}
451
452static void btrfsic_block_link_free(struct btrfsic_block_link *l)
453{
454 BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
455 kfree(l);
456}
457
458static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
459{
460 ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
461 ds->bdev = NULL;
462 ds->state = NULL;
463 ds->name[0] = '\0';
464 INIT_LIST_HEAD(&ds->collision_resolving_node);
465 ds->last_flush_gen = 0;
466 btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
467 ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
468 ds->dummy_block_for_bio_bh_flush.dev_state = ds;
469}
470
471static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
472{
473 struct btrfsic_dev_state *ds;
474
475 ds = kzalloc(sizeof(*ds), GFP_NOFS);
476 if (NULL != ds)
477 btrfsic_dev_state_init(ds);
478
479 return ds;
480}
481
482static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
483{
484 BUG_ON(!(NULL == ds ||
485 BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
486 kfree(ds);
487}
488
489static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
490{
491 int i;
492
493 for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
494 INIT_LIST_HEAD(h->table + i);
495}
496
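/*
 * Blocks are hashed by XOR-ing the device byte offset (shifted right
 * by 16 bits) with the low bits of the block_device pointer; the
 * result is masked to the table size, which therefore has to be a
 * power of two. Collisions are chained via collision_resolving_node.
 */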
497static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
498 struct btrfsic_block_hashtable *h)
499{
500 const unsigned int hashval =
501 (((unsigned int)(b->dev_bytenr >> 16)) ^
502 ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
503 (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
504
505 list_add(&b->collision_resolving_node, h->table + hashval);
506}
507
508static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
509{
510 list_del(&b->collision_resolving_node);
511}
512
513static struct btrfsic_block *btrfsic_block_hashtable_lookup(
514 struct block_device *bdev,
515 u64 dev_bytenr,
516 struct btrfsic_block_hashtable *h)
517{
518 const unsigned int hashval =
519 (((unsigned int)(dev_bytenr >> 16)) ^
520 ((unsigned int)((uintptr_t)bdev))) &
521 (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
522 struct list_head *elem;
523
524 list_for_each(elem, h->table + hashval) {
525 struct btrfsic_block *const b =
526 list_entry(elem, struct btrfsic_block,
527 collision_resolving_node);
528
529 if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
530 return b;
531 }
532
533 return NULL;
534}
535
536static void btrfsic_block_link_hashtable_init(
537 struct btrfsic_block_link_hashtable *h)
538{
539 int i;
540
541 for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
542 INIT_LIST_HEAD(h->table + i);
543}
544
545static void btrfsic_block_link_hashtable_add(
546 struct btrfsic_block_link *l,
547 struct btrfsic_block_link_hashtable *h)
548{
549 const unsigned int hashval =
550 (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
551 ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
552 ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
553 ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
554 & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
555
556 BUG_ON(NULL == l->block_ref_to);
557 BUG_ON(NULL == l->block_ref_from);
558 list_add(&l->collision_resolving_node, h->table + hashval);
559}
560
561static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
562{
563 list_del(&l->collision_resolving_node);
564}
565
566static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
567 struct block_device *bdev_ref_to,
568 u64 dev_bytenr_ref_to,
569 struct block_device *bdev_ref_from,
570 u64 dev_bytenr_ref_from,
571 struct btrfsic_block_link_hashtable *h)
572{
573 const unsigned int hashval =
574 (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
575 ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
576 ((unsigned int)((uintptr_t)bdev_ref_to)) ^
577 ((unsigned int)((uintptr_t)bdev_ref_from))) &
578 (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
579 struct list_head *elem;
580
581 list_for_each(elem, h->table + hashval) {
582 struct btrfsic_block_link *const l =
583 list_entry(elem, struct btrfsic_block_link,
584 collision_resolving_node);
585
586 BUG_ON(NULL == l->block_ref_to);
587 BUG_ON(NULL == l->block_ref_from);
588 if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
589 l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
590 l->block_ref_from->dev_state->bdev == bdev_ref_from &&
591 l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
592 return l;
593 }
594
595 return NULL;
596}
597
598static void btrfsic_dev_state_hashtable_init(
599 struct btrfsic_dev_state_hashtable *h)
600{
601 int i;
602
603 for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
604 INIT_LIST_HEAD(h->table + i);
605}
606
607static void btrfsic_dev_state_hashtable_add(
608 struct btrfsic_dev_state *ds,
609 struct btrfsic_dev_state_hashtable *h)
610{
611 const unsigned int hashval =
612 (((unsigned int)((uintptr_t)ds->bdev)) &
613 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
614
615 list_add(&ds->collision_resolving_node, h->table + hashval);
616}
617
618static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
619{
620 list_del(&ds->collision_resolving_node);
621}
622
623static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
624 struct block_device *bdev,
625 struct btrfsic_dev_state_hashtable *h)
626{
627 const unsigned int hashval =
628 (((unsigned int)((uintptr_t)bdev)) &
629 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
630 struct list_head *elem;
631
632 list_for_each(elem, h->table + hashval) {
633 struct btrfsic_dev_state *const ds =
634 list_entry(elem, struct btrfsic_dev_state,
635 collision_resolving_node);
636
637 if (ds->bdev == bdev)
638 return ds;
639 }
640
641 return NULL;
642}
643
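/*
 * Build the initial block database: read every superblock mirror of
 * every device, remember the superblock with the highest generation in
 * state->latest_superblock, and then walk the root, chunk and log
 * trees that this superblock points to, for every mirror of each tree
 * root.
 */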
644static int btrfsic_process_superblock(struct btrfsic_state *state,
645 struct btrfs_fs_devices *fs_devices)
646{
647 int ret = 0;
648 struct btrfs_super_block *selected_super;
649 struct list_head *dev_head = &fs_devices->devices;
650 struct btrfs_device *device;
651 struct btrfsic_dev_state *selected_dev_state = NULL;
652 int pass;
653
654 BUG_ON(NULL == state);
655 selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
656 if (NULL == selected_super) {
657 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
658 return -1;
659 }
660
661 list_for_each_entry(device, dev_head, dev_list) {
662 int i;
663 struct btrfsic_dev_state *dev_state;
664
665 if (!device->bdev || !device->name)
666 continue;
667
668 dev_state = btrfsic_dev_state_lookup(device->bdev);
669 BUG_ON(NULL == dev_state);
670 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
671 ret = btrfsic_process_superblock_dev_mirror(
672 state, dev_state, device, i,
673 &selected_dev_state, selected_super);
674 if (0 != ret && 0 == i) {
675 kfree(selected_super);
676 return ret;
677 }
678 }
679 }
680
681 if (NULL == state->latest_superblock) {
682 printk(KERN_INFO "btrfsic: no superblock found!\n");
683 kfree(selected_super);
684 return -1;
685 }
686
687 state->csum_size = btrfs_super_csum_size(selected_super);
688
689 for (pass = 0; pass < 3; pass++) {
690 int num_copies;
691 int mirror_num;
692 u64 next_bytenr;
693
694 switch (pass) {
695 case 0:
696 next_bytenr = btrfs_super_root(selected_super);
697 if (state->print_mask &
698 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
699 printk(KERN_INFO "root@%llu\n",
700 (unsigned long long)next_bytenr);
701 break;
702 case 1:
703 next_bytenr = btrfs_super_chunk_root(selected_super);
704 if (state->print_mask &
705 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
706 printk(KERN_INFO "chunk@%llu\n",
707 (unsigned long long)next_bytenr);
708 break;
709 case 2:
710 next_bytenr = btrfs_super_log_root(selected_super);
711 if (0 == next_bytenr)
712 continue;
713 if (state->print_mask &
714 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
715 printk(KERN_INFO "log@%llu\n",
716 (unsigned long long)next_bytenr);
717 break;
718 }
719
720 num_copies =
721 btrfs_num_copies(&state->root->fs_info->mapping_tree,
722 next_bytenr, PAGE_SIZE);
723 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
724 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
725 (unsigned long long)next_bytenr, num_copies);
726
727 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
728 struct btrfsic_block *next_block;
729 struct btrfsic_block_data_ctx tmp_next_block_ctx;
730 struct btrfsic_block_link *l;
731 struct btrfs_header *hdr;
732
733 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
734 &tmp_next_block_ctx,
735 mirror_num);
736 if (ret) {
737 printk(KERN_INFO "btrfsic:"
738 " btrfsic_map_block(root @%llu,"
739 " mirror %d) failed!\n",
740 (unsigned long long)next_bytenr,
741 mirror_num);
742 kfree(selected_super);
743 return -1;
744 }
745
746 next_block = btrfsic_block_hashtable_lookup(
747 tmp_next_block_ctx.dev->bdev,
748 tmp_next_block_ctx.dev_bytenr,
749 &state->block_hashtable);
750 BUG_ON(NULL == next_block);
751
752 l = btrfsic_block_link_hashtable_lookup(
753 tmp_next_block_ctx.dev->bdev,
754 tmp_next_block_ctx.dev_bytenr,
755 state->latest_superblock->dev_state->
756 bdev,
757 state->latest_superblock->dev_bytenr,
758 &state->block_link_hashtable);
759 BUG_ON(NULL == l);
760
761 ret = btrfsic_read_block(state, &tmp_next_block_ctx);
762 if (ret < (int)BTRFSIC_BLOCK_SIZE) {
763 printk(KERN_INFO
764 "btrfsic: read @logical %llu failed!\n",
765 (unsigned long long)
766 tmp_next_block_ctx.start);
767 btrfsic_release_block_ctx(&tmp_next_block_ctx);
768 kfree(selected_super);
769 return -1;
770 }
771
772 hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
773 ret = btrfsic_process_metablock(state,
774 next_block,
775 &tmp_next_block_ctx,
776 hdr,
777 BTRFS_MAX_LEVEL + 3, 1);
778 btrfsic_release_block_ctx(&tmp_next_block_ctx);
779 }
780 }
781
782 kfree(selected_super);
783 return ret;
784}
785
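/*
 * Process a single superblock mirror of a single device: check that it
 * really belongs to this filesystem, add it to the block database and
 * create links from it to the root, chunk and log tree roots (for all
 * mirrors of each root).
 */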
786static int btrfsic_process_superblock_dev_mirror(
787 struct btrfsic_state *state,
788 struct btrfsic_dev_state *dev_state,
789 struct btrfs_device *device,
790 int superblock_mirror_num,
791 struct btrfsic_dev_state **selected_dev_state,
792 struct btrfs_super_block *selected_super)
793{
794 struct btrfs_super_block *super_tmp;
795 u64 dev_bytenr;
796 struct buffer_head *bh;
797 struct btrfsic_block *superblock_tmp;
798 int pass;
799 struct block_device *const superblock_bdev = device->bdev;
800
801 /* super block bytenr is always the unmapped device bytenr */
802 dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
803 bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
804 if (NULL == bh)
805 return -1;
806 super_tmp = (struct btrfs_super_block *)
807 (bh->b_data + (dev_bytenr & 4095));
808
809 if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
810 strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
811 sizeof(super_tmp->magic)) ||
812 memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
813 brelse(bh);
814 return 0;
815 }
816
817 superblock_tmp =
818 btrfsic_block_hashtable_lookup(superblock_bdev,
819 dev_bytenr,
820 &state->block_hashtable);
821 if (NULL == superblock_tmp) {
822 superblock_tmp = btrfsic_block_alloc();
823 if (NULL == superblock_tmp) {
824 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
825 brelse(bh);
826 return -1;
827 }
828 /* for superblock, only the dev_bytenr makes sense */
829 superblock_tmp->dev_bytenr = dev_bytenr;
830 superblock_tmp->dev_state = dev_state;
831 superblock_tmp->logical_bytenr = dev_bytenr;
832 superblock_tmp->generation = btrfs_super_generation(super_tmp);
833 superblock_tmp->is_metadata = 1;
834 superblock_tmp->is_superblock = 1;
835 superblock_tmp->is_iodone = 1;
836 superblock_tmp->never_written = 0;
837 superblock_tmp->mirror_num = 1 + superblock_mirror_num;
838 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
839 printk(KERN_INFO "New initial S-block (bdev %p, %s)"
840 " @%llu (%s/%llu/%d)\n",
841 superblock_bdev, device->name,
842 (unsigned long long)dev_bytenr,
843 dev_state->name,
844 (unsigned long long)dev_bytenr,
845 superblock_mirror_num);
846 list_add(&superblock_tmp->all_blocks_node,
847 &state->all_blocks_list);
848 btrfsic_block_hashtable_add(superblock_tmp,
849 &state->block_hashtable);
850 }
851
852 /* select the one with the highest generation field */
853 if (btrfs_super_generation(super_tmp) >
854 state->max_superblock_generation ||
855 0 == state->max_superblock_generation) {
856 memcpy(selected_super, super_tmp, sizeof(*selected_super));
857 *selected_dev_state = dev_state;
858 state->max_superblock_generation =
859 btrfs_super_generation(super_tmp);
860 state->latest_superblock = superblock_tmp;
861 }
862
863 for (pass = 0; pass < 3; pass++) {
864 u64 next_bytenr;
865 int num_copies;
866 int mirror_num;
867 const char *additional_string = NULL;
868 struct btrfs_disk_key tmp_disk_key;
869
870 tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
871 tmp_disk_key.offset = 0;
872 switch (pass) {
873 case 0:
874 tmp_disk_key.objectid =
875 cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
876 additional_string = "initial root ";
877 next_bytenr = btrfs_super_root(super_tmp);
878 break;
879 case 1:
880 tmp_disk_key.objectid =
881 cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
882 additional_string = "initial chunk ";
883 next_bytenr = btrfs_super_chunk_root(super_tmp);
884 break;
885 case 2:
886 tmp_disk_key.objectid =
887 cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
888 additional_string = "initial log ";
889 next_bytenr = btrfs_super_log_root(super_tmp);
890 if (0 == next_bytenr)
891 continue;
892 break;
893 }
894
895 num_copies =
896 btrfs_num_copies(&state->root->fs_info->mapping_tree,
897 next_bytenr, PAGE_SIZE);
898 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
899 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
900 (unsigned long long)next_bytenr, num_copies);
901 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
902 struct btrfsic_block *next_block;
903 struct btrfsic_block_data_ctx tmp_next_block_ctx;
904 struct btrfsic_block_link *l;
905
906 if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
907 &tmp_next_block_ctx,
908 mirror_num)) {
909 printk(KERN_INFO "btrfsic: btrfsic_map_block("
910 "bytenr @%llu, mirror %d) failed!\n",
911 (unsigned long long)next_bytenr,
912 mirror_num);
913 brelse(bh);
914 return -1;
915 }
916
917 next_block = btrfsic_block_lookup_or_add(
918 state, &tmp_next_block_ctx,
919 additional_string, 1, 1, 0,
920 mirror_num, NULL);
921 if (NULL == next_block) {
922 btrfsic_release_block_ctx(&tmp_next_block_ctx);
923 brelse(bh);
924 return -1;
925 }
926
927 next_block->disk_key = tmp_disk_key;
928 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
929 l = btrfsic_block_link_lookup_or_add(
930 state, &tmp_next_block_ctx,
931 next_block, superblock_tmp,
932 BTRFSIC_GENERATION_UNKNOWN);
933 btrfsic_release_block_ctx(&tmp_next_block_ctx);
934 if (NULL == l) {
935 brelse(bh);
936 return -1;
937 }
938 }
939 }
940 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
941 btrfsic_dump_tree_sub(state, superblock_tmp, 0);
942
943 brelse(bh);
944 return 0;
945}
946
947static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
948{
949 struct btrfsic_stack_frame *sf;
950
951 sf = kzalloc(sizeof(*sf), GFP_NOFS);
952 if (NULL == sf)
953 printk(KERN_INFO "btrfsic: alloc memory failed!\n");
954 else
955 sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
956 return sf;
957}
958
959static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
960{
961 BUG_ON(!(NULL == sf ||
962 BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
963 kfree(sf);
964}
965
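/*
 * Walk a metadata block (node or leaf) and everything it references.
 * The traversal is written iteratively with a chain of heap-allocated
 * stack frames (struct btrfsic_stack_frame) instead of recursion, so
 * that deep trees cannot overflow the limited kernel stack.
 */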
966static int btrfsic_process_metablock(
967 struct btrfsic_state *state,
968 struct btrfsic_block *const first_block,
969 struct btrfsic_block_data_ctx *const first_block_ctx,
970 struct btrfs_header *const first_hdr,
971 int first_limit_nesting, int force_iodone_flag)
972{
973 struct btrfsic_stack_frame initial_stack_frame = { 0 };
974 struct btrfsic_stack_frame *sf;
975 struct btrfsic_stack_frame *next_stack;
976
977 sf = &initial_stack_frame;
978 sf->error = 0;
979 sf->i = -1;
980 sf->limit_nesting = first_limit_nesting;
981 sf->block = first_block;
982 sf->block_ctx = first_block_ctx;
983 sf->next_block = NULL;
984 sf->hdr = first_hdr;
985 sf->prev = NULL;
986
987continue_with_new_stack_frame:
988 sf->block->generation = le64_to_cpu(sf->hdr->generation);
989 if (0 == sf->hdr->level) {
990 struct btrfs_leaf *const leafhdr =
991 (struct btrfs_leaf *)sf->hdr;
992
993 if (-1 == sf->i) {
994 sf->nr = le32_to_cpu(leafhdr->header.nritems);
995
996 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
997 printk(KERN_INFO
998 "leaf %llu items %d generation %llu"
999 " owner %llu\n",
1000 (unsigned long long)
1001 sf->block_ctx->start,
1002 sf->nr,
1003 (unsigned long long)
1004 le64_to_cpu(leafhdr->header.generation),
1005 (unsigned long long)
1006 le64_to_cpu(leafhdr->header.owner));
1007 }
1008
1009continue_with_current_leaf_stack_frame:
1010 if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
1011 sf->i++;
1012 sf->num_copies = 0;
1013 }
1014
1015 if (sf->i < sf->nr) {
1016 struct btrfs_item *disk_item = leafhdr->items + sf->i;
1017 struct btrfs_disk_key *disk_key = &disk_item->key;
1018 u8 type;
1019 const u32 item_offset = le32_to_cpu(disk_item->offset);
1020
1021 type = disk_key->type;
1022
1023 if (BTRFS_ROOT_ITEM_KEY == type) {
1024 const struct btrfs_root_item *const root_item =
1025 (struct btrfs_root_item *)
1026 (sf->block_ctx->data +
1027 offsetof(struct btrfs_leaf, items) +
1028 item_offset);
1029 const u64 next_bytenr =
1030 le64_to_cpu(root_item->bytenr);
1031
1032 sf->error =
1033 btrfsic_create_link_to_next_block(
1034 state,
1035 sf->block,
1036 sf->block_ctx,
1037 next_bytenr,
1038 sf->limit_nesting,
1039 &sf->next_block_ctx,
1040 &sf->next_block,
1041 force_iodone_flag,
1042 &sf->num_copies,
1043 &sf->mirror_num,
1044 disk_key,
1045 le64_to_cpu(root_item->
1046 generation));
1047 if (sf->error)
1048 goto one_stack_frame_backwards;
1049
1050 if (NULL != sf->next_block) {
1051 struct btrfs_header *const next_hdr =
1052 (struct btrfs_header *)
1053 sf->next_block_ctx.data;
1054
1055 next_stack =
1056 btrfsic_stack_frame_alloc();
1057 if (NULL == next_stack) {
1058 btrfsic_release_block_ctx(
1059 &sf->
1060 next_block_ctx);
1061 goto one_stack_frame_backwards;
1062 }
1063
1064 next_stack->i = -1;
1065 next_stack->block = sf->next_block;
1066 next_stack->block_ctx =
1067 &sf->next_block_ctx;
1068 next_stack->next_block = NULL;
1069 next_stack->hdr = next_hdr;
1070 next_stack->limit_nesting =
1071 sf->limit_nesting - 1;
1072 next_stack->prev = sf;
1073 sf = next_stack;
1074 goto continue_with_new_stack_frame;
1075 }
1076 } else if (BTRFS_EXTENT_DATA_KEY == type &&
1077 state->include_extent_data) {
1078 sf->error = btrfsic_handle_extent_data(
1079 state,
1080 sf->block,
1081 sf->block_ctx,
1082 item_offset,
1083 force_iodone_flag);
1084 if (sf->error)
1085 goto one_stack_frame_backwards;
1086 }
1087
1088 goto continue_with_current_leaf_stack_frame;
1089 }
1090 } else {
1091 struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
1092
1093 if (-1 == sf->i) {
1094 sf->nr = le32_to_cpu(nodehdr->header.nritems);
1095
1096 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1097 printk(KERN_INFO "node %llu level %d items %d"
1098 " generation %llu owner %llu\n",
1099 (unsigned long long)
1100 sf->block_ctx->start,
1101 nodehdr->header.level, sf->nr,
1102 (unsigned long long)
1103 le64_to_cpu(nodehdr->header.generation),
1104 (unsigned long long)
1105 le64_to_cpu(nodehdr->header.owner));
1106 }
1107
1108continue_with_current_node_stack_frame:
1109 if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
1110 sf->i++;
1111 sf->num_copies = 0;
1112 }
1113
1114 if (sf->i < sf->nr) {
1115 struct btrfs_key_ptr *disk_key_ptr =
1116 nodehdr->ptrs + sf->i;
1117 const u64 next_bytenr =
1118 le64_to_cpu(disk_key_ptr->blockptr);
1119
1120 sf->error = btrfsic_create_link_to_next_block(
1121 state,
1122 sf->block,
1123 sf->block_ctx,
1124 next_bytenr,
1125 sf->limit_nesting,
1126 &sf->next_block_ctx,
1127 &sf->next_block,
1128 force_iodone_flag,
1129 &sf->num_copies,
1130 &sf->mirror_num,
1131 &disk_key_ptr->key,
1132 le64_to_cpu(disk_key_ptr->generation));
1133 if (sf->error)
1134 goto one_stack_frame_backwards;
1135
1136 if (NULL != sf->next_block) {
1137 struct btrfs_header *const next_hdr =
1138 (struct btrfs_header *)
1139 sf->next_block_ctx.data;
1140
1141 next_stack = btrfsic_stack_frame_alloc();
1142 if (NULL == next_stack)
1143 goto one_stack_frame_backwards;
1144
1145 next_stack->i = -1;
1146 next_stack->block = sf->next_block;
1147 next_stack->block_ctx = &sf->next_block_ctx;
1148 next_stack->next_block = NULL;
1149 next_stack->hdr = next_hdr;
1150 next_stack->limit_nesting =
1151 sf->limit_nesting - 1;
1152 next_stack->prev = sf;
1153 sf = next_stack;
1154 goto continue_with_new_stack_frame;
1155 }
1156
1157 goto continue_with_current_node_stack_frame;
1158 }
1159 }
1160
1161one_stack_frame_backwards:
1162 if (NULL != sf->prev) {
1163 struct btrfsic_stack_frame *const prev = sf->prev;
1164
1165 /* the one for the initial block is freed in the caller */
1166 btrfsic_release_block_ctx(sf->block_ctx);
1167
1168 if (sf->error) {
1169 prev->error = sf->error;
1170 btrfsic_stack_frame_free(sf);
1171 sf = prev;
1172 goto one_stack_frame_backwards;
1173 }
1174
1175 btrfsic_stack_frame_free(sf);
1176 sf = prev;
1177 goto continue_with_new_stack_frame;
1178 } else {
1179 BUG_ON(&initial_stack_frame != sf);
1180 }
1181
1182 return sf->error;
1183}
1184
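/*
 * Look up or create the block referenced by a key pointer or root item
 * and link it to the referencing block. *num_copiesp and *mirror_nump
 * carry the caller's iteration over all mirrors of the referenced
 * block. If a new link was created and the nesting limit has not been
 * reached, the referenced block is read and returned via *next_blockp
 * so that the caller can descend into it.
 */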
1185static int btrfsic_create_link_to_next_block(
1186 struct btrfsic_state *state,
1187 struct btrfsic_block *block,
1188 struct btrfsic_block_data_ctx *block_ctx,
1189 u64 next_bytenr,
1190 int limit_nesting,
1191 struct btrfsic_block_data_ctx *next_block_ctx,
1192 struct btrfsic_block **next_blockp,
1193 int force_iodone_flag,
1194 int *num_copiesp, int *mirror_nump,
1195 struct btrfs_disk_key *disk_key,
1196 u64 parent_generation)
1197{
1198 struct btrfsic_block *next_block = NULL;
1199 int ret;
1200 struct btrfsic_block_link *l;
1201 int did_alloc_block_link;
1202 int block_was_created;
1203
1204 *next_blockp = NULL;
1205 if (0 == *num_copiesp) {
1206 *num_copiesp =
1207 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1208 next_bytenr, PAGE_SIZE);
1209 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1210 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1211 (unsigned long long)next_bytenr, *num_copiesp);
1212 *mirror_nump = 1;
1213 }
1214
1215 if (*mirror_nump > *num_copiesp)
1216 return 0;
1217
1218 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1219 printk(KERN_INFO
1220 "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
1221 *mirror_nump);
1222 ret = btrfsic_map_block(state, next_bytenr,
1223 BTRFSIC_BLOCK_SIZE,
1224 next_block_ctx, *mirror_nump);
1225 if (ret) {
1226 printk(KERN_INFO
1227 "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
1228 (unsigned long long)next_bytenr, *mirror_nump);
1229 btrfsic_release_block_ctx(next_block_ctx);
1230 *next_blockp = NULL;
1231 return -1;
1232 }
1233
1234 next_block = btrfsic_block_lookup_or_add(state,
1235 next_block_ctx, "referenced ",
1236 1, force_iodone_flag,
1237 !force_iodone_flag,
1238 *mirror_nump,
1239 &block_was_created);
1240 if (NULL == next_block) {
1241 btrfsic_release_block_ctx(next_block_ctx);
1242 *next_blockp = NULL;
1243 return -1;
1244 }
1245 if (block_was_created) {
1246 l = NULL;
1247 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
1248 } else {
1249 if (next_block->logical_bytenr != next_bytenr &&
1250 !(!next_block->is_metadata &&
1251 0 == next_block->logical_bytenr)) {
1252 printk(KERN_INFO
1253 "Referenced block @%llu (%s/%llu/%d)"
1254 " found in hash table, %c,"
1255 " bytenr mismatch (!= stored %llu).\n",
1256 (unsigned long long)next_bytenr,
1257 next_block_ctx->dev->name,
1258 (unsigned long long)next_block_ctx->dev_bytenr,
1259 *mirror_nump,
1260 btrfsic_get_block_type(state, next_block),
1261 (unsigned long long)next_block->logical_bytenr);
1262 } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1263 printk(KERN_INFO
1264 "Referenced block @%llu (%s/%llu/%d)"
1265 " found in hash table, %c.\n",
1266 (unsigned long long)next_bytenr,
1267 next_block_ctx->dev->name,
1268 (unsigned long long)next_block_ctx->dev_bytenr,
1269 *mirror_nump,
1270 btrfsic_get_block_type(state, next_block));
1271 next_block->logical_bytenr = next_bytenr;
1272
1273 next_block->mirror_num = *mirror_nump;
1274 l = btrfsic_block_link_hashtable_lookup(
1275 next_block_ctx->dev->bdev,
1276 next_block_ctx->dev_bytenr,
1277 block_ctx->dev->bdev,
1278 block_ctx->dev_bytenr,
1279 &state->block_link_hashtable);
1280 }
1281
1282 next_block->disk_key = *disk_key;
1283 if (NULL == l) {
1284 l = btrfsic_block_link_alloc();
1285 if (NULL == l) {
1286 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1287 btrfsic_release_block_ctx(next_block_ctx);
1288 *next_blockp = NULL;
1289 return -1;
1290 }
1291
1292 did_alloc_block_link = 1;
1293 l->block_ref_to = next_block;
1294 l->block_ref_from = block;
1295 l->ref_cnt = 1;
1296 l->parent_generation = parent_generation;
1297
1298 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1299 btrfsic_print_add_link(state, l);
1300
1301 list_add(&l->node_ref_to, &block->ref_to_list);
1302 list_add(&l->node_ref_from, &next_block->ref_from_list);
1303
1304 btrfsic_block_link_hashtable_add(l,
1305 &state->block_link_hashtable);
1306 } else {
1307 did_alloc_block_link = 0;
1308 if (0 == limit_nesting) {
1309 l->ref_cnt++;
1310 l->parent_generation = parent_generation;
1311 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1312 btrfsic_print_add_link(state, l);
1313 }
1314 }
1315
1316 if (limit_nesting > 0 && did_alloc_block_link) {
1317 ret = btrfsic_read_block(state, next_block_ctx);
1318 if (ret < (int)BTRFSIC_BLOCK_SIZE) {
1319 printk(KERN_INFO
1320 "btrfsic: read block @logical %llu failed!\n",
1321 (unsigned long long)next_bytenr);
1322 btrfsic_release_block_ctx(next_block_ctx);
1323 *next_blockp = NULL;
1324 return -1;
1325 }
1326
1327 *next_blockp = next_block;
1328 } else {
1329 *next_blockp = NULL;
1330 }
1331 (*mirror_nump)++;
1332
1333 return 0;
1334}
1335
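/*
 * Handle an EXTENT_DATA item found in a leaf: for every regular
 * (non-hole) extent, split it into chunks of at most BTRFSIC_BLOCK_SIZE
 * bytes and, for every mirror of every chunk, look up or create a data
 * block entry and link it to the leaf.
 */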
1336static int btrfsic_handle_extent_data(
1337 struct btrfsic_state *state,
1338 struct btrfsic_block *block,
1339 struct btrfsic_block_data_ctx *block_ctx,
1340 u32 item_offset, int force_iodone_flag)
1341{
1342 int ret;
1343 struct btrfs_file_extent_item *file_extent_item =
1344 (struct btrfs_file_extent_item *)(block_ctx->data +
1345 offsetof(struct btrfs_leaf,
1346 items) + item_offset);
1347 u64 next_bytenr =
1348 le64_to_cpu(file_extent_item->disk_bytenr) +
1349 le64_to_cpu(file_extent_item->offset);
1350 u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
1351 u64 generation = le64_to_cpu(file_extent_item->generation);
1352 struct btrfsic_block_link *l;
1353
1354 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1355 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
1356 " offset = %llu, num_bytes = %llu\n",
1357 file_extent_item->type,
1358 (unsigned long long)
1359 le64_to_cpu(file_extent_item->disk_bytenr),
1360 (unsigned long long)
1361 le64_to_cpu(file_extent_item->offset),
1362 (unsigned long long)
1363 le64_to_cpu(file_extent_item->num_bytes));
1364 if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
1365 ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
1366 return 0;
1367 while (num_bytes > 0) {
1368 u32 chunk_len;
1369 int num_copies;
1370 int mirror_num;
1371
1372 if (num_bytes > BTRFSIC_BLOCK_SIZE)
1373 chunk_len = BTRFSIC_BLOCK_SIZE;
1374 else
1375 chunk_len = num_bytes;
1376
1377 num_copies =
1378 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1379 next_bytenr, PAGE_SIZE);
1380 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1381 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1382 (unsigned long long)next_bytenr, num_copies);
1383 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
1384 struct btrfsic_block_data_ctx next_block_ctx;
1385 struct btrfsic_block *next_block;
1386 int block_was_created;
1387
1388 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1389 printk(KERN_INFO "btrfsic_handle_extent_data("
1390 "mirror_num=%d)\n", mirror_num);
1391 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1392 printk(KERN_INFO
1393 "\tdisk_bytenr = %llu, num_bytes %u\n",
1394 (unsigned long long)next_bytenr,
1395 chunk_len);
1396 ret = btrfsic_map_block(state, next_bytenr,
1397 chunk_len, &next_block_ctx,
1398 mirror_num);
1399 if (ret) {
1400 printk(KERN_INFO
1401 "btrfsic: btrfsic_map_block(@%llu,"
1402 " mirror=%d) failed!\n",
1403 (unsigned long long)next_bytenr,
1404 mirror_num);
1405 return -1;
1406 }
1407
1408 next_block = btrfsic_block_lookup_or_add(
1409 state,
1410 &next_block_ctx,
1411 "referenced ",
1412 0,
1413 force_iodone_flag,
1414 !force_iodone_flag,
1415 mirror_num,
1416 &block_was_created);
1417 if (NULL == next_block) {
1418 printk(KERN_INFO
1419 "btrfsic: error, kmalloc failed!\n");
1420 btrfsic_release_block_ctx(&next_block_ctx);
1421 return -1;
1422 }
1423 if (!block_was_created) {
1424 if (next_block->logical_bytenr != next_bytenr &&
1425 !(!next_block->is_metadata &&
1426 0 == next_block->logical_bytenr)) {
1427 printk(KERN_INFO
1428 "Referenced block"
1429 " @%llu (%s/%llu/%d)"
1430 " found in hash table, D,"
1431 " bytenr mismatch"
1432 " (!= stored %llu).\n",
1433 (unsigned long long)next_bytenr,
1434 next_block_ctx.dev->name,
1435 (unsigned long long)
1436 next_block_ctx.dev_bytenr,
1437 mirror_num,
1438 (unsigned long long)
1439 next_block->logical_bytenr);
1440 }
1441 next_block->logical_bytenr = next_bytenr;
1442 next_block->mirror_num = mirror_num;
1443 }
1444
1445 l = btrfsic_block_link_lookup_or_add(state,
1446 &next_block_ctx,
1447 next_block, block,
1448 generation);
1449 btrfsic_release_block_ctx(&next_block_ctx);
1450 if (NULL == l)
1451 return -1;
1452 }
1453
1454 next_bytenr += chunk_len;
1455 num_bytes -= chunk_len;
1456 }
1457
1458 return 0;
1459}
1460
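/*
 * Translate a logical bytenr into the device and device byte offset of
 * the requested mirror using btrfs_map_block() and fill in the block
 * data context. The block contents are not read here; see
 * btrfsic_read_block().
 */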
1461static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1462 struct btrfsic_block_data_ctx *block_ctx_out,
1463 int mirror_num)
1464{
1465 int ret;
1466 u64 length;
1467 struct btrfs_bio *multi = NULL;
1468 struct btrfs_device *device;
1469
1470 length = len;
1471 ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
1472 bytenr, &length, &multi, mirror_num);
/* bail out if the chunk mapping failed; multi is not valid then */
if (ret) {
printk(KERN_INFO
"btrfsic: btrfs_map_block(logical @%llu, mirror %d) failed!\n",
(unsigned long long)bytenr, mirror_num);
return ret;
}
 1473
1474 device = multi->stripes[0].dev;
1475 block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
1476 block_ctx_out->dev_bytenr = multi->stripes[0].physical;
1477 block_ctx_out->start = bytenr;
1478 block_ctx_out->len = len;
1479 block_ctx_out->data = NULL;
1480 block_ctx_out->bh = NULL;
1481
1482 if (0 == ret)
1483 kfree(multi);
1484 if (NULL == block_ctx_out->dev) {
1485 ret = -ENXIO;
1486 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
1487 }
1488
1489 return ret;
1490}
1491
1492static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1493 u32 len, struct block_device *bdev,
1494 struct btrfsic_block_data_ctx *block_ctx_out)
1495{
1496 block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
1497 block_ctx_out->dev_bytenr = bytenr;
1498 block_ctx_out->start = bytenr;
1499 block_ctx_out->len = len;
1500 block_ctx_out->data = NULL;
1501 block_ctx_out->bh = NULL;
1502 if (NULL != block_ctx_out->dev) {
1503 return 0;
1504 } else {
1505 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
1506 return -ENXIO;
1507 }
1508}
1509
1510static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
1511{
1512 if (NULL != block_ctx->bh) {
1513 brelse(block_ctx->bh);
1514 block_ctx->bh = NULL;
1515 }
1516}
1517
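/*
 * Read the block described by the context with __bread(). Only a
 * single 4096 byte block at a 4096 byte aligned device offset is
 * supported here.
 */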
1518static int btrfsic_read_block(struct btrfsic_state *state,
1519 struct btrfsic_block_data_ctx *block_ctx)
1520{
1521 block_ctx->bh = NULL;
1522 if (block_ctx->dev_bytenr & 4095) {
1523 printk(KERN_INFO
1524 "btrfsic: read_block() with unaligned bytenr %llu\n",
1525 (unsigned long long)block_ctx->dev_bytenr);
1526 return -1;
1527 }
1528 if (block_ctx->len > 4096) {
1529 printk(KERN_INFO
1530 "btrfsic: read_block() with too huge size %d\n",
1531 block_ctx->len);
1532 return -1;
1533 }
1534
1535 block_ctx->bh = __bread(block_ctx->dev->bdev,
1536 block_ctx->dev_bytenr >> 12, 4096);
1537 if (NULL == block_ctx->bh)
1538 return -1;
1539 block_ctx->data = block_ctx->bh->b_data;
1540
1541 return block_ctx->len;
1542}
1543
1544static void btrfsic_dump_database(struct btrfsic_state *state)
1545{
1546 struct list_head *elem_all;
1547
1548 BUG_ON(NULL == state);
1549
1550 printk(KERN_INFO "all_blocks_list:\n");
1551 list_for_each(elem_all, &state->all_blocks_list) {
1552 const struct btrfsic_block *const b_all =
1553 list_entry(elem_all, struct btrfsic_block,
1554 all_blocks_node);
1555 struct list_head *elem_ref_to;
1556 struct list_head *elem_ref_from;
1557
1558 printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
1559 btrfsic_get_block_type(state, b_all),
1560 (unsigned long long)b_all->logical_bytenr,
1561 b_all->dev_state->name,
1562 (unsigned long long)b_all->dev_bytenr,
1563 b_all->mirror_num);
1564
1565 list_for_each(elem_ref_to, &b_all->ref_to_list) {
1566 const struct btrfsic_block_link *const l =
1567 list_entry(elem_ref_to,
1568 struct btrfsic_block_link,
1569 node_ref_to);
1570
1571 printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
1572 " refers %u* to"
1573 " %c @%llu (%s/%llu/%d)\n",
1574 btrfsic_get_block_type(state, b_all),
1575 (unsigned long long)b_all->logical_bytenr,
1576 b_all->dev_state->name,
1577 (unsigned long long)b_all->dev_bytenr,
1578 b_all->mirror_num,
1579 l->ref_cnt,
1580 btrfsic_get_block_type(state, l->block_ref_to),
1581 (unsigned long long)
1582 l->block_ref_to->logical_bytenr,
1583 l->block_ref_to->dev_state->name,
1584 (unsigned long long)l->block_ref_to->dev_bytenr,
1585 l->block_ref_to->mirror_num);
1586 }
1587
1588 list_for_each(elem_ref_from, &b_all->ref_from_list) {
1589 const struct btrfsic_block_link *const l =
1590 list_entry(elem_ref_from,
1591 struct btrfsic_block_link,
1592 node_ref_from);
1593
1594 printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
1595 " is ref %u* from"
1596 " %c @%llu (%s/%llu/%d)\n",
1597 btrfsic_get_block_type(state, b_all),
1598 (unsigned long long)b_all->logical_bytenr,
1599 b_all->dev_state->name,
1600 (unsigned long long)b_all->dev_bytenr,
1601 b_all->mirror_num,
1602 l->ref_cnt,
1603 btrfsic_get_block_type(state, l->block_ref_from),
1604 (unsigned long long)
1605 l->block_ref_from->logical_bytenr,
1606 l->block_ref_from->dev_state->name,
1607 (unsigned long long)
1608 l->block_ref_from->dev_bytenr,
1609 l->block_ref_from->mirror_num);
1610 }
1611
1612 printk(KERN_INFO "\n");
1613 }
1614}
1615
1616/*
 1617 * Test whether the disk block contains a tree block (leaf or node).
 1618 * Note that this test fails for the super block.
1619 */
1620static int btrfsic_test_for_metadata(struct btrfsic_state *state,
1621 const u8 *data, unsigned int size)
1622{
1623 struct btrfs_header *h;
1624 u8 csum[BTRFS_CSUM_SIZE];
1625 u32 crc = ~(u32)0;
1626 int fail = 0;
1627 int crc_fail = 0;
1628
1629 h = (struct btrfs_header *)data;
1630
1631 if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
1632 fail++;
1633
1634 crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
1635 btrfs_csum_final(crc, csum);
1636 if (memcmp(csum, h->csum, state->csum_size))
1637 crc_fail++;
1638
1639 return fail || crc_fail;
1640}
1641
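/*
 * Central write interception: every intercepted block write (bio or
 * buffer_head based) ends up here. If the block is already known, its
 * old outgoing references are dropped and overwrite rules are checked;
 * then the newly written content is parsed to rebuild the references,
 * and the bio/bh end_io callback is patched so that IO completion and
 * flush/FUA state can be tracked.
 */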
1642static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1643 u64 dev_bytenr,
1644 u8 *mapped_data, unsigned int len,
1645 struct bio *bio,
1646 int *bio_is_patched,
1647 struct buffer_head *bh,
1648 int submit_bio_bh_rw)
1649{
1650 int is_metadata;
1651 struct btrfsic_block *block;
1652 struct btrfsic_block_data_ctx block_ctx;
1653 int ret;
1654 struct btrfsic_state *state = dev_state->state;
1655 struct block_device *bdev = dev_state->bdev;
1656
1657 WARN_ON(len > PAGE_SIZE);
1658 is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
1659 if (NULL != bio_is_patched)
1660 *bio_is_patched = 0;
1661
1662 block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
1663 &state->block_hashtable);
1664 if (NULL != block) {
1665 u64 bytenr;
1666 struct list_head *elem_ref_to;
1667 struct list_head *tmp_ref_to;
1668
1669 if (block->is_superblock) {
1670 bytenr = le64_to_cpu(((struct btrfs_super_block *)
1671 mapped_data)->bytenr);
1672 is_metadata = 1;
1673 if (state->print_mask &
1674 BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
1675 printk(KERN_INFO
1676 "[before new superblock is written]:\n");
1677 btrfsic_dump_tree_sub(state, block, 0);
1678 }
1679 }
1680 if (is_metadata) {
1681 if (!block->is_superblock) {
1682 bytenr = le64_to_cpu(((struct btrfs_header *)
1683 mapped_data)->bytenr);
1684 btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
1685 dev_state,
1686 dev_bytenr,
1687 mapped_data);
1688 }
1689 if (block->logical_bytenr != bytenr) {
1690 printk(KERN_INFO
1691 "Written block @%llu (%s/%llu/%d)"
1692 " found in hash table, %c,"
1693 " bytenr mismatch"
1694 " (!= stored %llu).\n",
1695 (unsigned long long)bytenr,
1696 dev_state->name,
1697 (unsigned long long)dev_bytenr,
1698 block->mirror_num,
1699 btrfsic_get_block_type(state, block),
1700 (unsigned long long)
1701 block->logical_bytenr);
1702 block->logical_bytenr = bytenr;
1703 } else if (state->print_mask &
1704 BTRFSIC_PRINT_MASK_VERBOSE)
1705 printk(KERN_INFO
1706 "Written block @%llu (%s/%llu/%d)"
1707 " found in hash table, %c.\n",
1708 (unsigned long long)bytenr,
1709 dev_state->name,
1710 (unsigned long long)dev_bytenr,
1711 block->mirror_num,
1712 btrfsic_get_block_type(state, block));
1713 } else {
1714 bytenr = block->logical_bytenr;
1715 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1716 printk(KERN_INFO
1717 "Written block @%llu (%s/%llu/%d)"
1718 " found in hash table, %c.\n",
1719 (unsigned long long)bytenr,
1720 dev_state->name,
1721 (unsigned long long)dev_bytenr,
1722 block->mirror_num,
1723 btrfsic_get_block_type(state, block));
1724 }
1725
1726 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1727 printk(KERN_INFO
1728 "ref_to_list: %cE, ref_from_list: %cE\n",
1729 list_empty(&block->ref_to_list) ? ' ' : '!',
1730 list_empty(&block->ref_from_list) ? ' ' : '!');
1731 if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
1732 printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
1733 " @%llu (%s/%llu/%d), old(gen=%llu,"
1734 " objectid=%llu, type=%d, offset=%llu),"
1735 " new(gen=%llu),"
1736 " which is referenced by most recent superblock"
1737 " (superblockgen=%llu)!\n",
1738 btrfsic_get_block_type(state, block),
1739 (unsigned long long)bytenr,
1740 dev_state->name,
1741 (unsigned long long)dev_bytenr,
1742 block->mirror_num,
1743 (unsigned long long)block->generation,
1744 (unsigned long long)
1745 le64_to_cpu(block->disk_key.objectid),
1746 block->disk_key.type,
1747 (unsigned long long)
1748 le64_to_cpu(block->disk_key.offset),
1749 (unsigned long long)
1750 le64_to_cpu(((struct btrfs_header *)
1751 mapped_data)->generation),
1752 (unsigned long long)
1753 state->max_superblock_generation);
1754 btrfsic_dump_tree(state);
1755 }
1756
1757 if (!block->is_iodone && !block->never_written) {
1758 printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
1759 " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
1760 " which is not yet iodone!\n",
1761 btrfsic_get_block_type(state, block),
1762 (unsigned long long)bytenr,
1763 dev_state->name,
1764 (unsigned long long)dev_bytenr,
1765 block->mirror_num,
1766 (unsigned long long)block->generation,
1767 (unsigned long long)
1768 le64_to_cpu(((struct btrfs_header *)
1769 mapped_data)->generation));
1770 /* it would not be safe to go on */
1771 btrfsic_dump_tree(state);
1772 return;
1773 }
1774
1775 /*
1776 * Clear all references of this block. Do not free
 1777 * the block itself even if it is not referenced anymore
 1778 * because it still carries valuable information
 1779 * like whether it was ever written and whether IO completed.
1780 */
1781 list_for_each_safe(elem_ref_to, tmp_ref_to,
1782 &block->ref_to_list) {
1783 struct btrfsic_block_link *const l =
1784 list_entry(elem_ref_to,
1785 struct btrfsic_block_link,
1786 node_ref_to);
1787
1788 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1789 btrfsic_print_rem_link(state, l);
1790 l->ref_cnt--;
1791 if (0 == l->ref_cnt) {
1792 list_del(&l->node_ref_to);
1793 list_del(&l->node_ref_from);
1794 btrfsic_block_link_hashtable_remove(l);
1795 btrfsic_block_link_free(l);
1796 }
1797 }
1798
1799 if (block->is_superblock)
1800 ret = btrfsic_map_superblock(state, bytenr, len,
1801 bdev, &block_ctx);
1802 else
1803 ret = btrfsic_map_block(state, bytenr, len,
1804 &block_ctx, 0);
1805 if (ret) {
1806 printk(KERN_INFO
1807 "btrfsic: btrfsic_map_block(root @%llu)"
1808 " failed!\n", (unsigned long long)bytenr);
1809 return;
1810 }
1811 block_ctx.data = mapped_data;
1812 /* the following is required in case of writes to mirrors,
 1813 * use the same device and offset that were used for the lookup */
1814 block_ctx.dev = dev_state;
1815 block_ctx.dev_bytenr = dev_bytenr;
1816
1817 if (is_metadata || state->include_extent_data) {
1818 block->never_written = 0;
1819 block->iodone_w_error = 0;
1820 if (NULL != bio) {
1821 block->is_iodone = 0;
1822 BUG_ON(NULL == bio_is_patched);
1823 if (!*bio_is_patched) {
1824 block->orig_bio_bh_private =
1825 bio->bi_private;
1826 block->orig_bio_bh_end_io.bio =
1827 bio->bi_end_io;
1828 block->next_in_same_bio = NULL;
1829 bio->bi_private = block;
1830 bio->bi_end_io = btrfsic_bio_end_io;
1831 *bio_is_patched = 1;
1832 } else {
1833 struct btrfsic_block *chained_block =
1834 (struct btrfsic_block *)
1835 bio->bi_private;
1836
1837 BUG_ON(NULL == chained_block);
1838 block->orig_bio_bh_private =
1839 chained_block->orig_bio_bh_private;
1840 block->orig_bio_bh_end_io.bio =
1841 chained_block->orig_bio_bh_end_io.
1842 bio;
1843 block->next_in_same_bio = chained_block;
1844 bio->bi_private = block;
1845 }
1846 } else if (NULL != bh) {
1847 block->is_iodone = 0;
1848 block->orig_bio_bh_private = bh->b_private;
1849 block->orig_bio_bh_end_io.bh = bh->b_end_io;
1850 block->next_in_same_bio = NULL;
1851 bh->b_private = block;
1852 bh->b_end_io = btrfsic_bh_end_io;
1853 } else {
1854 block->is_iodone = 1;
1855 block->orig_bio_bh_private = NULL;
1856 block->orig_bio_bh_end_io.bio = NULL;
1857 block->next_in_same_bio = NULL;
1858 }
1859 }
1860
1861 block->flush_gen = dev_state->last_flush_gen + 1;
1862 block->submit_bio_bh_rw = submit_bio_bh_rw;
1863 if (is_metadata) {
1864 block->logical_bytenr = bytenr;
1865 block->is_metadata = 1;
1866 if (block->is_superblock) {
1867 ret = btrfsic_process_written_superblock(
1868 state,
1869 block,
1870 (struct btrfs_super_block *)
1871 mapped_data);
1872 if (state->print_mask &
1873 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
1874 printk(KERN_INFO
1875 "[after new superblock is written]:\n");
1876 btrfsic_dump_tree_sub(state, block, 0);
1877 }
1878 } else {
1879 block->mirror_num = 0; /* unknown */
1880 ret = btrfsic_process_metablock(
1881 state,
1882 block,
1883 &block_ctx,
1884 (struct btrfs_header *)
1885 block_ctx.data,
1886 0, 0);
1887 }
1888 if (ret)
1889 printk(KERN_INFO
1890 "btrfsic: btrfsic_process_metablock"
1891 "(root @%llu) failed!\n",
1892 (unsigned long long)dev_bytenr);
1893 } else {
1894 block->is_metadata = 0;
1895 block->mirror_num = 0; /* unknown */
1896 block->generation = BTRFSIC_GENERATION_UNKNOWN;
1897 if (!state->include_extent_data
1898 && list_empty(&block->ref_from_list)) {
1899 /*
1900 * disk block is overwritten with extent
1901 * data (not meta data) and we are configured
1902 * to not include extent data: take the
1903 * chance and free the block's memory
1904 */
1905 btrfsic_block_hashtable_remove(block);
1906 list_del(&block->all_blocks_node);
1907 btrfsic_block_free(block);
1908 }
1909 }
1910 btrfsic_release_block_ctx(&block_ctx);
1911 } else {
1912 /* block has not been found in hash table */
1913 u64 bytenr;
1914
1915 if (!is_metadata) {
1916 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1917 printk(KERN_INFO "Written block (%s/%llu/?)"
1918 " !found in hash table, D.\n",
1919 dev_state->name,
1920 (unsigned long long)dev_bytenr);
1921 if (!state->include_extent_data)
1922 return; /* ignore that written D block */
1923
1924 /* this is getting ugly for the
1925 * include_extent_data case... */
1926 bytenr = 0; /* unknown */
1927 block_ctx.start = bytenr;
1928 block_ctx.len = len;
1929 block_ctx.bh = NULL;
1930 } else {
1931 bytenr = le64_to_cpu(((struct btrfs_header *)
1932 mapped_data)->bytenr);
1933 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
1934 dev_bytenr,
1935 mapped_data);
1936 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1937 printk(KERN_INFO
1938 "Written block @%llu (%s/%llu/?)"
1939 " !found in hash table, M.\n",
1940 (unsigned long long)bytenr,
1941 dev_state->name,
1942 (unsigned long long)dev_bytenr);
1943
1944 ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
1945 0);
1946 if (ret) {
1947 printk(KERN_INFO
1948 "btrfsic: btrfsic_map_block(root @%llu)"
1949 " failed!\n",
1950 (unsigned long long)dev_bytenr);
1951 return;
1952 }
1953 }
1954 block_ctx.data = mapped_data;
1955 /* the following is required in case of writes to mirrors,
 1956 * use the same device and offset that were used for the lookup */
1957 block_ctx.dev = dev_state;
1958 block_ctx.dev_bytenr = dev_bytenr;
1959
1960 block = btrfsic_block_alloc();
1961 if (NULL == block) {
1962 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1963 btrfsic_release_block_ctx(&block_ctx);
1964 return;
1965 }
1966 block->dev_state = dev_state;
1967 block->dev_bytenr = dev_bytenr;
1968 block->logical_bytenr = bytenr;
1969 block->is_metadata = is_metadata;
1970 block->never_written = 0;
1971 block->iodone_w_error = 0;
1972 block->mirror_num = 0; /* unknown */
1973 block->flush_gen = dev_state->last_flush_gen + 1;
1974 block->submit_bio_bh_rw = submit_bio_bh_rw;
1975 if (NULL != bio) {
1976 block->is_iodone = 0;
1977 BUG_ON(NULL == bio_is_patched);
1978 if (!*bio_is_patched) {
1979 block->orig_bio_bh_private = bio->bi_private;
1980 block->orig_bio_bh_end_io.bio = bio->bi_end_io;
1981 block->next_in_same_bio = NULL;
1982 bio->bi_private = block;
1983 bio->bi_end_io = btrfsic_bio_end_io;
1984 *bio_is_patched = 1;
1985 } else {
1986 struct btrfsic_block *chained_block =
1987 (struct btrfsic_block *)
1988 bio->bi_private;
1989
1990 BUG_ON(NULL == chained_block);
1991 block->orig_bio_bh_private =
1992 chained_block->orig_bio_bh_private;
1993 block->orig_bio_bh_end_io.bio =
1994 chained_block->orig_bio_bh_end_io.bio;
1995 block->next_in_same_bio = chained_block;
1996 bio->bi_private = block;
1997 }
1998 } else if (NULL != bh) {
1999 block->is_iodone = 0;
2000 block->orig_bio_bh_private = bh->b_private;
2001 block->orig_bio_bh_end_io.bh = bh->b_end_io;
2002 block->next_in_same_bio = NULL;
2003 bh->b_private = block;
2004 bh->b_end_io = btrfsic_bh_end_io;
2005 } else {
2006 block->is_iodone = 1;
2007 block->orig_bio_bh_private = NULL;
2008 block->orig_bio_bh_end_io.bio = NULL;
2009 block->next_in_same_bio = NULL;
2010 }
2011 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2012 printk(KERN_INFO
2013 "New written %c-block @%llu (%s/%llu/%d)\n",
2014 is_metadata ? 'M' : 'D',
2015 (unsigned long long)block->logical_bytenr,
2016 block->dev_state->name,
2017 (unsigned long long)block->dev_bytenr,
2018 block->mirror_num);
2019 list_add(&block->all_blocks_node, &state->all_blocks_list);
2020 btrfsic_block_hashtable_add(block, &state->block_hashtable);
2021
2022 if (is_metadata) {
2023 ret = btrfsic_process_metablock(state, block,
2024 &block_ctx,
2025 (struct btrfs_header *)
2026 block_ctx.data, 0, 0);
2027 if (ret)
2028 printk(KERN_INFO
2029 "btrfsic: process_metablock(root @%llu)"
2030 " failed!\n",
2031 (unsigned long long)dev_bytenr);
2032 }
2033 btrfsic_release_block_ctx(&block_ctx);
2034 }
2035}
2036
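/*
 * Replacement bio end_io handler: mark all blocks chained to this bio
 * as iodone, advance the device flush generation when a REQ_FLUSH
 * request completes, reset the per-block flush generation when REQ_FUA
 * completes (the data is then known to be on stable storage), and
 * finally call the original end_io handler.
 */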
2037static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
2038{
2039 struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
2040 int iodone_w_error;
2041
 2042 /* mutex is not held! This is not safe if IO has not yet completed
 2043 * on umount */
2044 iodone_w_error = 0;
2045 if (bio_error_status)
2046 iodone_w_error = 1;
2047
2048 BUG_ON(NULL == block);
2049 bp->bi_private = block->orig_bio_bh_private;
2050 bp->bi_end_io = block->orig_bio_bh_end_io.bio;
2051
2052 do {
2053 struct btrfsic_block *next_block;
2054 struct btrfsic_dev_state *const dev_state = block->dev_state;
2055
2056 if ((dev_state->state->print_mask &
2057 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2058 printk(KERN_INFO
2059 "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
2060 bio_error_status,
2061 btrfsic_get_block_type(dev_state->state, block),
2062 (unsigned long long)block->logical_bytenr,
2063 dev_state->name,
2064 (unsigned long long)block->dev_bytenr,
2065 block->mirror_num);
2066 next_block = block->next_in_same_bio;
2067 block->iodone_w_error = iodone_w_error;
2068 if (block->submit_bio_bh_rw & REQ_FLUSH) {
2069 dev_state->last_flush_gen++;
2070 if ((dev_state->state->print_mask &
2071 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2072 printk(KERN_INFO
2073 "bio_end_io() new %s flush_gen=%llu\n",
2074 dev_state->name,
2075 (unsigned long long)
2076 dev_state->last_flush_gen);
2077 }
2078 if (block->submit_bio_bh_rw & REQ_FUA)
2079 block->flush_gen = 0; /* FUA completed means block is
2080 * on disk */
2081 block->is_iodone = 1; /* for FLUSH, this releases the block */
2082 block = next_block;
2083 } while (NULL != block);
2084
2085 bp->bi_end_io(bp, bio_error_status);
2086}
2087
2088static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
2089{
2090 struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
2091 int iodone_w_error = !uptodate;
2092 struct btrfsic_dev_state *dev_state;
2093
2094 BUG_ON(NULL == block);
2095 dev_state = block->dev_state;
2096 if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2097 printk(KERN_INFO
2098 "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
2099 iodone_w_error,
2100 btrfsic_get_block_type(dev_state->state, block),
2101 (unsigned long long)block->logical_bytenr,
2102 block->dev_state->name,
2103 (unsigned long long)block->dev_bytenr,
2104 block->mirror_num);
2105
2106 block->iodone_w_error = iodone_w_error;
2107 if (block->submit_bio_bh_rw & REQ_FLUSH) {
2108 dev_state->last_flush_gen++;
2109 if ((dev_state->state->print_mask &
2110 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2111 printk(KERN_INFO
2112 "bh_end_io() new %s flush_gen=%llu\n",
2113 dev_state->name,
2114 (unsigned long long)dev_state->last_flush_gen);
2115 }
2116 if (block->submit_bio_bh_rw & REQ_FUA)
2117 block->flush_gen = 0; /* FUA completed means block is on disk */
2118
2119 bh->b_private = block->orig_bio_bh_private;
2120 bh->b_end_io = block->orig_bio_bh_end_io.bh;
2121 block->is_iodone = 1; /* for FLUSH, this releases the block */
2122 bh->b_end_io(bh, uptodate);
2123}
2124
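/*
 * A superblock was written: if its generation is newer than everything
 * seen so far, it becomes the latest superblock. Links from it to all
 * mirrors of the root, chunk and log tree roots are (re)created, and
 * afterwards the blocks it references are verified by
 * btrfsic_check_all_ref_blocks().
 */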
2125static int btrfsic_process_written_superblock(
2126 struct btrfsic_state *state,
2127 struct btrfsic_block *const superblock,
2128 struct btrfs_super_block *const super_hdr)
2129{
2130 int pass;
2131
2132 superblock->generation = btrfs_super_generation(super_hdr);
2133 if (!(superblock->generation > state->max_superblock_generation ||
2134 0 == state->max_superblock_generation)) {
2135 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
2136 printk(KERN_INFO
2137 "btrfsic: superblock @%llu (%s/%llu/%d)"
2138 " with old gen %llu <= %llu\n",
2139 (unsigned long long)superblock->logical_bytenr,
2140 superblock->dev_state->name,
2141 (unsigned long long)superblock->dev_bytenr,
2142 superblock->mirror_num,
2143 (unsigned long long)
2144 btrfs_super_generation(super_hdr),
2145 (unsigned long long)
2146 state->max_superblock_generation);
2147 } else {
2148 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
2149 printk(KERN_INFO
2150 "btrfsic: got new superblock @%llu (%s/%llu/%d)"
2151 " with new gen %llu > %llu\n",
2152 (unsigned long long)superblock->logical_bytenr,
2153 superblock->dev_state->name,
2154 (unsigned long long)superblock->dev_bytenr,
2155 superblock->mirror_num,
2156 (unsigned long long)
2157 btrfs_super_generation(super_hdr),
2158 (unsigned long long)
2159 state->max_superblock_generation);
2160
2161 state->max_superblock_generation =
2162 btrfs_super_generation(super_hdr);
2163 state->latest_superblock = superblock;
2164 }
2165
2166 for (pass = 0; pass < 3; pass++) {
2167 int ret;
2168 u64 next_bytenr;
2169 struct btrfsic_block *next_block;
2170 struct btrfsic_block_data_ctx tmp_next_block_ctx;
2171 struct btrfsic_block_link *l;
2172 int num_copies;
2173 int mirror_num;
2174 const char *additional_string = NULL;
2175 struct btrfs_disk_key tmp_disk_key;
2176
2177 tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
2178 tmp_disk_key.offset = 0;
2179
2180 switch (pass) {
2181 case 0:
2182 tmp_disk_key.objectid =
2183 cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
2184 additional_string = "root ";
2185 next_bytenr = btrfs_super_root(super_hdr);
2186 if (state->print_mask &
2187 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2188 printk(KERN_INFO "root@%llu\n",
2189 (unsigned long long)next_bytenr);
2190 break;
2191 case 1:
2192 tmp_disk_key.objectid =
2193 cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
2194 additional_string = "chunk ";
2195 next_bytenr = btrfs_super_chunk_root(super_hdr);
2196 if (state->print_mask &
2197 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2198 printk(KERN_INFO "chunk@%llu\n",
2199 (unsigned long long)next_bytenr);
2200 break;
2201 case 2:
2202 tmp_disk_key.objectid =
2203 cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
2204 additional_string = "log ";
2205 next_bytenr = btrfs_super_log_root(super_hdr);
2206 if (0 == next_bytenr)
2207 continue;
2208 if (state->print_mask &
2209 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2210 printk(KERN_INFO "log@%llu\n",
2211 (unsigned long long)next_bytenr);
2212 break;
2213 }
2214
2215 num_copies =
2216 btrfs_num_copies(&state->root->fs_info->mapping_tree,
2217 next_bytenr, PAGE_SIZE);
2218 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
2219 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
2220 (unsigned long long)next_bytenr, num_copies);
2221 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2222 int was_created;
2223
2224 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2225 printk(KERN_INFO
2226 "btrfsic_process_written_superblock("
2227 "mirror_num=%d)\n", mirror_num);
2228 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
2229 &tmp_next_block_ctx,
2230 mirror_num);
2231 if (ret) {
2232 printk(KERN_INFO
2233 "btrfsic: btrfsic_map_block(@%llu,"
2234 " mirror=%d) failed!\n",
2235 (unsigned long long)next_bytenr,
2236 mirror_num);
2237 return -1;
2238 }
2239
2240 next_block = btrfsic_block_lookup_or_add(
2241 state,
2242 &tmp_next_block_ctx,
2243 additional_string,
2244 1, 0, 1,
2245 mirror_num,
2246 &was_created);
2247 if (NULL == next_block) {
2248 printk(KERN_INFO
2249 "btrfsic: error, kmalloc failed!\n");
2250 btrfsic_release_block_ctx(&tmp_next_block_ctx);
2251 return -1;
2252 }
2253
2254 next_block->disk_key = tmp_disk_key;
2255 if (was_created)
2256 next_block->generation =
2257 BTRFSIC_GENERATION_UNKNOWN;
2258 l = btrfsic_block_link_lookup_or_add(
2259 state,
2260 &tmp_next_block_ctx,
2261 next_block,
2262 superblock,
2263 BTRFSIC_GENERATION_UNKNOWN);
2264 btrfsic_release_block_ctx(&tmp_next_block_ctx);
2265 if (NULL == l)
2266 return -1;
2267 }
2268 }
2269
2270 if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
2271 WARN_ON(1);
2272 btrfsic_dump_tree(state);
2273 }
2274
2275 return 0;
2276}
2277
2278static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
2279 struct btrfsic_block *const block,
2280 int recursion_level)
2281{
2282 struct list_head *elem_ref_to;
2283 int ret = 0;
2284
2285 if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
2286 /*
2287 * Note that this situation can happen and does not
2288 * indicate an error in regular cases. It happens
2289 * when disk blocks are freed and later reused.
2290 * The check-integrity module is not aware of any
2291 * block free operations, it just recognizes block
2292 * write operations. Therefore it keeps the linkage
2293 * information for a block until a block is
2294 * rewritten. This can temporarily cause incorrect
2295	 * and even circular linkage information. This
2296 * causes no harm unless such blocks are referenced
2297 * by the most recent super block.
2298 */
2299 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2300 printk(KERN_INFO
2301 "btrfsic: abort cyclic linkage (case 1).\n");
2302
2303 return ret;
2304 }
2305
2306 /*
2307	 * This algorithm may safely recurse because the amount of stack
2308	 * space used is very small and the max recursion depth is limited.
2309 */
2310 list_for_each(elem_ref_to, &block->ref_to_list) {
2311 const struct btrfsic_block_link *const l =
2312 list_entry(elem_ref_to, struct btrfsic_block_link,
2313 node_ref_to);
2314
2315 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2316 printk(KERN_INFO
2317 "rl=%d, %c @%llu (%s/%llu/%d)"
2318 " %u* refers to %c @%llu (%s/%llu/%d)\n",
2319 recursion_level,
2320 btrfsic_get_block_type(state, block),
2321 (unsigned long long)block->logical_bytenr,
2322 block->dev_state->name,
2323 (unsigned long long)block->dev_bytenr,
2324 block->mirror_num,
2325 l->ref_cnt,
2326 btrfsic_get_block_type(state, l->block_ref_to),
2327 (unsigned long long)
2328 l->block_ref_to->logical_bytenr,
2329 l->block_ref_to->dev_state->name,
2330 (unsigned long long)l->block_ref_to->dev_bytenr,
2331 l->block_ref_to->mirror_num);
2332 if (l->block_ref_to->never_written) {
2333 printk(KERN_INFO "btrfs: attempt to write superblock"
2334 " which references block %c @%llu (%s/%llu/%d)"
2335 " which is never written!\n",
2336 btrfsic_get_block_type(state, l->block_ref_to),
2337 (unsigned long long)
2338 l->block_ref_to->logical_bytenr,
2339 l->block_ref_to->dev_state->name,
2340 (unsigned long long)l->block_ref_to->dev_bytenr,
2341 l->block_ref_to->mirror_num);
2342 ret = -1;
2343 } else if (!l->block_ref_to->is_iodone) {
2344 printk(KERN_INFO "btrfs: attempt to write superblock"
2345 " which references block %c @%llu (%s/%llu/%d)"
2346 " which is not yet iodone!\n",
2347 btrfsic_get_block_type(state, l->block_ref_to),
2348 (unsigned long long)
2349 l->block_ref_to->logical_bytenr,
2350 l->block_ref_to->dev_state->name,
2351 (unsigned long long)l->block_ref_to->dev_bytenr,
2352 l->block_ref_to->mirror_num);
2353 ret = -1;
2354 } else if (l->parent_generation !=
2355 l->block_ref_to->generation &&
2356 BTRFSIC_GENERATION_UNKNOWN !=
2357 l->parent_generation &&
2358 BTRFSIC_GENERATION_UNKNOWN !=
2359 l->block_ref_to->generation) {
2360 printk(KERN_INFO "btrfs: attempt to write superblock"
2361 " which references block %c @%llu (%s/%llu/%d)"
2362 " with generation %llu !="
2363 " parent generation %llu!\n",
2364 btrfsic_get_block_type(state, l->block_ref_to),
2365 (unsigned long long)
2366 l->block_ref_to->logical_bytenr,
2367 l->block_ref_to->dev_state->name,
2368 (unsigned long long)l->block_ref_to->dev_bytenr,
2369 l->block_ref_to->mirror_num,
2370 (unsigned long long)l->block_ref_to->generation,
2371 (unsigned long long)l->parent_generation);
2372 ret = -1;
2373 } else if (l->block_ref_to->flush_gen >
2374 l->block_ref_to->dev_state->last_flush_gen) {
2375 printk(KERN_INFO "btrfs: attempt to write superblock"
2376 " which references block %c @%llu (%s/%llu/%d)"
2377 " which is not flushed out of disk's write cache"
2378 " (block flush_gen=%llu,"
2379 " dev->flush_gen=%llu)!\n",
2380 btrfsic_get_block_type(state, l->block_ref_to),
2381 (unsigned long long)
2382 l->block_ref_to->logical_bytenr,
2383 l->block_ref_to->dev_state->name,
2384 (unsigned long long)l->block_ref_to->dev_bytenr,
2385 l->block_ref_to->mirror_num,
2386 (unsigned long long)block->flush_gen,
2387 (unsigned long long)
2388 l->block_ref_to->dev_state->last_flush_gen);
2389 ret = -1;
2390 } else if (-1 == btrfsic_check_all_ref_blocks(state,
2391 l->block_ref_to,
2392 recursion_level +
2393 1)) {
2394 ret = -1;
2395 }
2396 }
2397
2398 return ret;
2399}
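The loop above rejects a superblock write for four reasons, but the conditions are spread across the printk branches. The same logic condensed into one predicate, as a readability sketch (not part of the patch; it only reuses the fields already referenced above):

static int demo_superblock_ref_unsafe(const struct btrfsic_block *ref,
				      u64 parent_gen)
{
	return ref->never_written ||			/* 1: never written to disk */
	       !ref->is_iodone ||			/* 2: write still in flight */
	       (parent_gen != ref->generation &&	/* 3: generation mismatch ... */
		parent_gen != BTRFSIC_GENERATION_UNKNOWN &&
		ref->generation != BTRFSIC_GENERATION_UNKNOWN) ||
	       ref->flush_gen > ref->dev_state->last_flush_gen; /* 4: not yet flushed */
}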
2400
2401static int btrfsic_is_block_ref_by_superblock(
2402 const struct btrfsic_state *state,
2403 const struct btrfsic_block *block,
2404 int recursion_level)
2405{
2406 struct list_head *elem_ref_from;
2407
2408 if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
2409 /* refer to comment at "abort cyclic linkage (case 1)" */
2410 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2411 printk(KERN_INFO
2412 "btrfsic: abort cyclic linkage (case 2).\n");
2413
2414 return 0;
2415 }
2416
2417 /*
2418	 * This algorithm may safely recurse because the amount of stack space
2419	 * used is very small and the max recursion depth is limited.
2420 */
2421 list_for_each(elem_ref_from, &block->ref_from_list) {
2422 const struct btrfsic_block_link *const l =
2423 list_entry(elem_ref_from, struct btrfsic_block_link,
2424 node_ref_from);
2425
2426 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2427 printk(KERN_INFO
2428 "rl=%d, %c @%llu (%s/%llu/%d)"
2429 " is ref %u* from %c @%llu (%s/%llu/%d)\n",
2430 recursion_level,
2431 btrfsic_get_block_type(state, block),
2432 (unsigned long long)block->logical_bytenr,
2433 block->dev_state->name,
2434 (unsigned long long)block->dev_bytenr,
2435 block->mirror_num,
2436 l->ref_cnt,
2437 btrfsic_get_block_type(state, l->block_ref_from),
2438 (unsigned long long)
2439 l->block_ref_from->logical_bytenr,
2440 l->block_ref_from->dev_state->name,
2441 (unsigned long long)
2442 l->block_ref_from->dev_bytenr,
2443 l->block_ref_from->mirror_num);
2444 if (l->block_ref_from->is_superblock &&
2445 state->latest_superblock->dev_bytenr ==
2446 l->block_ref_from->dev_bytenr &&
2447 state->latest_superblock->dev_state->bdev ==
2448 l->block_ref_from->dev_state->bdev)
2449 return 1;
2450 else if (btrfsic_is_block_ref_by_superblock(state,
2451 l->block_ref_from,
2452 recursion_level +
2453 1))
2454 return 1;
2455 }
2456
2457 return 0;
2458}
2459
2460static void btrfsic_print_add_link(const struct btrfsic_state *state,
2461 const struct btrfsic_block_link *l)
2462{
2463 printk(KERN_INFO
2464 "Add %u* link from %c @%llu (%s/%llu/%d)"
2465 " to %c @%llu (%s/%llu/%d).\n",
2466 l->ref_cnt,
2467 btrfsic_get_block_type(state, l->block_ref_from),
2468 (unsigned long long)l->block_ref_from->logical_bytenr,
2469 l->block_ref_from->dev_state->name,
2470 (unsigned long long)l->block_ref_from->dev_bytenr,
2471 l->block_ref_from->mirror_num,
2472 btrfsic_get_block_type(state, l->block_ref_to),
2473 (unsigned long long)l->block_ref_to->logical_bytenr,
2474 l->block_ref_to->dev_state->name,
2475 (unsigned long long)l->block_ref_to->dev_bytenr,
2476 l->block_ref_to->mirror_num);
2477}
2478
2479static void btrfsic_print_rem_link(const struct btrfsic_state *state,
2480 const struct btrfsic_block_link *l)
2481{
2482 printk(KERN_INFO
2483 "Rem %u* link from %c @%llu (%s/%llu/%d)"
2484 " to %c @%llu (%s/%llu/%d).\n",
2485 l->ref_cnt,
2486 btrfsic_get_block_type(state, l->block_ref_from),
2487 (unsigned long long)l->block_ref_from->logical_bytenr,
2488 l->block_ref_from->dev_state->name,
2489 (unsigned long long)l->block_ref_from->dev_bytenr,
2490 l->block_ref_from->mirror_num,
2491 btrfsic_get_block_type(state, l->block_ref_to),
2492 (unsigned long long)l->block_ref_to->logical_bytenr,
2493 l->block_ref_to->dev_state->name,
2494 (unsigned long long)l->block_ref_to->dev_bytenr,
2495 l->block_ref_to->mirror_num);
2496}
2497
2498static char btrfsic_get_block_type(const struct btrfsic_state *state,
2499 const struct btrfsic_block *block)
2500{
2501 if (block->is_superblock &&
2502 state->latest_superblock->dev_bytenr == block->dev_bytenr &&
2503 state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
2504 return 'S';
2505 else if (block->is_superblock)
2506 return 's';
2507 else if (block->is_metadata)
2508 return 'M';
2509 else
2510 return 'D';
2511}
2512
2513static void btrfsic_dump_tree(const struct btrfsic_state *state)
2514{
2515 btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
2516}
2517
2518static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
2519 const struct btrfsic_block *block,
2520 int indent_level)
2521{
2522 struct list_head *elem_ref_to;
2523 int indent_add;
2524 static char buf[80];
2525 int cursor_position;
2526
2527 /*
2528	 * It would be better to fill an on-stack buffer with a complete line
2529	 * and dump it all at once when it is time to print a newline character.
2530 */
2531
2532 /*
2533	 * This algorithm may safely recurse because the amount of stack space
2534	 * used is very small and the max recursion depth is limited.
2535 */
2536 indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
2537 btrfsic_get_block_type(state, block),
2538 (unsigned long long)block->logical_bytenr,
2539 block->dev_state->name,
2540 (unsigned long long)block->dev_bytenr,
2541 block->mirror_num);
2542 if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
2543 printk("[...]\n");
2544 return;
2545 }
2546 printk(buf);
2547 indent_level += indent_add;
2548 if (list_empty(&block->ref_to_list)) {
2549 printk("\n");
2550 return;
2551 }
2552 if (block->mirror_num > 1 &&
2553 !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
2554 printk(" [...]\n");
2555 return;
2556 }
2557
2558 cursor_position = indent_level;
2559 list_for_each(elem_ref_to, &block->ref_to_list) {
2560 const struct btrfsic_block_link *const l =
2561 list_entry(elem_ref_to, struct btrfsic_block_link,
2562 node_ref_to);
2563
2564 while (cursor_position < indent_level) {
2565 printk(" ");
2566 cursor_position++;
2567 }
2568 if (l->ref_cnt > 1)
2569 indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
2570 else
2571 indent_add = sprintf(buf, " --> ");
2572 if (indent_level + indent_add >
2573 BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
2574 printk("[...]\n");
2575 cursor_position = 0;
2576 continue;
2577 }
2578
2579 printk(buf);
2580
2581 btrfsic_dump_tree_sub(state, l->block_ref_to,
2582 indent_level + indent_add);
2583 cursor_position = 0;
2584 }
2585}
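For orientation, each block in the dump is printed with the "%c-%llu(%s/%llu/%d)" format above (type character from btrfsic_get_block_type(), logical bytenr, device name, device bytenr, mirror number), and references are chained with " --> ". A hypothetical single-reference chain, with every value invented purely for illustration, would look like:

S-65536(sdb1/65536/1) --> M-29360128(sdb1/29372416/1) --> D-12582912(sdb1/12599296/1)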
2586
2587static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
2588 struct btrfsic_state *state,
2589 struct btrfsic_block_data_ctx *next_block_ctx,
2590 struct btrfsic_block *next_block,
2591 struct btrfsic_block *from_block,
2592 u64 parent_generation)
2593{
2594 struct btrfsic_block_link *l;
2595
2596 l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
2597 next_block_ctx->dev_bytenr,
2598 from_block->dev_state->bdev,
2599 from_block->dev_bytenr,
2600 &state->block_link_hashtable);
2601 if (NULL == l) {
2602 l = btrfsic_block_link_alloc();
2603 if (NULL == l) {
2604 printk(KERN_INFO
2605 "btrfsic: error, kmalloc" " failed!\n");
2606 return NULL;
2607 }
2608
2609 l->block_ref_to = next_block;
2610 l->block_ref_from = from_block;
2611 l->ref_cnt = 1;
2612 l->parent_generation = parent_generation;
2613
2614 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2615 btrfsic_print_add_link(state, l);
2616
2617 list_add(&l->node_ref_to, &from_block->ref_to_list);
2618 list_add(&l->node_ref_from, &next_block->ref_from_list);
2619
2620 btrfsic_block_link_hashtable_add(l,
2621 &state->block_link_hashtable);
2622 } else {
2623 l->ref_cnt++;
2624 l->parent_generation = parent_generation;
2625 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2626 btrfsic_print_add_link(state, l);
2627 }
2628
2629 return l;
2630}
2631
2632static struct btrfsic_block *btrfsic_block_lookup_or_add(
2633 struct btrfsic_state *state,
2634 struct btrfsic_block_data_ctx *block_ctx,
2635 const char *additional_string,
2636 int is_metadata,
2637 int is_iodone,
2638 int never_written,
2639 int mirror_num,
2640 int *was_created)
2641{
2642 struct btrfsic_block *block;
2643
2644 block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
2645 block_ctx->dev_bytenr,
2646 &state->block_hashtable);
2647 if (NULL == block) {
2648 struct btrfsic_dev_state *dev_state;
2649
2650 block = btrfsic_block_alloc();
2651 if (NULL == block) {
2652 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
2653 return NULL;
2654 }
2655 dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
2656 if (NULL == dev_state) {
2657 printk(KERN_INFO
2658 "btrfsic: error, lookup dev_state failed!\n");
2659 btrfsic_block_free(block);
2660 return NULL;
2661 }
2662 block->dev_state = dev_state;
2663 block->dev_bytenr = block_ctx->dev_bytenr;
2664 block->logical_bytenr = block_ctx->start;
2665 block->is_metadata = is_metadata;
2666 block->is_iodone = is_iodone;
2667 block->never_written = never_written;
2668 block->mirror_num = mirror_num;
2669 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2670 printk(KERN_INFO
2671 "New %s%c-block @%llu (%s/%llu/%d)\n",
2672 additional_string,
2673 btrfsic_get_block_type(state, block),
2674 (unsigned long long)block->logical_bytenr,
2675 dev_state->name,
2676 (unsigned long long)block->dev_bytenr,
2677 mirror_num);
2678 list_add(&block->all_blocks_node, &state->all_blocks_list);
2679 btrfsic_block_hashtable_add(block, &state->block_hashtable);
2680 if (NULL != was_created)
2681 *was_created = 1;
2682 } else {
2683 if (NULL != was_created)
2684 *was_created = 0;
2685 }
2686
2687 return block;
2688}
2689
2690static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2691 u64 bytenr,
2692 struct btrfsic_dev_state *dev_state,
2693 u64 dev_bytenr, char *data)
2694{
2695 int num_copies;
2696 int mirror_num;
2697 int ret;
2698 struct btrfsic_block_data_ctx block_ctx;
2699 int match = 0;
2700
2701 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
2702 bytenr, PAGE_SIZE);
2703
2704 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2705 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
2706 &block_ctx, mirror_num);
2707 if (ret) {
2708 printk(KERN_INFO "btrfsic:"
2709 " btrfsic_map_block(logical @%llu,"
2710 " mirror %d) failed!\n",
2711 (unsigned long long)bytenr, mirror_num);
2712 continue;
2713 }
2714
2715 if (dev_state->bdev == block_ctx.dev->bdev &&
2716 dev_bytenr == block_ctx.dev_bytenr) {
2717 match++;
2718 btrfsic_release_block_ctx(&block_ctx);
2719 break;
2720 }
2721 btrfsic_release_block_ctx(&block_ctx);
2722 }
2723
2724 if (!match) {
2725 printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
2726 " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
2727 " phys_bytenr=%llu)!\n",
2728 (unsigned long long)bytenr, dev_state->name,
2729 (unsigned long long)dev_bytenr);
2730 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2731 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
2732 &block_ctx, mirror_num);
2733 if (ret)
2734 continue;
2735
2736 printk(KERN_INFO "Read logical bytenr @%llu maps to"
2737 " (%s/%llu/%d)\n",
2738 (unsigned long long)bytenr,
2739 block_ctx.dev->name,
2740 (unsigned long long)block_ctx.dev_bytenr,
2741 mirror_num);
2742 }
2743 WARN_ON(1);
2744 }
2745}
2746
2747static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
2748 struct block_device *bdev)
2749{
2750 struct btrfsic_dev_state *ds;
2751
2752 ds = btrfsic_dev_state_hashtable_lookup(bdev,
2753 &btrfsic_dev_state_hashtable);
2754 return ds;
2755}
2756
2757int btrfsic_submit_bh(int rw, struct buffer_head *bh)
2758{
2759 struct btrfsic_dev_state *dev_state;
2760
2761 if (!btrfsic_is_initialized)
2762 return submit_bh(rw, bh);
2763
2764 mutex_lock(&btrfsic_mutex);
2765 /* since btrfsic_submit_bh() might also be called before
2766 * btrfsic_mount(), this might return NULL */
2767 dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
2768
2769 /* Only called to write the superblock (incl. FLUSH/FUA) */
2770 if (NULL != dev_state &&
2771 (rw & WRITE) && bh->b_size > 0) {
2772 u64 dev_bytenr;
2773
2774 dev_bytenr = 4096 * bh->b_blocknr;
2775 if (dev_state->state->print_mask &
2776 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2777 printk(KERN_INFO
2778 "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
2779 " size=%lu, data=%p, bdev=%p)\n",
2780 rw, bh->b_blocknr,
2781 (unsigned long long)dev_bytenr, bh->b_size,
2782 bh->b_data, bh->b_bdev);
2783 btrfsic_process_written_block(dev_state, dev_bytenr,
2784 bh->b_data, bh->b_size, NULL,
2785 NULL, bh, rw);
2786 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2787 if (dev_state->state->print_mask &
2788 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2789 printk(KERN_INFO
2790 "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n",
2791 rw, bh->b_bdev);
2792 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2793 if ((dev_state->state->print_mask &
2794 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2795 BTRFSIC_PRINT_MASK_VERBOSE)))
2796 printk(KERN_INFO
2797 "btrfsic_submit_bh(%s) with FLUSH"
2798 " but dummy block already in use"
2799 " (ignored)!\n",
2800 dev_state->name);
2801 } else {
2802 struct btrfsic_block *const block =
2803 &dev_state->dummy_block_for_bio_bh_flush;
2804
2805 block->is_iodone = 0;
2806 block->never_written = 0;
2807 block->iodone_w_error = 0;
2808 block->flush_gen = dev_state->last_flush_gen + 1;
2809 block->submit_bio_bh_rw = rw;
2810 block->orig_bio_bh_private = bh->b_private;
2811 block->orig_bio_bh_end_io.bh = bh->b_end_io;
2812 block->next_in_same_bio = NULL;
2813 bh->b_private = block;
2814 bh->b_end_io = btrfsic_bh_end_io;
2815 }
2816 }
2817 mutex_unlock(&btrfsic_mutex);
2818 return submit_bh(rw, bh);
2819}
2820
2821void btrfsic_submit_bio(int rw, struct bio *bio)
2822{
2823 struct btrfsic_dev_state *dev_state;
2824
2825 if (!btrfsic_is_initialized) {
2826 submit_bio(rw, bio);
2827 return;
2828 }
2829
2830 mutex_lock(&btrfsic_mutex);
2831 /* since btrfsic_submit_bio() is also called before
2832 * btrfsic_mount(), this might return NULL */
2833 dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
2834 if (NULL != dev_state &&
2835 (rw & WRITE) && NULL != bio->bi_io_vec) {
2836 unsigned int i;
2837 u64 dev_bytenr;
2838 int bio_is_patched;
2839
2840 dev_bytenr = 512 * bio->bi_sector;
2841 bio_is_patched = 0;
2842 if (dev_state->state->print_mask &
2843 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2844 printk(KERN_INFO
2845 "submit_bio(rw=0x%x, bi_vcnt=%u,"
2846 " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
2847 rw, bio->bi_vcnt, bio->bi_sector,
2848 (unsigned long long)dev_bytenr,
2849 bio->bi_bdev);
2850
2851 for (i = 0; i < bio->bi_vcnt; i++) {
2852 u8 *mapped_data;
2853
2854 mapped_data = kmap(bio->bi_io_vec[i].bv_page);
2855 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2856 BTRFSIC_PRINT_MASK_VERBOSE) ==
2857 (dev_state->state->print_mask &
2858 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2859 BTRFSIC_PRINT_MASK_VERBOSE)))
2860 printk(KERN_INFO
2861 "#%u: page=%p, mapped=%p, len=%u,"
2862 " offset=%u\n",
2863 i, bio->bi_io_vec[i].bv_page,
2864 mapped_data,
2865 bio->bi_io_vec[i].bv_len,
2866 bio->bi_io_vec[i].bv_offset);
2867 btrfsic_process_written_block(dev_state, dev_bytenr,
2868 mapped_data,
2869 bio->bi_io_vec[i].bv_len,
2870 bio, &bio_is_patched,
2871 NULL, rw);
2872 kunmap(bio->bi_io_vec[i].bv_page);
2873 dev_bytenr += bio->bi_io_vec[i].bv_len;
2874 }
2875 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2876 if (dev_state->state->print_mask &
2877 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2878 printk(KERN_INFO
2879 "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n",
2880 rw, bio->bi_bdev);
2881 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2882 if ((dev_state->state->print_mask &
2883 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2884 BTRFSIC_PRINT_MASK_VERBOSE)))
2885 printk(KERN_INFO
2886 "btrfsic_submit_bio(%s) with FLUSH"
2887 " but dummy block already in use"
2888 " (ignored)!\n",
2889 dev_state->name);
2890 } else {
2891 struct btrfsic_block *const block =
2892 &dev_state->dummy_block_for_bio_bh_flush;
2893
2894 block->is_iodone = 0;
2895 block->never_written = 0;
2896 block->iodone_w_error = 0;
2897 block->flush_gen = dev_state->last_flush_gen + 1;
2898 block->submit_bio_bh_rw = rw;
2899 block->orig_bio_bh_private = bio->bi_private;
2900 block->orig_bio_bh_end_io.bio = bio->bi_end_io;
2901 block->next_in_same_bio = NULL;
2902 bio->bi_private = block;
2903 bio->bi_end_io = btrfsic_bio_end_io;
2904 }
2905 }
2906 mutex_unlock(&btrfsic_mutex);
2907
2908 submit_bio(rw, bio);
2909}
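Note that the two wrappers compute the device byte offset differently: btrfsic_submit_bh() multiplies b_blocknr by the hard-coded 4096-byte block size, while btrfsic_submit_bio() multiplies bi_sector by the block layer's fixed 512-byte sector size. A tiny stand-alone check (the index values are invented) shows both arrive at the same offset for the same location:

#include <stdio.h>

int main(void)
{
	unsigned long long blocknr = 16;	/* hypothetical bh->b_blocknr (4 KiB units) */
	unsigned long long sector  = 128;	/* hypothetical bio->bi_sector (512 B units) */

	printf("bh  write starts at byte %llu\n", 4096ULL * blocknr);	/* 65536 */
	printf("bio write starts at byte %llu\n",  512ULL * sector);	/* 65536 */
	return 0;
}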
2910
2911int btrfsic_mount(struct btrfs_root *root,
2912 struct btrfs_fs_devices *fs_devices,
2913 int including_extent_data, u32 print_mask)
2914{
2915 int ret;
2916 struct btrfsic_state *state;
2917 struct list_head *dev_head = &fs_devices->devices;
2918 struct btrfs_device *device;
2919
2920 state = kzalloc(sizeof(*state), GFP_NOFS);
2921 if (NULL == state) {
2922 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
2923 return -1;
2924 }
2925
2926 if (!btrfsic_is_initialized) {
2927 mutex_init(&btrfsic_mutex);
2928 btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
2929 btrfsic_is_initialized = 1;
2930 }
2931 mutex_lock(&btrfsic_mutex);
2932 state->root = root;
2933 state->print_mask = print_mask;
2934 state->include_extent_data = including_extent_data;
2935 state->csum_size = 0;
2936 INIT_LIST_HEAD(&state->all_blocks_list);
2937 btrfsic_block_hashtable_init(&state->block_hashtable);
2938 btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
2939 state->max_superblock_generation = 0;
2940 state->latest_superblock = NULL;
2941
2942 list_for_each_entry(device, dev_head, dev_list) {
2943 struct btrfsic_dev_state *ds;
2944 char *p;
2945
2946 if (!device->bdev || !device->name)
2947 continue;
2948
2949 ds = btrfsic_dev_state_alloc();
2950 if (NULL == ds) {
2951 printk(KERN_INFO
2952 "btrfs check-integrity: kmalloc() failed!\n");
2953 mutex_unlock(&btrfsic_mutex);
2954 return -1;
2955 }
2956 ds->bdev = device->bdev;
2957 ds->state = state;
2958 bdevname(ds->bdev, ds->name);
2959 ds->name[BDEVNAME_SIZE - 1] = '\0';
2960 for (p = ds->name; *p != '\0'; p++);
2961 while (p > ds->name && *p != '/')
2962 p--;
2963 if (*p == '/')
2964 p++;
2965 strlcpy(ds->name, p, sizeof(ds->name));
2966 btrfsic_dev_state_hashtable_add(ds,
2967 &btrfsic_dev_state_hashtable);
2968 }
2969
2970 ret = btrfsic_process_superblock(state, fs_devices);
2971 if (0 != ret) {
2972 mutex_unlock(&btrfsic_mutex);
2973 btrfsic_unmount(root, fs_devices);
2974 return ret;
2975 }
2976
2977 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
2978 btrfsic_dump_database(state);
2979 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
2980 btrfsic_dump_tree(state);
2981
2982 mutex_unlock(&btrfsic_mutex);
2983 return 0;
2984}
2985
2986void btrfsic_unmount(struct btrfs_root *root,
2987 struct btrfs_fs_devices *fs_devices)
2988{
2989 struct list_head *elem_all;
2990 struct list_head *tmp_all;
2991 struct btrfsic_state *state;
2992 struct list_head *dev_head = &fs_devices->devices;
2993 struct btrfs_device *device;
2994
2995 if (!btrfsic_is_initialized)
2996 return;
2997
2998 mutex_lock(&btrfsic_mutex);
2999
3000 state = NULL;
3001 list_for_each_entry(device, dev_head, dev_list) {
3002 struct btrfsic_dev_state *ds;
3003
3004 if (!device->bdev || !device->name)
3005 continue;
3006
3007 ds = btrfsic_dev_state_hashtable_lookup(
3008 device->bdev,
3009 &btrfsic_dev_state_hashtable);
3010 if (NULL != ds) {
3011 state = ds->state;
3012 btrfsic_dev_state_hashtable_remove(ds);
3013 btrfsic_dev_state_free(ds);
3014 }
3015 }
3016
3017 if (NULL == state) {
3018 printk(KERN_INFO
3019 "btrfsic: error, cannot find state information"
3020 " on umount!\n");
3021 mutex_unlock(&btrfsic_mutex);
3022 return;
3023 }
3024
3025 /*
3026 * Don't care about keeping the lists' state up to date,
3027 * just free all memory that was allocated dynamically.
3028 * Free the blocks and the block_links.
3029 */
3030 list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
3031 struct btrfsic_block *const b_all =
3032 list_entry(elem_all, struct btrfsic_block,
3033 all_blocks_node);
3034 struct list_head *elem_ref_to;
3035 struct list_head *tmp_ref_to;
3036
3037 list_for_each_safe(elem_ref_to, tmp_ref_to,
3038 &b_all->ref_to_list) {
3039 struct btrfsic_block_link *const l =
3040 list_entry(elem_ref_to,
3041 struct btrfsic_block_link,
3042 node_ref_to);
3043
3044 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
3045 btrfsic_print_rem_link(state, l);
3046
3047 l->ref_cnt--;
3048 if (0 == l->ref_cnt)
3049 btrfsic_block_link_free(l);
3050 }
3051
3052 if (b_all->is_iodone)
3053 btrfsic_block_free(b_all);
3054 else
3055 printk(KERN_INFO "btrfs: attempt to free %c-block"
3056 " @%llu (%s/%llu/%d) on umount which is"
3057 " not yet iodone!\n",
3058 btrfsic_get_block_type(state, b_all),
3059 (unsigned long long)b_all->logical_bytenr,
3060 b_all->dev_state->name,
3061 (unsigned long long)b_all->dev_bytenr,
3062 b_all->mirror_num);
3063 }
3064
3065 mutex_unlock(&btrfsic_mutex);
3066
3067 kfree(state);
3068}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644
index 000000000000..8b59175cc502
--- /dev/null
+++ b/fs/btrfs/check-integrity.h
@@ -0,0 +1,36 @@
1/*
2 * Copyright (C) STRATO AG 2011. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#if !defined(__BTRFS_CHECK_INTEGRITY__)
20#define __BTRFS_CHECK_INTEGRITY__
21
22#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
23int btrfsic_submit_bh(int rw, struct buffer_head *bh);
24void btrfsic_submit_bio(int rw, struct bio *bio);
25#else
26#define btrfsic_submit_bh submit_bh
27#define btrfsic_submit_bio submit_bio
28#endif
29
30int btrfsic_mount(struct btrfs_root *root,
31 struct btrfs_fs_devices *fs_devices,
32 int including_extent_data, u32 print_mask);
33void btrfsic_unmount(struct btrfs_root *root,
34 struct btrfs_fs_devices *fs_devices);
35
36#endif
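The header is meant as a drop-in replacement: callers write btrfsic_submit_bh()/btrfsic_submit_bio() instead of the plain block layer helpers, and with CONFIG_BTRFS_FS_CHECK_INTEGRITY disabled the #defines above turn those calls straight back into submit_bh()/submit_bio(). A hedged usage sketch (not taken from the patch):

static void demo_submit_write(int rw, struct buffer_head *bh, struct bio *bio)
{
	if (bh)
		btrfsic_submit_bh(rw, bh);	/* checked when the option is enabled */
	else
		btrfsic_submit_bio(rw, bio);	/* same interception for bio-based writes */
}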
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dede441bdeee..0639a555e16e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
240 240
241 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 241 cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
242 new_root_objectid, &disk_key, level, 242 new_root_objectid, &disk_key, level,
243 buf->start, 0); 243 buf->start, 0, 1);
244 if (IS_ERR(cow)) 244 if (IS_ERR(cow))
245 return PTR_ERR(cow); 245 return PTR_ERR(cow);
246 246
@@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
261 261
262 WARN_ON(btrfs_header_generation(buf) > trans->transid); 262 WARN_ON(btrfs_header_generation(buf) > trans->transid);
263 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 263 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
264 ret = btrfs_inc_ref(trans, root, cow, 1); 264 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
265 else 265 else
266 ret = btrfs_inc_ref(trans, root, cow, 0); 266 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
267 267
268 if (ret) 268 if (ret)
269 return ret; 269 return ret;
@@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
350 if ((owner == root->root_key.objectid || 350 if ((owner == root->root_key.objectid ||
351 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && 351 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
352 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 352 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
353 ret = btrfs_inc_ref(trans, root, buf, 1); 353 ret = btrfs_inc_ref(trans, root, buf, 1, 1);
354 BUG_ON(ret); 354 BUG_ON(ret);
355 355
356 if (root->root_key.objectid == 356 if (root->root_key.objectid ==
357 BTRFS_TREE_RELOC_OBJECTID) { 357 BTRFS_TREE_RELOC_OBJECTID) {
358 ret = btrfs_dec_ref(trans, root, buf, 0); 358 ret = btrfs_dec_ref(trans, root, buf, 0, 1);
359 BUG_ON(ret); 359 BUG_ON(ret);
360 ret = btrfs_inc_ref(trans, root, cow, 1); 360 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
361 BUG_ON(ret); 361 BUG_ON(ret);
362 } 362 }
363 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 363 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
365 365
366 if (root->root_key.objectid == 366 if (root->root_key.objectid ==
367 BTRFS_TREE_RELOC_OBJECTID) 367 BTRFS_TREE_RELOC_OBJECTID)
368 ret = btrfs_inc_ref(trans, root, cow, 1); 368 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
369 else 369 else
370 ret = btrfs_inc_ref(trans, root, cow, 0); 370 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
371 BUG_ON(ret); 371 BUG_ON(ret);
372 } 372 }
373 if (new_flags != 0) { 373 if (new_flags != 0) {
@@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
381 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 381 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
382 if (root->root_key.objectid == 382 if (root->root_key.objectid ==
383 BTRFS_TREE_RELOC_OBJECTID) 383 BTRFS_TREE_RELOC_OBJECTID)
384 ret = btrfs_inc_ref(trans, root, cow, 1); 384 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
385 else 385 else
386 ret = btrfs_inc_ref(trans, root, cow, 0); 386 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
387 BUG_ON(ret); 387 BUG_ON(ret);
388 ret = btrfs_dec_ref(trans, root, buf, 1); 388 ret = btrfs_dec_ref(trans, root, buf, 1, 1);
389 BUG_ON(ret); 389 BUG_ON(ret);
390 } 390 }
391 clean_tree_block(trans, root, buf); 391 clean_tree_block(trans, root, buf);
@@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
446 446
447 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 447 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
448 root->root_key.objectid, &disk_key, 448 root->root_key.objectid, &disk_key,
449 level, search_start, empty_size); 449 level, search_start, empty_size, 1);
450 if (IS_ERR(cow)) 450 if (IS_ERR(cow))
451 return PTR_ERR(cow); 451 return PTR_ERR(cow);
452 452
@@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
484 rcu_assign_pointer(root->node, cow); 484 rcu_assign_pointer(root->node, cow);
485 485
486 btrfs_free_tree_block(trans, root, buf, parent_start, 486 btrfs_free_tree_block(trans, root, buf, parent_start,
487 last_ref); 487 last_ref, 1);
488 free_extent_buffer(buf); 488 free_extent_buffer(buf);
489 add_root_to_dirty_list(root); 489 add_root_to_dirty_list(root);
490 } else { 490 } else {
@@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
500 trans->transid); 500 trans->transid);
501 btrfs_mark_buffer_dirty(parent); 501 btrfs_mark_buffer_dirty(parent);
502 btrfs_free_tree_block(trans, root, buf, parent_start, 502 btrfs_free_tree_block(trans, root, buf, parent_start,
503 last_ref); 503 last_ref, 1);
504 } 504 }
505 if (unlock_orig) 505 if (unlock_orig)
506 btrfs_tree_unlock(buf); 506 btrfs_tree_unlock(buf);
@@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
957 free_extent_buffer(mid); 957 free_extent_buffer(mid);
958 958
959 root_sub_used(root, mid->len); 959 root_sub_used(root, mid->len);
960 btrfs_free_tree_block(trans, root, mid, 0, 1); 960 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
961 /* once for the root ptr */ 961 /* once for the root ptr */
962 free_extent_buffer(mid); 962 free_extent_buffer(mid);
963 return 0; 963 return 0;
@@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1015 if (wret) 1015 if (wret)
1016 ret = wret; 1016 ret = wret;
1017 root_sub_used(root, right->len); 1017 root_sub_used(root, right->len);
1018 btrfs_free_tree_block(trans, root, right, 0, 1); 1018 btrfs_free_tree_block(trans, root, right, 0, 1, 0);
1019 free_extent_buffer(right); 1019 free_extent_buffer(right);
1020 right = NULL; 1020 right = NULL;
1021 } else { 1021 } else {
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1055 if (wret) 1055 if (wret)
1056 ret = wret; 1056 ret = wret;
1057 root_sub_used(root, mid->len); 1057 root_sub_used(root, mid->len);
1058 btrfs_free_tree_block(trans, root, mid, 0, 1); 1058 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
1059 free_extent_buffer(mid); 1059 free_extent_buffer(mid);
1060 mid = NULL; 1060 mid = NULL;
1061 } else { 1061 } else {
@@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2089 2089
2090 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2090 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2091 root->root_key.objectid, &lower_key, 2091 root->root_key.objectid, &lower_key,
2092 level, root->node->start, 0); 2092 level, root->node->start, 0, 0);
2093 if (IS_ERR(c)) 2093 if (IS_ERR(c))
2094 return PTR_ERR(c); 2094 return PTR_ERR(c);
2095 2095
@@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2216 2216
2217 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2217 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2218 root->root_key.objectid, 2218 root->root_key.objectid,
2219 &disk_key, level, c->start, 0); 2219 &disk_key, level, c->start, 0, 0);
2220 if (IS_ERR(split)) 2220 if (IS_ERR(split))
2221 return PTR_ERR(split); 2221 return PTR_ERR(split);
2222 2222
@@ -2970,7 +2970,7 @@ again:
2970 2970
2971 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 2971 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2972 root->root_key.objectid, 2972 root->root_key.objectid,
2973 &disk_key, 0, l->start, 0); 2973 &disk_key, 0, l->start, 0, 0);
2974 if (IS_ERR(right)) 2974 if (IS_ERR(right))
2975 return PTR_ERR(right); 2975 return PTR_ERR(right);
2976 2976
@@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3781 3781
3782 root_sub_used(root, leaf->len); 3782 root_sub_used(root, leaf->len);
3783 3783
3784 btrfs_free_tree_block(trans, root, leaf, 0, 1); 3784 btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
3785 return 0; 3785 return 0;
3786} 3786}
3787/* 3787/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323d..27ebe61d3ccc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
86/* holds checksums of all the data extents */ 86/* holds checksums of all the data extents */
87#define BTRFS_CSUM_TREE_OBJECTID 7ULL 87#define BTRFS_CSUM_TREE_OBJECTID 7ULL
88 88
89/* for storing balance parameters in the root tree */
90#define BTRFS_BALANCE_OBJECTID -4ULL
91
89/* orhpan objectid for tracking unlinked/truncated files */ 92/* orhpan objectid for tracking unlinked/truncated files */
90#define BTRFS_ORPHAN_OBJECTID -5ULL 93#define BTRFS_ORPHAN_OBJECTID -5ULL
91 94
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
692 __le16 name_len; 695 __le16 name_len;
693} __attribute__ ((__packed__)); 696} __attribute__ ((__packed__));
694 697
698struct btrfs_disk_balance_args {
699 /*
700 * profiles to operate on, single is denoted by
701 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
702 */
703 __le64 profiles;
704
705 /* usage filter */
706 __le64 usage;
707
708 /* devid filter */
709 __le64 devid;
710
711 /* devid subset filter [pstart..pend) */
712 __le64 pstart;
713 __le64 pend;
714
715 /* btrfs virtual address space subset filter [vstart..vend) */
716 __le64 vstart;
717 __le64 vend;
718
719 /*
720 * profile to convert to, single is denoted by
721 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
722 */
723 __le64 target;
724
725 /* BTRFS_BALANCE_ARGS_* */
726 __le64 flags;
727
728 __le64 unused[8];
729} __attribute__ ((__packed__));
730
731/*
732 * store balance parameters to disk so that balance can be properly
733 * resumed after crash or unmount
734 */
735struct btrfs_balance_item {
736 /* BTRFS_BALANCE_* */
737 __le64 flags;
738
739 struct btrfs_disk_balance_args data;
740 struct btrfs_disk_balance_args meta;
741 struct btrfs_disk_balance_args sys;
742
743 __le64 unused[4];
744} __attribute__ ((__packed__));
745
695#define BTRFS_FILE_EXTENT_INLINE 0 746#define BTRFS_FILE_EXTENT_INLINE 0
696#define BTRFS_FILE_EXTENT_REG 1 747#define BTRFS_FILE_EXTENT_REG 1
697#define BTRFS_FILE_EXTENT_PREALLOC 2 748#define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +802,32 @@ struct btrfs_csum_item {
751} __attribute__ ((__packed__)); 802} __attribute__ ((__packed__));
752 803
753/* different types of block groups (and chunks) */ 804/* different types of block groups (and chunks) */
754#define BTRFS_BLOCK_GROUP_DATA (1 << 0) 805#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
755#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) 806#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
756#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) 807#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
757#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) 808#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
758#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 809#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
759#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 810#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
760#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 811#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
761#define BTRFS_NR_RAID_TYPES 5 812#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
813#define BTRFS_NR_RAID_TYPES 5
814
815#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
816 BTRFS_BLOCK_GROUP_SYSTEM | \
817 BTRFS_BLOCK_GROUP_METADATA)
818
819#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
820 BTRFS_BLOCK_GROUP_RAID1 | \
821 BTRFS_BLOCK_GROUP_DUP | \
822 BTRFS_BLOCK_GROUP_RAID10)
823/*
824 * We need a bit for restriper to be able to tell when chunks of type
825 * SINGLE are available. This "extended" profile format is used in
826 * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
827 * (on-disk). The corresponding on-disk bit in chunk.type is reserved
828 * to avoid remapping between the two formats in the future.
829 */
830#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
762 831
763struct btrfs_block_group_item { 832struct btrfs_block_group_item {
764 __le64 used; 833 __le64 used;
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache {
916struct reloc_control; 985struct reloc_control;
917struct btrfs_device; 986struct btrfs_device;
918struct btrfs_fs_devices; 987struct btrfs_fs_devices;
988struct btrfs_balance_control;
919struct btrfs_delayed_root; 989struct btrfs_delayed_root;
920struct btrfs_fs_info { 990struct btrfs_fs_info {
921 u8 fsid[BTRFS_FSID_SIZE]; 991 u8 fsid[BTRFS_FSID_SIZE];
@@ -971,7 +1041,7 @@ struct btrfs_fs_info {
971 * is required instead of the faster short fsync log commits 1041 * is required instead of the faster short fsync log commits
972 */ 1042 */
973 u64 last_trans_log_full_commit; 1043 u64 last_trans_log_full_commit;
974 unsigned long mount_opt:20; 1044 unsigned long mount_opt:21;
975 unsigned long compress_type:4; 1045 unsigned long compress_type:4;
976 u64 max_inline; 1046 u64 max_inline;
977 u64 alloc_start; 1047 u64 alloc_start;
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info {
1132 spinlock_t ref_cache_lock; 1202 spinlock_t ref_cache_lock;
1133 u64 total_ref_cache_size; 1203 u64 total_ref_cache_size;
1134 1204
1205 /*
1206 * these three are in extended format (availability of single
1207 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
1208 * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
1209 */
1135 u64 avail_data_alloc_bits; 1210 u64 avail_data_alloc_bits;
1136 u64 avail_metadata_alloc_bits; 1211 u64 avail_metadata_alloc_bits;
1137 u64 avail_system_alloc_bits; 1212 u64 avail_system_alloc_bits;
1138 u64 data_alloc_profile; 1213
1139 u64 metadata_alloc_profile; 1214 /* restriper state */
1140 u64 system_alloc_profile; 1215 spinlock_t balance_lock;
1216 struct mutex balance_mutex;
1217 atomic_t balance_running;
1218 atomic_t balance_pause_req;
1219 atomic_t balance_cancel_req;
1220 struct btrfs_balance_control *balance_ctl;
1221 wait_queue_head_t balance_wait_q;
1141 1222
1142 unsigned data_chunk_allocations; 1223 unsigned data_chunk_allocations;
1143 unsigned metadata_ratio; 1224 unsigned metadata_ratio;
@@ -1155,6 +1236,10 @@ struct btrfs_fs_info {
1155 int scrub_workers_refcnt; 1236 int scrub_workers_refcnt;
1156 struct btrfs_workers scrub_workers; 1237 struct btrfs_workers scrub_workers;
1157 1238
1239#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1240 u32 check_integrity_print_mask;
1241#endif
1242
1158 /* filesystem state */ 1243 /* filesystem state */
1159 u64 fs_state; 1244 u64 fs_state;
1160 1245
@@ -1383,6 +1468,8 @@ struct btrfs_ioctl_defrag_range_args {
1383#define BTRFS_DEV_ITEM_KEY 216 1468#define BTRFS_DEV_ITEM_KEY 216
1384#define BTRFS_CHUNK_ITEM_KEY 228 1469#define BTRFS_CHUNK_ITEM_KEY 228
1385 1470
1471#define BTRFS_BALANCE_ITEM_KEY 248
1472
1386/* 1473/*
1387 * string items are for debugging. They just store a short string of 1474 * string items are for debugging. They just store a short string of
1388 * data in the FS 1475 * data in the FS
@@ -1413,6 +1500,9 @@ struct btrfs_ioctl_defrag_range_args {
1413#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1500#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1414#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 1501#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1415#define BTRFS_MOUNT_RECOVERY (1 << 18) 1502#define BTRFS_MOUNT_RECOVERY (1 << 18)
1503#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
1504#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
1505#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
1416 1506
1417#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1507#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1418#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1508#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2077,8 +2167,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
2077BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, 2167BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
2078 num_devices, 64); 2168 num_devices, 64);
2079 2169
2080/* struct btrfs_super_block */ 2170/* struct btrfs_balance_item */
2171BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
2081 2172
2173static inline void btrfs_balance_data(struct extent_buffer *eb,
2174 struct btrfs_balance_item *bi,
2175 struct btrfs_disk_balance_args *ba)
2176{
2177 read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
2178}
2179
2180static inline void btrfs_set_balance_data(struct extent_buffer *eb,
2181 struct btrfs_balance_item *bi,
2182 struct btrfs_disk_balance_args *ba)
2183{
2184 write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
2185}
2186
2187static inline void btrfs_balance_meta(struct extent_buffer *eb,
2188 struct btrfs_balance_item *bi,
2189 struct btrfs_disk_balance_args *ba)
2190{
2191 read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
2192}
2193
2194static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
2195 struct btrfs_balance_item *bi,
2196 struct btrfs_disk_balance_args *ba)
2197{
2198 write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
2199}
2200
2201static inline void btrfs_balance_sys(struct extent_buffer *eb,
2202 struct btrfs_balance_item *bi,
2203 struct btrfs_disk_balance_args *ba)
2204{
2205 read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
2206}
2207
2208static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
2209 struct btrfs_balance_item *bi,
2210 struct btrfs_disk_balance_args *ba)
2211{
2212 write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
2213}
2214
2215static inline void
2216btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
2217 struct btrfs_disk_balance_args *disk)
2218{
2219 memset(cpu, 0, sizeof(*cpu));
2220
2221 cpu->profiles = le64_to_cpu(disk->profiles);
2222 cpu->usage = le64_to_cpu(disk->usage);
2223 cpu->devid = le64_to_cpu(disk->devid);
2224 cpu->pstart = le64_to_cpu(disk->pstart);
2225 cpu->pend = le64_to_cpu(disk->pend);
2226 cpu->vstart = le64_to_cpu(disk->vstart);
2227 cpu->vend = le64_to_cpu(disk->vend);
2228 cpu->target = le64_to_cpu(disk->target);
2229 cpu->flags = le64_to_cpu(disk->flags);
2230}
2231
2232static inline void
2233btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
2234 struct btrfs_balance_args *cpu)
2235{
2236 memset(disk, 0, sizeof(*disk));
2237
2238 disk->profiles = cpu_to_le64(cpu->profiles);
2239 disk->usage = cpu_to_le64(cpu->usage);
2240 disk->devid = cpu_to_le64(cpu->devid);
2241 disk->pstart = cpu_to_le64(cpu->pstart);
2242 disk->pend = cpu_to_le64(cpu->pend);
2243 disk->vstart = cpu_to_le64(cpu->vstart);
2244 disk->vend = cpu_to_le64(cpu->vend);
2245 disk->target = cpu_to_le64(cpu->target);
2246 disk->flags = cpu_to_le64(cpu->flags);
2247}
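The two inline helpers above convert between the little-endian on-disk layout and the CPU-order struct btrfs_balance_args (that struct is not part of this hunk; the sketch below assumes it mirrors the disk fields shown). A round trip would look like:

static void demo_balance_args_roundtrip(void)
{
	struct btrfs_balance_args cpu = {
		.profiles = BTRFS_BLOCK_GROUP_RAID1,	/* e.g. restripe RAID1 chunks only */
	};
	struct btrfs_disk_balance_args disk;

	btrfs_cpu_balance_args_to_disk(&disk, &cpu);	/* host order -> little endian */
	btrfs_disk_balance_args_to_cpu(&cpu, &disk);	/* and back again */
}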
2248
2249/* struct btrfs_super_block */
2082BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 2250BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
2083BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); 2251BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
2084BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, 2252BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2196,7 +2364,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
2196 return btrfs_item_size(eb, e) - offset; 2364 return btrfs_item_size(eb, e) - offset;
2197} 2365}
2198 2366
2199static inline struct btrfs_root *btrfs_sb(struct super_block *sb) 2367static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2200{ 2368{
2201 return sb->s_fs_info; 2369 return sb->s_fs_info;
2202} 2370}
@@ -2277,11 +2445,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2277 struct btrfs_root *root, u32 blocksize, 2445 struct btrfs_root *root, u32 blocksize,
2278 u64 parent, u64 root_objectid, 2446 u64 parent, u64 root_objectid,
2279 struct btrfs_disk_key *key, int level, 2447 struct btrfs_disk_key *key, int level,
2280 u64 hint, u64 empty_size); 2448 u64 hint, u64 empty_size, int for_cow);
2281void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2449void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
2282 struct btrfs_root *root, 2450 struct btrfs_root *root,
2283 struct extent_buffer *buf, 2451 struct extent_buffer *buf,
2284 u64 parent, int last_ref); 2452 u64 parent, int last_ref, int for_cow);
2285struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2453struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2286 struct btrfs_root *root, 2454 struct btrfs_root *root,
2287 u64 bytenr, u32 blocksize, 2455 u64 bytenr, u32 blocksize,
@@ -2301,17 +2469,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2301 u64 search_end, struct btrfs_key *ins, 2469 u64 search_end, struct btrfs_key *ins,
2302 u64 data); 2470 u64 data);
2303int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2471int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2304 struct extent_buffer *buf, int full_backref); 2472 struct extent_buffer *buf, int full_backref, int for_cow);
2305int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2473int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2306 struct extent_buffer *buf, int full_backref); 2474 struct extent_buffer *buf, int full_backref, int for_cow);
2307int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2475int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2308 struct btrfs_root *root, 2476 struct btrfs_root *root,
2309 u64 bytenr, u64 num_bytes, u64 flags, 2477 u64 bytenr, u64 num_bytes, u64 flags,
2310 int is_data); 2478 int is_data);
2311int btrfs_free_extent(struct btrfs_trans_handle *trans, 2479int btrfs_free_extent(struct btrfs_trans_handle *trans,
2312 struct btrfs_root *root, 2480 struct btrfs_root *root,
2313 u64 bytenr, u64 num_bytes, u64 parent, 2481 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
2314 u64 root_objectid, u64 owner, u64 offset); 2482 u64 owner, u64 offset, int for_cow);
2315 2483
2316int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2484int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2317int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 2485int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -2323,7 +2491,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2323int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2491int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2324 struct btrfs_root *root, 2492 struct btrfs_root *root,
2325 u64 bytenr, u64 num_bytes, u64 parent, 2493 u64 bytenr, u64 num_bytes, u64 parent,
2326 u64 root_objectid, u64 owner, u64 offset); 2494 u64 root_objectid, u64 owner, u64 offset, int for_cow);
2327 2495
2328int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2496int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2329 struct btrfs_root *root); 2497 struct btrfs_root *root);
@@ -2482,10 +2650,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2482} 2650}
2483 2651
2484int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2652int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2653static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
2654{
2655 ++p->slots[0];
2656 if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
2657 return btrfs_next_leaf(root, p);
2658 return 0;
2659}
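The new btrfs_next_item() helper advances to the next slot in the current leaf and falls back to btrfs_next_leaf() when the leaf is exhausted, so the common search-then-iterate pattern no longer has to open-code the slot bookkeeping. A hedged sketch of that pattern (error handling trimmed, key setup omitted):

static int demo_walk_items(struct btrfs_root *root, struct btrfs_key *first_key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, first_key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		/* examine path->nodes[0] at path->slots[0] here */
		ret = btrfs_next_item(root, path);
		if (ret)	/* < 0 on error, > 0 once past the last item */
			break;
	}
out:
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}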
2485int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2660int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2486int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2661int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2487void btrfs_drop_snapshot(struct btrfs_root *root, 2662void btrfs_drop_snapshot(struct btrfs_root *root,
2488 struct btrfs_block_rsv *block_rsv, int update_ref); 2663 struct btrfs_block_rsv *block_rsv, int update_ref,
2664 int for_reloc);
2489int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2665int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2490 struct btrfs_root *root, 2666 struct btrfs_root *root,
2491 struct extent_buffer *node, 2667 struct extent_buffer *node,
@@ -2500,6 +2676,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2500} 2676}
2501static inline void free_fs_info(struct btrfs_fs_info *fs_info) 2677static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2502{ 2678{
2679 kfree(fs_info->balance_ctl);
2503 kfree(fs_info->delayed_root); 2680 kfree(fs_info->delayed_root);
2504 kfree(fs_info->extent_root); 2681 kfree(fs_info->extent_root);
2505 kfree(fs_info->tree_root); 2682 kfree(fs_info->tree_root);
@@ -2510,6 +2687,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2510 kfree(fs_info->super_for_commit); 2687 kfree(fs_info->super_for_commit);
2511 kfree(fs_info); 2688 kfree(fs_info);
2512} 2689}
2690/**
2691 * profile_is_valid - tests whether a given profile is valid and reduced
2692 * @flags: profile to validate
2693 * @extended: if true @flags is treated as an extended profile
2694 */
2695static inline int profile_is_valid(u64 flags, int extended)
2696{
2697 u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
2698
2699 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
2700 if (extended)
2701 mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2702
2703 if (flags & mask)
2704 return 0;
2705 /* true if zero or exactly one bit set */
2706 return (flags & (~flags + 1)) == flags;
2707}
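The last line of profile_is_valid() relies on the two's-complement identity that flags & (~flags + 1) isolates the lowest set bit, so the comparison only holds when flags is zero or a power of two, i.e. when at most one profile bit remains after masking. A small stand-alone check of the trick (the test values are arbitrary):

#include <assert.h>

static int zero_or_single_bit(unsigned long long flags)
{
	return (flags & (~flags + 1)) == flags;	/* same test as profile_is_valid() */
}

int main(void)
{
	assert(zero_or_single_bit(0));				/* no profile bit set */
	assert(zero_or_single_bit(1ULL << 4));			/* exactly one bit */
	assert(!zero_or_single_bit((1ULL << 3) | (1ULL << 4)));	/* two bits -> invalid */
	return 0;
}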
2513 2708
2514/* root-item.c */ 2709/* root-item.c */
2515int btrfs_find_root_ref(struct btrfs_root *tree_root, 2710int btrfs_find_root_ref(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 9c1eccc2c503..fe4cd0f1cef1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
598 if (!ret) 598 if (!ret) {
599 trace_btrfs_space_reservation(root->fs_info, "delayed_item",
600 item->key.objectid,
601 num_bytes, 1);
599 item->bytes_reserved = num_bytes; 602 item->bytes_reserved = num_bytes;
603 }
600 604
601 return ret; 605 return ret;
602} 606}
@@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
610 return; 614 return;
611 615
612 rsv = &root->fs_info->delayed_block_rsv; 616 rsv = &root->fs_info->delayed_block_rsv;
617 trace_btrfs_space_reservation(root->fs_info, "delayed_item",
618 item->key.objectid, item->bytes_reserved,
619 0);
613 btrfs_block_rsv_release(root, rsv, 620 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 621 item->bytes_reserved);
615} 622}
@@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata(
624 struct btrfs_block_rsv *dst_rsv; 631 struct btrfs_block_rsv *dst_rsv;
625 u64 num_bytes; 632 u64 num_bytes;
626 int ret; 633 int ret;
627 int release = false; 634 bool release = false;
628 635
629 src_rsv = trans->block_rsv; 636 src_rsv = trans->block_rsv;
630 dst_rsv = &root->fs_info->delayed_block_rsv; 637 dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata(
651 */ 658 */
652 if (ret == -EAGAIN) 659 if (ret == -EAGAIN)
653 ret = -ENOSPC; 660 ret = -ENOSPC;
654 if (!ret) 661 if (!ret) {
655 node->bytes_reserved = num_bytes; 662 node->bytes_reserved = num_bytes;
663 trace_btrfs_space_reservation(root->fs_info,
664 "delayed_inode",
665 btrfs_ino(inode),
666 num_bytes, 1);
667 }
656 return ret; 668 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 669 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock); 670 spin_lock(&BTRFS_I(inode)->lock);
@@ -707,11 +719,17 @@ out:
707 * reservation here. I think it may be time for a documentation page on 719 * reservation here. I think it may be time for a documentation page on
708 * how block rsvs work.	720 * how block rsvs work.
709 */ 721 */
710 if (!ret) 722 if (!ret) {
723 trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
724 btrfs_ino(inode), num_bytes, 1);
711 node->bytes_reserved = num_bytes; 725 node->bytes_reserved = num_bytes;
726 }
712 727
713 if (release) 728 if (release) {
729 trace_btrfs_space_reservation(root->fs_info, "delalloc",
730 btrfs_ino(inode), num_bytes, 0);
714 btrfs_block_rsv_release(root, src_rsv, num_bytes); 731 btrfs_block_rsv_release(root, src_rsv, num_bytes);
732 }
715 733
716 return ret; 734 return ret;
717} 735}
@@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
725 return; 743 return;
726 744
727 rsv = &root->fs_info->delayed_block_rsv; 745 rsv = &root->fs_info->delayed_block_rsv;
746 trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
747 node->inode_id, node->bytes_reserved, 0);
728 btrfs_block_rsv_release(root, rsv, 748 btrfs_block_rsv_release(root, rsv,
729 node->bytes_reserved); 749 node->bytes_reserved);
730 node->bytes_reserved = 0; 750 node->bytes_reserved = 0;
@@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1372 goto release_node; 1392 goto release_node;
1373 } 1393 }
1374 1394
1375 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1376 /*
1377 * we have reserved enough space when we start a new transaction,
1378 * so reserving metadata failure is impossible
1379 */
1380 BUG_ON(ret);
1381
1382 delayed_item->key.objectid = btrfs_ino(dir); 1395 delayed_item->key.objectid = btrfs_ino(dir);
1383 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); 1396 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
1384 delayed_item->key.offset = index; 1397 delayed_item->key.offset = index;
@@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1391 dir_item->type = type; 1404 dir_item->type = type;
1392 memcpy((char *)(dir_item + 1), name, name_len); 1405 memcpy((char *)(dir_item + 1), name, name_len);
1393 1406
1407 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1408 /*
1409 * we have reserved enough space when we start a new transaction,
1410 * so reserving metadata failure is impossible
1411 */
1412 BUG_ON(ret);
1413
1414
1394 mutex_lock(&delayed_node->mutex); 1415 mutex_lock(&delayed_node->mutex);
1395 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); 1416 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
1396 if (unlikely(ret)) { 1417 if (unlikely(ret)) {
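
Note on the delayed-inode changes above: every metadata reservation now emits a trace_btrfs_space_reservation() event (reserve=1) with a matching release event (reserve=0), and the reservation in btrfs_insert_delayed_dir_index() was moved until after the item's key is filled in, presumably so the trace line carries the real objectid. A rough user-space sketch of that ordering concern (the struct and values here are made up for illustration):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct delayed_item {
    uint64_t objectid;        /* stand-in for item->key.objectid */
    uint64_t bytes_reserved;
};

/* Stand-in for the reservation helper: it now emits a trace line, so
 * it should only run once item->objectid carries the real value. */
static void reserve_metadata(struct delayed_item *item, uint64_t bytes)
{
    printf("trace: delayed_item objectid=%llu bytes=%llu reserve=1\n",
           (unsigned long long)item->objectid, (unsigned long long)bytes);
    item->bytes_reserved = bytes;
}

int main(void)
{
    struct delayed_item item;

    memset(&item, 0, sizeof(item));
    /* reserving here would trace a meaningless objectid=0 */
    item.objectid = 257;               /* hypothetical dir inode number */
    reserve_metadata(&item, 16384);    /* right order: objectid=257 */
    return 0;
}
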
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 125cf76fcd08..66e4f29505a3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
101 return -1; 101 return -1;
102 if (ref1->type > ref2->type) 102 if (ref1->type > ref2->type)
103 return 1; 103 return 1;
104 /* merging of sequenced refs is not allowed */
105 if (ref1->seq < ref2->seq)
106 return -1;
107 if (ref1->seq > ref2->seq)
108 return 1;
104 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || 109 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
105 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { 110 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
106 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), 111 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
150 155
151/* 156/*
152 * find a head entry based on bytenr. This returns the delayed ref	157 * find a head entry based on bytenr. This returns the delayed ref
153 * head if it was able to find one, or NULL if nothing was in that spot 158 * head if it was able to find one, or NULL if nothing was in that spot.
159 * If return_bigger is given, the next bigger entry is returned if no exact
160 * match is found.
154 */ 161 */
155static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, 162static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
156 u64 bytenr, 163 u64 bytenr,
157 struct btrfs_delayed_ref_node **last) 164 struct btrfs_delayed_ref_node **last,
165 int return_bigger)
158{ 166{
159 struct rb_node *n = root->rb_node; 167 struct rb_node *n;
160 struct btrfs_delayed_ref_node *entry; 168 struct btrfs_delayed_ref_node *entry;
161 int cmp; 169 int cmp = 0;
162 170
171again:
172 n = root->rb_node;
173 entry = NULL;
163 while (n) { 174 while (n) {
164 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 175 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
165 WARN_ON(!entry->in_tree); 176 WARN_ON(!entry->in_tree);
@@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
182 else 193 else
183 return entry; 194 return entry;
184 } 195 }
196 if (entry && return_bigger) {
197 if (cmp > 0) {
198 n = rb_next(&entry->rb_node);
199 if (!n)
200 n = rb_first(root);
201 entry = rb_entry(n, struct btrfs_delayed_ref_node,
202 rb_node);
203 bytenr = entry->bytenr;
204 return_bigger = 0;
205 goto again;
206 }
207 return entry;
208 }
185 return NULL; 209 return NULL;
186} 210}
187 211
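
Note on the find_ref_head() change above: with return_bigger set, a miss no longer returns NULL; the search falls forward to the next larger head and wraps to the first entry when it runs off the end, which is why btrfs_find_ref_cluster() can now simply ask for start + 1. A small sketch of the same lookup rule over a sorted array standing in for the rbtree (the bytenr values are invented):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Sorted stand-in for the delayed-ref tree keyed by bytenr. */
static const uint64_t heads[] = { 4096, 8192, 12288, 20480 };
#define NHEADS (sizeof(heads) / sizeof(heads[0]))

/* Exact match, or -- with return_bigger -- the next larger entry,
 * wrapping to the first one, like the rb_next()/rb_first() fallback. */
static size_t find_head(uint64_t bytenr, int return_bigger)
{
    for (size_t i = 0; i < NHEADS; i++) {
        if (heads[i] == bytenr)
            return i;
        if (heads[i] > bytenr && return_bigger)
            return i;
    }
    return return_bigger ? 0 : NHEADS;    /* NHEADS means "not found" */
}

int main(void)
{
    printf("start 8192  -> %llu\n", (unsigned long long)heads[find_head(8192, 1)]);
    printf("start 9000  -> %llu\n", (unsigned long long)heads[find_head(9000, 1)]);
    printf("start 30000 -> %llu\n", (unsigned long long)heads[find_head(30000, 1)]);
    return 0;
}
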
@@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
209 return 0; 233 return 0;
210} 234}
211 235
236int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
237 u64 seq)
238{
239 struct seq_list *elem;
240
241 assert_spin_locked(&delayed_refs->lock);
242 if (list_empty(&delayed_refs->seq_head))
243 return 0;
244
245 elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
246 if (seq >= elem->seq) {
247 pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
248 seq, elem->seq, delayed_refs);
249 return 1;
250 }
251 return 0;
252}
253
212int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 254int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
213 struct list_head *cluster, u64 start) 255 struct list_head *cluster, u64 start)
214{ 256{
@@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
223 node = rb_first(&delayed_refs->root); 265 node = rb_first(&delayed_refs->root);
224 } else { 266 } else {
225 ref = NULL; 267 ref = NULL;
226 find_ref_head(&delayed_refs->root, start, &ref); 268 find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
227 if (ref) { 269 if (ref) {
228 struct btrfs_delayed_ref_node *tmp;
229
230 node = rb_prev(&ref->rb_node);
231 while (node) {
232 tmp = rb_entry(node,
233 struct btrfs_delayed_ref_node,
234 rb_node);
235 if (tmp->bytenr < start)
236 break;
237 ref = tmp;
238 node = rb_prev(&ref->rb_node);
239 }
240 node = &ref->rb_node; 270 node = &ref->rb_node;
241 } else 271 } else
242 node = rb_first(&delayed_refs->root); 272 node = rb_first(&delayed_refs->root);
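
Note on btrfs_check_delayed_seq() above: the head of seq_head is the oldest sequence number pinned by an in-flight backref walk, and any delayed ref whose seq is greater than or equal to it must be held back so the walk sees a stable view. A compact model of that comparison (single-threaded, no locking, numbers arbitrary):

#include <stdio.h>
#include <stdint.h>

/* Oldest seq currently pinned by a backref walker; 0 means none.
 * In the kernel this is the first entry of delayed_refs->seq_head. */
static uint64_t oldest_walker_seq;

/* A ref may only run if it was added before every in-flight walk. */
static int must_hold_back(uint64_t ref_seq)
{
    if (!oldest_walker_seq)
        return 0;
    return ref_seq >= oldest_walker_seq;
}

int main(void)
{
    oldest_walker_seq = 42;
    printf("seq 41 held back: %d\n", must_hold_back(41));  /* 0: run it */
    printf("seq 42 held back: %d\n", must_hold_back(42));  /* 1: wait  */
    printf("seq 99 held back: %d\n", must_hold_back(99));  /* 1: wait  */
    return 0;
}
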
@@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
390 * this does all the dirty work in terms of maintaining the correct 420 * this does all the dirty work in terms of maintaining the correct
391 * overall modification count. 421 * overall modification count.
392 */ 422 */
393static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, 423static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
424 struct btrfs_trans_handle *trans,
394 struct btrfs_delayed_ref_node *ref, 425 struct btrfs_delayed_ref_node *ref,
395 u64 bytenr, u64 num_bytes, 426 u64 bytenr, u64 num_bytes,
396 int action, int is_data) 427 int action, int is_data)
@@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
437 ref->action = 0; 468 ref->action = 0;
438 ref->is_head = 1; 469 ref->is_head = 1;
439 ref->in_tree = 1; 470 ref->in_tree = 1;
471 ref->seq = 0;
440 472
441 head_ref = btrfs_delayed_node_to_head(ref); 473 head_ref = btrfs_delayed_node_to_head(ref);
442 head_ref->must_insert_reserved = must_insert_reserved; 474 head_ref->must_insert_reserved = must_insert_reserved;
@@ -468,14 +500,17 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
468/* 500/*
469 * helper to insert a delayed tree ref into the rbtree. 501 * helper to insert a delayed tree ref into the rbtree.
470 */ 502 */
471static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, 503static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
504 struct btrfs_trans_handle *trans,
472 struct btrfs_delayed_ref_node *ref, 505 struct btrfs_delayed_ref_node *ref,
473 u64 bytenr, u64 num_bytes, u64 parent, 506 u64 bytenr, u64 num_bytes, u64 parent,
474 u64 ref_root, int level, int action) 507 u64 ref_root, int level, int action,
508 int for_cow)
475{ 509{
476 struct btrfs_delayed_ref_node *existing; 510 struct btrfs_delayed_ref_node *existing;
477 struct btrfs_delayed_tree_ref *full_ref; 511 struct btrfs_delayed_tree_ref *full_ref;
478 struct btrfs_delayed_ref_root *delayed_refs; 512 struct btrfs_delayed_ref_root *delayed_refs;
513 u64 seq = 0;
479 514
480 if (action == BTRFS_ADD_DELAYED_EXTENT) 515 if (action == BTRFS_ADD_DELAYED_EXTENT)
481 action = BTRFS_ADD_DELAYED_REF; 516 action = BTRFS_ADD_DELAYED_REF;
@@ -491,14 +526,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
491 ref->is_head = 0; 526 ref->is_head = 0;
492 ref->in_tree = 1; 527 ref->in_tree = 1;
493 528
529 if (need_ref_seq(for_cow, ref_root))
530 seq = inc_delayed_seq(delayed_refs);
531 ref->seq = seq;
532
494 full_ref = btrfs_delayed_node_to_tree_ref(ref); 533 full_ref = btrfs_delayed_node_to_tree_ref(ref);
495 if (parent) { 534 full_ref->parent = parent;
496 full_ref->parent = parent; 535 full_ref->root = ref_root;
536 if (parent)
497 ref->type = BTRFS_SHARED_BLOCK_REF_KEY; 537 ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
498 } else { 538 else
499 full_ref->root = ref_root;
500 ref->type = BTRFS_TREE_BLOCK_REF_KEY; 539 ref->type = BTRFS_TREE_BLOCK_REF_KEY;
501 }
502 full_ref->level = level; 540 full_ref->level = level;
503 541
504 trace_btrfs_delayed_tree_ref(ref, full_ref, action); 542 trace_btrfs_delayed_tree_ref(ref, full_ref, action);
@@ -522,15 +560,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
522/* 560/*
523 * helper to insert a delayed data ref into the rbtree. 561 * helper to insert a delayed data ref into the rbtree.
524 */ 562 */
525static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, 563static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
564 struct btrfs_trans_handle *trans,
526 struct btrfs_delayed_ref_node *ref, 565 struct btrfs_delayed_ref_node *ref,
527 u64 bytenr, u64 num_bytes, u64 parent, 566 u64 bytenr, u64 num_bytes, u64 parent,
528 u64 ref_root, u64 owner, u64 offset, 567 u64 ref_root, u64 owner, u64 offset,
529 int action) 568 int action, int for_cow)
530{ 569{
531 struct btrfs_delayed_ref_node *existing; 570 struct btrfs_delayed_ref_node *existing;
532 struct btrfs_delayed_data_ref *full_ref; 571 struct btrfs_delayed_data_ref *full_ref;
533 struct btrfs_delayed_ref_root *delayed_refs; 572 struct btrfs_delayed_ref_root *delayed_refs;
573 u64 seq = 0;
534 574
535 if (action == BTRFS_ADD_DELAYED_EXTENT) 575 if (action == BTRFS_ADD_DELAYED_EXTENT)
536 action = BTRFS_ADD_DELAYED_REF; 576 action = BTRFS_ADD_DELAYED_REF;
@@ -546,14 +586,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
546 ref->is_head = 0; 586 ref->is_head = 0;
547 ref->in_tree = 1; 587 ref->in_tree = 1;
548 588
589 if (need_ref_seq(for_cow, ref_root))
590 seq = inc_delayed_seq(delayed_refs);
591 ref->seq = seq;
592
549 full_ref = btrfs_delayed_node_to_data_ref(ref); 593 full_ref = btrfs_delayed_node_to_data_ref(ref);
550 if (parent) { 594 full_ref->parent = parent;
551 full_ref->parent = parent; 595 full_ref->root = ref_root;
596 if (parent)
552 ref->type = BTRFS_SHARED_DATA_REF_KEY; 597 ref->type = BTRFS_SHARED_DATA_REF_KEY;
553 } else { 598 else
554 full_ref->root = ref_root;
555 ref->type = BTRFS_EXTENT_DATA_REF_KEY; 599 ref->type = BTRFS_EXTENT_DATA_REF_KEY;
556 } 600
557 full_ref->objectid = owner; 601 full_ref->objectid = owner;
558 full_ref->offset = offset; 602 full_ref->offset = offset;
559 603
@@ -580,10 +624,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
580 * to make sure the delayed ref is eventually processed before this 624 * to make sure the delayed ref is eventually processed before this
581 * transaction commits. 625 * transaction commits.
582 */ 626 */
583int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 627int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
628 struct btrfs_trans_handle *trans,
584 u64 bytenr, u64 num_bytes, u64 parent, 629 u64 bytenr, u64 num_bytes, u64 parent,
585 u64 ref_root, int level, int action, 630 u64 ref_root, int level, int action,
586 struct btrfs_delayed_extent_op *extent_op) 631 struct btrfs_delayed_extent_op *extent_op,
632 int for_cow)
587{ 633{
588 struct btrfs_delayed_tree_ref *ref; 634 struct btrfs_delayed_tree_ref *ref;
589 struct btrfs_delayed_ref_head *head_ref; 635 struct btrfs_delayed_ref_head *head_ref;
@@ -610,13 +656,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
610 * insert both the head node and the new ref without dropping 656 * insert both the head node and the new ref without dropping
611 * the spin lock 657 * the spin lock
612 */ 658 */
613 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, 659 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
614 action, 0); 660 num_bytes, action, 0);
615 BUG_ON(ret); 661 BUG_ON(ret);
616 662
617 ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, 663 ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
618 parent, ref_root, level, action); 664 num_bytes, parent, ref_root, level, action,
665 for_cow);
619 BUG_ON(ret); 666 BUG_ON(ret);
667 if (!need_ref_seq(for_cow, ref_root) &&
668 waitqueue_active(&delayed_refs->seq_wait))
669 wake_up(&delayed_refs->seq_wait);
620 spin_unlock(&delayed_refs->lock); 670 spin_unlock(&delayed_refs->lock);
621 return 0; 671 return 0;
622} 672}
@@ -624,11 +674,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
624/* 674/*
625 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. 675 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
626 */ 676 */
627int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 677int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
678 struct btrfs_trans_handle *trans,
628 u64 bytenr, u64 num_bytes, 679 u64 bytenr, u64 num_bytes,
629 u64 parent, u64 ref_root, 680 u64 parent, u64 ref_root,
630 u64 owner, u64 offset, int action, 681 u64 owner, u64 offset, int action,
631 struct btrfs_delayed_extent_op *extent_op) 682 struct btrfs_delayed_extent_op *extent_op,
683 int for_cow)
632{ 684{
633 struct btrfs_delayed_data_ref *ref; 685 struct btrfs_delayed_data_ref *ref;
634 struct btrfs_delayed_ref_head *head_ref; 686 struct btrfs_delayed_ref_head *head_ref;
@@ -655,18 +707,23 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
655 * insert both the head node and the new ref without dropping 707 * insert both the head node and the new ref without dropping
656 * the spin lock 708 * the spin lock
657 */ 709 */
658 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, 710 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
659 action, 1); 711 num_bytes, action, 1);
660 BUG_ON(ret); 712 BUG_ON(ret);
661 713
662 ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, 714 ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
663 parent, ref_root, owner, offset, action); 715 num_bytes, parent, ref_root, owner, offset,
716 action, for_cow);
664 BUG_ON(ret); 717 BUG_ON(ret);
718 if (!need_ref_seq(for_cow, ref_root) &&
719 waitqueue_active(&delayed_refs->seq_wait))
720 wake_up(&delayed_refs->seq_wait);
665 spin_unlock(&delayed_refs->lock); 721 spin_unlock(&delayed_refs->lock);
666 return 0; 722 return 0;
667} 723}
668 724
669int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 725int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
726 struct btrfs_trans_handle *trans,
670 u64 bytenr, u64 num_bytes, 727 u64 bytenr, u64 num_bytes,
671 struct btrfs_delayed_extent_op *extent_op) 728 struct btrfs_delayed_extent_op *extent_op)
672{ 729{
@@ -683,11 +740,13 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
683 delayed_refs = &trans->transaction->delayed_refs; 740 delayed_refs = &trans->transaction->delayed_refs;
684 spin_lock(&delayed_refs->lock); 741 spin_lock(&delayed_refs->lock);
685 742
686 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, 743 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
687 num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 744 num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
688 extent_op->is_data); 745 extent_op->is_data);
689 BUG_ON(ret); 746 BUG_ON(ret);
690 747
748 if (waitqueue_active(&delayed_refs->seq_wait))
749 wake_up(&delayed_refs->seq_wait);
691 spin_unlock(&delayed_refs->lock); 750 spin_unlock(&delayed_refs->lock);
692 return 0; 751 return 0;
693} 752}
@@ -704,7 +763,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
704 struct btrfs_delayed_ref_root *delayed_refs; 763 struct btrfs_delayed_ref_root *delayed_refs;
705 764
706 delayed_refs = &trans->transaction->delayed_refs; 765 delayed_refs = &trans->transaction->delayed_refs;
707 ref = find_ref_head(&delayed_refs->root, bytenr, NULL); 766 ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
708 if (ref) 767 if (ref)
709 return btrfs_delayed_node_to_head(ref); 768 return btrfs_delayed_node_to_head(ref);
710 return NULL; 769 return NULL;
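
Note on the new for_cow plumbing above: when a ref is inserted, it only receives a sequence number if need_ref_seq() says backref walking could care about it, i.e. the ref targets an fs tree and was not generated by COW itself. A stand-alone sketch of that decision plus the monotonic counter (the two objectid constants mirror the values used in the btrfs headers):

#include <stdio.h>
#include <stdint.h>

#define FS_TREE_OBJECTID        5ULL    /* BTRFS_FS_TREE_OBJECTID */
#define FIRST_FREE_OBJECTID   256ULL    /* BTRFS_FIRST_FREE_OBJECTID */

static uint64_t delayed_seq;            /* delayed_refs->seq, held under the lock */

/* Only refs against fs trees, and only when not generated by COW
 * itself, need a seq number for backref-walk ordering. */
static int need_ref_seq(int for_cow, uint64_t rootid)
{
    if (for_cow)
        return 0;
    return rootid == FS_TREE_OBJECTID ||
           (int64_t)rootid >= (int64_t)FIRST_FREE_OBJECTID;
}

static uint64_t assign_seq(int for_cow, uint64_t rootid)
{
    return need_ref_seq(for_cow, rootid) ? ++delayed_seq : 0;
}

int main(void)
{
    printf("fs tree, user ref : seq %llu\n", (unsigned long long)assign_seq(0, 5));
    printf("subvol 257, user  : seq %llu\n", (unsigned long long)assign_seq(0, 257));
    printf("extent tree (2)   : seq %llu\n", (unsigned long long)assign_seq(0, 2));
    printf("fs tree, for_cow  : seq %llu\n", (unsigned long long)assign_seq(1, 5));
    return 0;
}
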
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e287e3b0eab0..d8f244d94925 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
33 /* the size of the extent */ 33 /* the size of the extent */
34 u64 num_bytes; 34 u64 num_bytes;
35 35
36 /* seq number to keep track of insertion order */
37 u64 seq;
38
36 /* ref count on this data structure */ 39 /* ref count on this data structure */
37 atomic_t refs; 40 atomic_t refs;
38 41
@@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head {
98 101
99struct btrfs_delayed_tree_ref { 102struct btrfs_delayed_tree_ref {
100 struct btrfs_delayed_ref_node node; 103 struct btrfs_delayed_ref_node node;
101 union { 104 u64 root;
102 u64 root; 105 u64 parent;
103 u64 parent;
104 };
105 int level; 106 int level;
106}; 107};
107 108
108struct btrfs_delayed_data_ref { 109struct btrfs_delayed_data_ref {
109 struct btrfs_delayed_ref_node node; 110 struct btrfs_delayed_ref_node node;
110 union { 111 u64 root;
111 u64 root; 112 u64 parent;
112 u64 parent;
113 };
114 u64 objectid; 113 u64 objectid;
115 u64 offset; 114 u64 offset;
116}; 115};
@@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root {
140 int flushing; 139 int flushing;
141 140
142 u64 run_delayed_start; 141 u64 run_delayed_start;
142
143 /*
144 * seq number of delayed refs. We need to know if a backref was being
145 * added before the currently processed ref or afterwards.
146 */
147 u64 seq;
148
149 /*
150 * seq_list holds a list of all seq numbers that are currently being
151 * added to the list. While walking backrefs (btrfs_find_all_roots,
152 * qgroups), which might take some time, no newer ref must be processed,
153 * as it might influence the outcome of the walk.
154 */
155 struct list_head seq_head;
156
157 /*
158 * when the only refs we have in the list must not be processed, we want
159 * to wait for more refs to show up or for the end of backref walking.
160 */
161 wait_queue_head_t seq_wait;
143}; 162};
144 163
145static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) 164static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
151 } 170 }
152} 171}
153 172
154int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 173int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
174 struct btrfs_trans_handle *trans,
155 u64 bytenr, u64 num_bytes, u64 parent, 175 u64 bytenr, u64 num_bytes, u64 parent,
156 u64 ref_root, int level, int action, 176 u64 ref_root, int level, int action,
157 struct btrfs_delayed_extent_op *extent_op); 177 struct btrfs_delayed_extent_op *extent_op,
158int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 178 int for_cow);
179int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
180 struct btrfs_trans_handle *trans,
159 u64 bytenr, u64 num_bytes, 181 u64 bytenr, u64 num_bytes,
160 u64 parent, u64 ref_root, 182 u64 parent, u64 ref_root,
161 u64 owner, u64 offset, int action, 183 u64 owner, u64 offset, int action,
162 struct btrfs_delayed_extent_op *extent_op); 184 struct btrfs_delayed_extent_op *extent_op,
163int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 185 int for_cow);
186int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
187 struct btrfs_trans_handle *trans,
164 u64 bytenr, u64 num_bytes, 188 u64 bytenr, u64 num_bytes,
165 struct btrfs_delayed_extent_op *extent_op); 189 struct btrfs_delayed_extent_op *extent_op);
166 190
@@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
170 struct btrfs_delayed_ref_head *head); 194 struct btrfs_delayed_ref_head *head);
171int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
172 struct list_head *cluster, u64 search_start); 196 struct list_head *cluster, u64 search_start);
197
198struct seq_list {
199 struct list_head list;
200 u64 seq;
201};
202
203static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
204{
205 assert_spin_locked(&delayed_refs->lock);
206 ++delayed_refs->seq;
207 return delayed_refs->seq;
208}
209
210static inline void
211btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
212 struct seq_list *elem)
213{
214 assert_spin_locked(&delayed_refs->lock);
215 elem->seq = delayed_refs->seq;
216 list_add_tail(&elem->list, &delayed_refs->seq_head);
217}
218
219static inline void
220btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
221 struct seq_list *elem)
222{
223 spin_lock(&delayed_refs->lock);
224 list_del(&elem->list);
225 wake_up(&delayed_refs->seq_wait);
226 spin_unlock(&delayed_refs->lock);
227}
228
229int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
230 u64 seq);
231
232/*
233 * delayed refs with a ref_seq > 0 must be held back during backref walking.
234 * this only applies to items in one of the fs-trees. for_cow items never need
235 * to be held back, so they won't get a ref_seq number.
236 */
237static inline int need_ref_seq(int for_cow, u64 rootid)
238{
239 if (for_cow)
240 return 0;
241
242 if (rootid == BTRFS_FS_TREE_OBJECTID)
243 return 1;
244
245 if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
246 return 1;
247
248 return 0;
249}
250
173/* 251/*
174 * a node might live in a head or a regular ref, this lets you 252 * a node might live in a head or a regular ref, this lets you
175 * test for the proper type to use. 253 * test for the proper type to use.
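
Note on the seq_list helpers declared above: a backref walker records the current seq with btrfs_get_delayed_seq(), newer delayed refs are then held back by btrfs_check_delayed_seq(), and btrfs_put_delayed_seq() drops the pin and wakes anyone waiting on seq_wait. A simplified single-threaded model of that lifecycle (a fixed-size array instead of a locked list, and no waitqueue):

#include <stdio.h>
#include <stdint.h>

/* Toy model of delayed_refs->seq_head: seq numbers pinned by active
 * backref walkers, in registration order (oldest first). */
static uint64_t walkers[8];
static int nwalkers;
static uint64_t seq;                        /* delayed_refs->seq */

static uint64_t get_delayed_seq(void)       /* btrfs_get_delayed_seq() */
{
    walkers[nwalkers++] = seq;
    return seq;
}

static void put_delayed_seq(uint64_t s)     /* btrfs_put_delayed_seq() */
{
    int i, j;

    for (i = 0; i < nwalkers; i++)
        if (walkers[i] == s)
            break;
    if (i == nwalkers)
        return;
    for (j = i; j + 1 < nwalkers; j++)
        walkers[j] = walkers[j + 1];
    nwalkers--;
    /* the kernel would wake_up(&delayed_refs->seq_wait) here */
}

static int check_delayed_seq(uint64_t ref_seq)   /* hold this ref back? */
{
    return nwalkers && ref_seq >= walkers[0];
}

int main(void)
{
    uint64_t s;

    seq = 10;
    s = get_delayed_seq();                   /* walker pins seq 10 */
    seq = 12;                                /* newer refs got seq 11, 12 */
    printf("ref seq 9  blocked: %d\n", check_delayed_seq(9));
    printf("ref seq 11 blocked: %d\n", check_delayed_seq(11));
    put_delayed_seq(s);
    printf("ref seq 11 blocked: %d\n", check_delayed_seq(11));
    return 0;
}
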
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d8525662ca7a..7aa9cd36bf1b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -43,6 +43,7 @@
43#include "tree-log.h" 43#include "tree-log.h"
44#include "free-space-cache.h" 44#include "free-space-cache.h"
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h"
46 47
47static struct extent_io_ops btree_extent_io_ops; 48static struct extent_io_ops btree_extent_io_ops;
48static void end_workqueue_fn(struct btrfs_work *work); 49static void end_workqueue_fn(struct btrfs_work *work);
@@ -1143,7 +1144,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1143 root->orphan_item_inserted = 0; 1144 root->orphan_item_inserted = 0;
1144 root->orphan_cleanup_state = 0; 1145 root->orphan_cleanup_state = 0;
1145 1146
1146 root->fs_info = fs_info;
1147 root->objectid = objectid; 1147 root->objectid = objectid;
1148 root->last_trans = 0; 1148 root->last_trans = 0;
1149 root->highest_objectid = 0; 1149 root->highest_objectid = 0;
@@ -1217,6 +1217,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1217 return 0; 1217 return 0;
1218} 1218}
1219 1219
1220static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1221{
1222 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
1223 if (root)
1224 root->fs_info = fs_info;
1225 return root;
1226}
1227
1220static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 1228static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1221 struct btrfs_fs_info *fs_info) 1229 struct btrfs_fs_info *fs_info)
1222{ 1230{
@@ -1224,7 +1232,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1224 struct btrfs_root *tree_root = fs_info->tree_root; 1232 struct btrfs_root *tree_root = fs_info->tree_root;
1225 struct extent_buffer *leaf; 1233 struct extent_buffer *leaf;
1226 1234
1227 root = kzalloc(sizeof(*root), GFP_NOFS); 1235 root = btrfs_alloc_root(fs_info);
1228 if (!root) 1236 if (!root)
1229 return ERR_PTR(-ENOMEM); 1237 return ERR_PTR(-ENOMEM);
1230 1238
@@ -1244,7 +1252,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1244 root->ref_cows = 0; 1252 root->ref_cows = 0;
1245 1253
1246 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1254 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1247 BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); 1255 BTRFS_TREE_LOG_OBJECTID, NULL,
1256 0, 0, 0, 0);
1248 if (IS_ERR(leaf)) { 1257 if (IS_ERR(leaf)) {
1249 kfree(root); 1258 kfree(root);
1250 return ERR_CAST(leaf); 1259 return ERR_CAST(leaf);
@@ -1318,7 +1327,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1318 u32 blocksize; 1327 u32 blocksize;
1319 int ret = 0; 1328 int ret = 0;
1320 1329
1321 root = kzalloc(sizeof(*root), GFP_NOFS); 1330 root = btrfs_alloc_root(fs_info);
1322 if (!root) 1331 if (!root)
1323 return ERR_PTR(-ENOMEM); 1332 return ERR_PTR(-ENOMEM);
1324 if (location->offset == (u64)-1) { 1333 if (location->offset == (u64)-1) {
@@ -1874,9 +1883,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1874} 1883}
1875 1884
1876 1885
1877struct btrfs_root *open_ctree(struct super_block *sb, 1886int open_ctree(struct super_block *sb,
1878 struct btrfs_fs_devices *fs_devices, 1887 struct btrfs_fs_devices *fs_devices,
1879 char *options) 1888 char *options)
1880{ 1889{
1881 u32 sectorsize; 1890 u32 sectorsize;
1882 u32 nodesize; 1891 u32 nodesize;
@@ -1888,8 +1897,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1888 struct btrfs_key location; 1897 struct btrfs_key location;
1889 struct buffer_head *bh; 1898 struct buffer_head *bh;
1890 struct btrfs_super_block *disk_super; 1899 struct btrfs_super_block *disk_super;
1891 struct btrfs_root *tree_root = btrfs_sb(sb); 1900 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1892 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1901 struct btrfs_root *tree_root;
1893 struct btrfs_root *extent_root; 1902 struct btrfs_root *extent_root;
1894 struct btrfs_root *csum_root; 1903 struct btrfs_root *csum_root;
1895 struct btrfs_root *chunk_root; 1904 struct btrfs_root *chunk_root;
@@ -1900,16 +1909,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1900 int num_backups_tried = 0; 1909 int num_backups_tried = 0;
1901 int backup_index = 0; 1910 int backup_index = 0;
1902 1911
1903 extent_root = fs_info->extent_root = 1912 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
1904 kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 1913 extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
1905 csum_root = fs_info->csum_root = 1914 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
1906 kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 1915 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
1907 chunk_root = fs_info->chunk_root = 1916 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
1908 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1909 dev_root = fs_info->dev_root =
1910 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1911 1917
1912 if (!extent_root || !csum_root || !chunk_root || !dev_root) { 1918 if (!tree_root || !extent_root || !csum_root ||
1919 !chunk_root || !dev_root) {
1913 err = -ENOMEM; 1920 err = -ENOMEM;
1914 goto fail; 1921 goto fail;
1915 } 1922 }
@@ -1998,6 +2005,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1998 init_waitqueue_head(&fs_info->scrub_pause_wait); 2005 init_waitqueue_head(&fs_info->scrub_pause_wait);
1999 init_rwsem(&fs_info->scrub_super_lock); 2006 init_rwsem(&fs_info->scrub_super_lock);
2000 fs_info->scrub_workers_refcnt = 0; 2007 fs_info->scrub_workers_refcnt = 0;
2008#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2009 fs_info->check_integrity_print_mask = 0;
2010#endif
2011
2012 spin_lock_init(&fs_info->balance_lock);
2013 mutex_init(&fs_info->balance_mutex);
2014 atomic_set(&fs_info->balance_running, 0);
2015 atomic_set(&fs_info->balance_pause_req, 0);
2016 atomic_set(&fs_info->balance_cancel_req, 0);
2017 fs_info->balance_ctl = NULL;
2018 init_waitqueue_head(&fs_info->balance_wait_q);
2001 2019
2002 sb->s_blocksize = 4096; 2020 sb->s_blocksize = 4096;
2003 sb->s_blocksize_bits = blksize_bits(4096); 2021 sb->s_blocksize_bits = blksize_bits(4096);
@@ -2267,9 +2285,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
2267 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), 2285 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
2268 BTRFS_UUID_SIZE); 2286 BTRFS_UUID_SIZE);
2269 2287
2270 mutex_lock(&fs_info->chunk_mutex);
2271 ret = btrfs_read_chunk_tree(chunk_root); 2288 ret = btrfs_read_chunk_tree(chunk_root);
2272 mutex_unlock(&fs_info->chunk_mutex);
2273 if (ret) { 2289 if (ret) {
2274 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2290 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
2275 sb->s_id); 2291 sb->s_id);
@@ -2318,9 +2334,6 @@ retry_root_backup:
2318 2334
2319 fs_info->generation = generation; 2335 fs_info->generation = generation;
2320 fs_info->last_trans_committed = generation; 2336 fs_info->last_trans_committed = generation;
2321 fs_info->data_alloc_profile = (u64)-1;
2322 fs_info->metadata_alloc_profile = (u64)-1;
2323 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
2324 2337
2325 ret = btrfs_init_space_info(fs_info); 2338 ret = btrfs_init_space_info(fs_info);
2326 if (ret) { 2339 if (ret) {
@@ -2353,6 +2366,19 @@ retry_root_backup:
2353 btrfs_set_opt(fs_info->mount_opt, SSD); 2366 btrfs_set_opt(fs_info->mount_opt, SSD);
2354 } 2367 }
2355 2368
2369#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2370 if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
2371 ret = btrfsic_mount(tree_root, fs_devices,
2372 btrfs_test_opt(tree_root,
2373 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
2374 1 : 0,
2375 fs_info->check_integrity_print_mask);
2376 if (ret)
2377 printk(KERN_WARNING "btrfs: failed to initialize"
2378 " integrity check module %s\n", sb->s_id);
2379 }
2380#endif
2381
2356 /* do not make disk changes in broken FS */ 2382 /* do not make disk changes in broken FS */
2357 if (btrfs_super_log_root(disk_super) != 0 && 2383 if (btrfs_super_log_root(disk_super) != 0 &&
2358 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { 2384 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@ -2368,7 +2394,7 @@ retry_root_backup:
2368 btrfs_level_size(tree_root, 2394 btrfs_level_size(tree_root,
2369 btrfs_super_log_root_level(disk_super)); 2395 btrfs_super_log_root_level(disk_super));
2370 2396
2371 log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 2397 log_tree_root = btrfs_alloc_root(fs_info);
2372 if (!log_tree_root) { 2398 if (!log_tree_root) {
2373 err = -ENOMEM; 2399 err = -ENOMEM;
2374 goto fail_trans_kthread; 2400 goto fail_trans_kthread;
@@ -2423,13 +2449,17 @@ retry_root_backup:
2423 if (!err) 2449 if (!err)
2424 err = btrfs_orphan_cleanup(fs_info->tree_root); 2450 err = btrfs_orphan_cleanup(fs_info->tree_root);
2425 up_read(&fs_info->cleanup_work_sem); 2451 up_read(&fs_info->cleanup_work_sem);
2452
2453 if (!err)
2454 err = btrfs_recover_balance(fs_info->tree_root);
2455
2426 if (err) { 2456 if (err) {
2427 close_ctree(tree_root); 2457 close_ctree(tree_root);
2428 return ERR_PTR(err); 2458 return err;
2429 } 2459 }
2430 } 2460 }
2431 2461
2432 return tree_root; 2462 return 0;
2433 2463
2434fail_trans_kthread: 2464fail_trans_kthread:
2435 kthread_stop(fs_info->transaction_kthread); 2465 kthread_stop(fs_info->transaction_kthread);
@@ -2475,8 +2505,7 @@ fail_srcu:
2475 cleanup_srcu_struct(&fs_info->subvol_srcu); 2505 cleanup_srcu_struct(&fs_info->subvol_srcu);
2476fail: 2506fail:
2477 btrfs_close_devices(fs_info->fs_devices); 2507 btrfs_close_devices(fs_info->fs_devices);
2478 free_fs_info(fs_info); 2508 return err;
2479 return ERR_PTR(err);
2480 2509
2481recovery_tree_root: 2510recovery_tree_root:
2482 if (!btrfs_test_opt(tree_root, RECOVERY)) 2511 if (!btrfs_test_opt(tree_root, RECOVERY))
@@ -2631,7 +2660,7 @@ static int write_dev_supers(struct btrfs_device *device,
2631 * we fua the first super. The others we allow 2660 * we fua the first super. The others we allow
2632 * to go down lazy. 2661 * to go down lazy.
2633 */ 2662 */
2634 ret = submit_bh(WRITE_FUA, bh); 2663 ret = btrfsic_submit_bh(WRITE_FUA, bh);
2635 if (ret) 2664 if (ret)
2636 errors++; 2665 errors++;
2637 } 2666 }
@@ -2708,7 +2737,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2708 device->flush_bio = bio; 2737 device->flush_bio = bio;
2709 2738
2710 bio_get(bio); 2739 bio_get(bio);
2711 submit_bio(WRITE_FLUSH, bio); 2740 btrfsic_submit_bio(WRITE_FLUSH, bio);
2712 2741
2713 return 0; 2742 return 0;
2714} 2743}
@@ -2972,6 +3001,9 @@ int close_ctree(struct btrfs_root *root)
2972 fs_info->closing = 1; 3001 fs_info->closing = 1;
2973 smp_mb(); 3002 smp_mb();
2974 3003
3004 /* pause restriper - we want to resume on mount */
3005 btrfs_pause_balance(root->fs_info);
3006
2975 btrfs_scrub_cancel(root); 3007 btrfs_scrub_cancel(root);
2976 3008
2977 /* wait for any defraggers to finish */ 3009 /* wait for any defraggers to finish */
@@ -2979,7 +3011,7 @@ int close_ctree(struct btrfs_root *root)
2979 (atomic_read(&fs_info->defrag_running) == 0)); 3011 (atomic_read(&fs_info->defrag_running) == 0));
2980 3012
2981 /* clear out the rbtree of defraggable inodes */ 3013 /* clear out the rbtree of defraggable inodes */
2982 btrfs_run_defrag_inodes(root->fs_info); 3014 btrfs_run_defrag_inodes(fs_info);
2983 3015
2984 /* 3016 /*
2985 * Here come 2 situations when btrfs is broken to flip readonly: 3017 * Here come 2 situations when btrfs is broken to flip readonly:
@@ -3008,8 +3040,8 @@ int close_ctree(struct btrfs_root *root)
3008 3040
3009 btrfs_put_block_group_cache(fs_info); 3041 btrfs_put_block_group_cache(fs_info);
3010 3042
3011 kthread_stop(root->fs_info->transaction_kthread); 3043 kthread_stop(fs_info->transaction_kthread);
3012 kthread_stop(root->fs_info->cleaner_kthread); 3044 kthread_stop(fs_info->cleaner_kthread);
3013 3045
3014 fs_info->closing = 2; 3046 fs_info->closing = 2;
3015 smp_mb(); 3047 smp_mb();
@@ -3027,14 +3059,14 @@ int close_ctree(struct btrfs_root *root)
3027 free_extent_buffer(fs_info->extent_root->commit_root); 3059 free_extent_buffer(fs_info->extent_root->commit_root);
3028 free_extent_buffer(fs_info->tree_root->node); 3060 free_extent_buffer(fs_info->tree_root->node);
3029 free_extent_buffer(fs_info->tree_root->commit_root); 3061 free_extent_buffer(fs_info->tree_root->commit_root);
3030 free_extent_buffer(root->fs_info->chunk_root->node); 3062 free_extent_buffer(fs_info->chunk_root->node);
3031 free_extent_buffer(root->fs_info->chunk_root->commit_root); 3063 free_extent_buffer(fs_info->chunk_root->commit_root);
3032 free_extent_buffer(root->fs_info->dev_root->node); 3064 free_extent_buffer(fs_info->dev_root->node);
3033 free_extent_buffer(root->fs_info->dev_root->commit_root); 3065 free_extent_buffer(fs_info->dev_root->commit_root);
3034 free_extent_buffer(root->fs_info->csum_root->node); 3066 free_extent_buffer(fs_info->csum_root->node);
3035 free_extent_buffer(root->fs_info->csum_root->commit_root); 3067 free_extent_buffer(fs_info->csum_root->commit_root);
3036 3068
3037 btrfs_free_block_groups(root->fs_info); 3069 btrfs_free_block_groups(fs_info);
3038 3070
3039 del_fs_roots(fs_info); 3071 del_fs_roots(fs_info);
3040 3072
@@ -3054,14 +3086,17 @@ int close_ctree(struct btrfs_root *root)
3054 btrfs_stop_workers(&fs_info->caching_workers); 3086 btrfs_stop_workers(&fs_info->caching_workers);
3055 btrfs_stop_workers(&fs_info->readahead_workers); 3087 btrfs_stop_workers(&fs_info->readahead_workers);
3056 3088
3089#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3090 if (btrfs_test_opt(root, CHECK_INTEGRITY))
3091 btrfsic_unmount(root, fs_info->fs_devices);
3092#endif
3093
3057 btrfs_close_devices(fs_info->fs_devices); 3094 btrfs_close_devices(fs_info->fs_devices);
3058 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3095 btrfs_mapping_tree_free(&fs_info->mapping_tree);
3059 3096
3060 bdi_destroy(&fs_info->bdi); 3097 bdi_destroy(&fs_info->bdi);
3061 cleanup_srcu_struct(&fs_info->subvol_srcu); 3098 cleanup_srcu_struct(&fs_info->subvol_srcu);
3062 3099
3063 free_fs_info(fs_info);
3064
3065 return 0; 3100 return 0;
3066} 3101}
3067 3102
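
Note on the disk-io.c changes above: replacing the scattered kzalloc() calls with btrfs_alloc_root() means every root gets its fs_info back-pointer at allocation time, so later setup code and error paths can rely on root->fs_info being valid. A tiny user-space illustration of that constructor-style pattern (the struct names are stand-ins, not the real definitions):

#include <stdio.h>
#include <stdlib.h>

struct fs_info;                  /* opaque here */

struct root {
    struct fs_info *fs_info;
    unsigned long long objectid;
    /* ... many more fields in the real struct btrfs_root ... */
};

/* Mirror of btrfs_alloc_root(): zeroed allocation with the fs_info
 * back-pointer filled in at allocation time, so no caller can forget it. */
static struct root *alloc_root(struct fs_info *fs_info)
{
    struct root *root = calloc(1, sizeof(*root));

    if (root)
        root->fs_info = fs_info;
    return root;
}

int main(void)
{
    struct fs_info *fs_info = malloc(64);    /* stand-in allocation */
    struct root *tree_root = alloc_root(fs_info);

    if (!tree_root)
        return 1;
    printf("fs_info set: %s\n", tree_root->fs_info ? "yes" : "no");
    free(tree_root);
    free(fs_info);
    return 0;
}
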
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c99d0a8f13fa..e4bc4741319b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
46 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
47int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
48 struct btrfs_root *root, struct extent_buffer *buf); 48 struct btrfs_root *root, struct extent_buffer *buf);
49struct btrfs_root *open_ctree(struct super_block *sb, 49int open_ctree(struct super_block *sb,
50 struct btrfs_fs_devices *fs_devices, 50 struct btrfs_fs_devices *fs_devices,
51 char *options); 51 char *options);
52int close_ctree(struct btrfs_root *root); 52int close_ctree(struct btrfs_root *root);
53int write_ctree_super(struct btrfs_trans_handle *trans, 53int write_ctree_super(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, int max_mirrors); 54 struct btrfs_root *root, int max_mirrors);
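
Note on the open_ctree() signature change above: now that the tree root is reachable through the btrfs_fs_info attached to the superblock, the function can return a plain 0 / -errno instead of an ERR_PTR-encoded root pointer. A user-space sketch contrasting the two conventions (the ERR_PTR/IS_ERR stand-ins below imitate the kernel helpers of the same name):

#include <stdio.h>
#include <errno.h>

/* User-space stand-ins for the kernel's ERR_PTR()/IS_ERR() helpers. */
#define MAX_ERRNO 4095
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct tree_root { int dummy; };
static struct tree_root the_root;

/* Old style: hand back the root or an encoded error. */
static struct tree_root *open_old(int fail)
{
    return fail ? ERR_PTR(-ENOMEM) : &the_root;
}

/* New style: the root is reachable through fs_info, so a plain
 * 0 / -errno return is enough. */
static int open_new(int fail)
{
    return fail ? -ENOMEM : 0;
}

int main(void)
{
    struct tree_root *r = open_old(1);

    printf("old style error: %ld\n", IS_ERR(r) ? (long)r : 0L);
    printf("new style error: %d\n", open_new(1));
    return 0;
}
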
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1b8dc33778f9..5f77166fd01c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
67 u64 root_objectid, u32 generation, 67 u64 root_objectid, u32 generation,
68 int check_generation) 68 int check_generation)
69{ 69{
70 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; 70 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
71 struct btrfs_root *root; 71 struct btrfs_root *root;
72 struct inode *inode; 72 struct inode *inode;
73 struct btrfs_key key; 73 struct btrfs_key key;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5fbe576d2ba..700879ed64cf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
618 struct list_head *head = &info->space_info; 618 struct list_head *head = &info->space_info;
619 struct btrfs_space_info *found; 619 struct btrfs_space_info *found;
620 620
621 flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | 621 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
622 BTRFS_BLOCK_GROUP_METADATA;
623 622
624 rcu_read_lock(); 623 rcu_read_lock();
625 list_for_each_entry_rcu(found, head, list) { 624 list_for_each_entry_rcu(found, head, list) {
@@ -1872,20 +1871,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1872int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1871int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1873 struct btrfs_root *root, 1872 struct btrfs_root *root,
1874 u64 bytenr, u64 num_bytes, u64 parent, 1873 u64 bytenr, u64 num_bytes, u64 parent,
1875 u64 root_objectid, u64 owner, u64 offset) 1874 u64 root_objectid, u64 owner, u64 offset, int for_cow)
1876{ 1875{
1877 int ret; 1876 int ret;
1877 struct btrfs_fs_info *fs_info = root->fs_info;
1878
1878 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1879 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1879 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1880 root_objectid == BTRFS_TREE_LOG_OBJECTID);
1880 1881
1881 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1882 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1882 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 1883 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1884 num_bytes,
1883 parent, root_objectid, (int)owner, 1885 parent, root_objectid, (int)owner,
1884 BTRFS_ADD_DELAYED_REF, NULL); 1886 BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1885 } else { 1887 } else {
1886 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 1888 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1889 num_bytes,
1887 parent, root_objectid, owner, offset, 1890 parent, root_objectid, owner, offset,
1888 BTRFS_ADD_DELAYED_REF, NULL); 1891 BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1889 } 1892 }
1890 return ret; 1893 return ret;
1891} 1894}
@@ -2233,6 +2236,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2233 } 2236 }
2234 2237
2235 /* 2238 /*
2239 * locked_ref is the head node, so we have to go one
2240 * node back for any delayed ref updates
2241 */
2242 ref = select_delayed_ref(locked_ref);
2243
2244 if (ref && ref->seq &&
2245 btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
2246 /*
2247 * there are still refs with lower seq numbers in the
2248 * process of being added. Don't run this ref yet.
2249 */
2250 list_del_init(&locked_ref->cluster);
2251 mutex_unlock(&locked_ref->mutex);
2252 locked_ref = NULL;
2253 delayed_refs->num_heads_ready++;
2254 spin_unlock(&delayed_refs->lock);
2255 cond_resched();
2256 spin_lock(&delayed_refs->lock);
2257 continue;
2258 }
2259
2260 /*
2236 * record the must insert reserved flag before we 2261 * record the must insert reserved flag before we
2237 * drop the spin lock. 2262 * drop the spin lock.
2238 */ 2263 */
@@ -2242,11 +2267,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2242 extent_op = locked_ref->extent_op; 2267 extent_op = locked_ref->extent_op;
2243 locked_ref->extent_op = NULL; 2268 locked_ref->extent_op = NULL;
2244 2269
2245 /*
2246 * locked_ref is the head node, so we have to go one
2247 * node back for any delayed ref updates
2248 */
2249 ref = select_delayed_ref(locked_ref);
2250 if (!ref) { 2270 if (!ref) {
2251 /* All delayed refs have been processed. Go ahead	2271 /* All delayed refs have been processed. Go ahead
2252 * and send the head node to run_one_delayed_ref, 2272 * and send the head node to run_one_delayed_ref,
@@ -2267,9 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2267 BUG_ON(ret); 2287 BUG_ON(ret);
2268 kfree(extent_op); 2288 kfree(extent_op);
2269 2289
2270 cond_resched(); 2290 goto next;
2271 spin_lock(&delayed_refs->lock);
2272 continue;
2273 } 2291 }
2274 2292
2275 list_del_init(&locked_ref->cluster); 2293 list_del_init(&locked_ref->cluster);
@@ -2279,7 +2297,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2279 ref->in_tree = 0; 2297 ref->in_tree = 0;
2280 rb_erase(&ref->rb_node, &delayed_refs->root); 2298 rb_erase(&ref->rb_node, &delayed_refs->root);
2281 delayed_refs->num_entries--; 2299 delayed_refs->num_entries--;
2282 2300 /*
2301 * we modified num_entries, but as we're currently running
2302 * delayed refs, skip
2303 * wake_up(&delayed_refs->seq_wait);
2304 * here.
2305 */
2283 spin_unlock(&delayed_refs->lock); 2306 spin_unlock(&delayed_refs->lock);
2284 2307
2285 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2308 ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2289,13 +2312,34 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2289 btrfs_put_delayed_ref(ref); 2312 btrfs_put_delayed_ref(ref);
2290 kfree(extent_op); 2313 kfree(extent_op);
2291 count++; 2314 count++;
2292 2315next:
2316 do_chunk_alloc(trans, root->fs_info->extent_root,
2317 2 * 1024 * 1024,
2318 btrfs_get_alloc_profile(root, 0),
2319 CHUNK_ALLOC_NO_FORCE);
2293 cond_resched(); 2320 cond_resched();
2294 spin_lock(&delayed_refs->lock); 2321 spin_lock(&delayed_refs->lock);
2295 } 2322 }
2296 return count; 2323 return count;
2297} 2324}
2298 2325
2326
2327static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
2328 unsigned long num_refs)
2329{
2330 struct list_head *first_seq = delayed_refs->seq_head.next;
2331
2332 spin_unlock(&delayed_refs->lock);
2333 pr_debug("waiting for more refs (num %ld, first %p)\n",
2334 num_refs, first_seq);
2335 wait_event(delayed_refs->seq_wait,
2336 num_refs != delayed_refs->num_entries ||
2337 delayed_refs->seq_head.next != first_seq);
2338 pr_debug("done waiting for more refs (num %ld, first %p)\n",
2339 delayed_refs->num_entries, delayed_refs->seq_head.next);
2340 spin_lock(&delayed_refs->lock);
2341}
2342
2299/* 2343/*
2300 * this starts processing the delayed reference count updates and 2344 * this starts processing the delayed reference count updates and
2301 * extent insertions we have queued up so far. count can be 2345 * extent insertions we have queued up so far. count can be
@@ -2311,15 +2355,23 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2311 struct btrfs_delayed_ref_node *ref; 2355 struct btrfs_delayed_ref_node *ref;
2312 struct list_head cluster; 2356 struct list_head cluster;
2313 int ret; 2357 int ret;
2358 u64 delayed_start;
2314 int run_all = count == (unsigned long)-1; 2359 int run_all = count == (unsigned long)-1;
2315 int run_most = 0; 2360 int run_most = 0;
2361 unsigned long num_refs = 0;
2362 int consider_waiting;
2316 2363
2317 if (root == root->fs_info->extent_root) 2364 if (root == root->fs_info->extent_root)
2318 root = root->fs_info->tree_root; 2365 root = root->fs_info->tree_root;
2319 2366
2367 do_chunk_alloc(trans, root->fs_info->extent_root,
2368 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2369 CHUNK_ALLOC_NO_FORCE);
2370
2320 delayed_refs = &trans->transaction->delayed_refs; 2371 delayed_refs = &trans->transaction->delayed_refs;
2321 INIT_LIST_HEAD(&cluster); 2372 INIT_LIST_HEAD(&cluster);
2322again: 2373again:
2374 consider_waiting = 0;
2323 spin_lock(&delayed_refs->lock); 2375 spin_lock(&delayed_refs->lock);
2324 if (count == 0) { 2376 if (count == 0) {
2325 count = delayed_refs->num_entries * 2; 2377 count = delayed_refs->num_entries * 2;
@@ -2336,11 +2388,35 @@ again:
2336 * of refs to process starting at the first one we are able to 2388 * of refs to process starting at the first one we are able to
2337 * lock 2389 * lock
2338 */ 2390 */
2391 delayed_start = delayed_refs->run_delayed_start;
2339 ret = btrfs_find_ref_cluster(trans, &cluster, 2392 ret = btrfs_find_ref_cluster(trans, &cluster,
2340 delayed_refs->run_delayed_start); 2393 delayed_refs->run_delayed_start);
2341 if (ret) 2394 if (ret)
2342 break; 2395 break;
2343 2396
2397 if (delayed_start >= delayed_refs->run_delayed_start) {
2398 if (consider_waiting == 0) {
2399 /*
2400 * btrfs_find_ref_cluster looped. Let's do one
2401 * more cycle. If we don't run any delayed ref
2402 * during that cycle (because we can't, as all
2403 * of them are blocked), and if the number of
2404 * refs doesn't change, we avoid busy waiting.
2405 */
2406 consider_waiting = 1;
2407 num_refs = delayed_refs->num_entries;
2408 } else {
2409 wait_for_more_refs(delayed_refs, num_refs);
2410 /*
2411 * after waiting, things have changed. we
2412 * dropped the lock and someone else might have
2413 * run some refs, built new clusters and so on.
2414 * therefore, we restart staleness detection.
2415 */
2416 consider_waiting = 0;
2417 }
2418 }
2419
2344 ret = run_clustered_refs(trans, root, &cluster); 2420 ret = run_clustered_refs(trans, root, &cluster);
2345 BUG_ON(ret < 0); 2421 BUG_ON(ret < 0);
2346 2422
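
Note on the consider_waiting logic above: if btrfs_find_ref_cluster() wraps around without run_delayed_start advancing, the loop allows one more pass; if it is still stuck after that, it calls wait_for_more_refs() instead of spinning. A condensed model of that two-strikes rule (single-threaded; the real code additionally rechecks num_entries inside wait_event()):

#include <stdio.h>

/* Sketch of the staleness detection around btrfs_find_ref_cluster():
 * once the search has wrapped twice in a row with nothing run in
 * between, stop spinning and (in the kernel) wait on seq_wait. */
struct state {
    int consider_waiting;
    unsigned long num_refs_snapshot;
};

static int should_wait(struct state *s, int wrapped, unsigned long num_refs_now)
{
    if (!wrapped) {
        s->consider_waiting = 0;             /* made progress */
        return 0;
    }
    if (!s->consider_waiting) {
        s->consider_waiting = 1;             /* first wrap: one more try */
        s->num_refs_snapshot = num_refs_now;
        return 0;
    }
    return 1;                                /* second wrap in a row: wait */
}

int main(void)
{
    struct state s = { 0, 0 };

    printf("%d\n", should_wait(&s, 0, 10));  /* progress    -> keep going */
    printf("%d\n", should_wait(&s, 1, 10));  /* first wrap  -> keep going */
    printf("%d\n", should_wait(&s, 1, 10));  /* still stuck -> wait */
    return 0;
}
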
@@ -2348,6 +2424,11 @@ again:
2348 2424
2349 if (count == 0) 2425 if (count == 0)
2350 break; 2426 break;
2427
2428 if (ret || delayed_refs->run_delayed_start == 0) {
2429 /* refs were run, let's reset staleness detection */
2430 consider_waiting = 0;
2431 }
2351 } 2432 }
2352 2433
2353 if (run_all) { 2434 if (run_all) {
@@ -2405,7 +2486,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2405 extent_op->update_key = 0; 2486 extent_op->update_key = 0;
2406 extent_op->is_data = is_data ? 1 : 0; 2487 extent_op->is_data = is_data ? 1 : 0;
2407 2488
2408 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); 2489 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2490 num_bytes, extent_op);
2409 if (ret) 2491 if (ret)
2410 kfree(extent_op); 2492 kfree(extent_op);
2411 return ret; 2493 return ret;
@@ -2590,7 +2672,7 @@ out:
2590static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2672static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2591 struct btrfs_root *root, 2673 struct btrfs_root *root,
2592 struct extent_buffer *buf, 2674 struct extent_buffer *buf,
2593 int full_backref, int inc) 2675 int full_backref, int inc, int for_cow)
2594{ 2676{
2595 u64 bytenr; 2677 u64 bytenr;
2596 u64 num_bytes; 2678 u64 num_bytes;
@@ -2603,7 +2685,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2603 int level; 2685 int level;
2604 int ret = 0; 2686 int ret = 0;
2605 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2687 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2606 u64, u64, u64, u64, u64, u64); 2688 u64, u64, u64, u64, u64, u64, int);
2607 2689
2608 ref_root = btrfs_header_owner(buf); 2690 ref_root = btrfs_header_owner(buf);
2609 nritems = btrfs_header_nritems(buf); 2691 nritems = btrfs_header_nritems(buf);
@@ -2640,14 +2722,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2640 key.offset -= btrfs_file_extent_offset(buf, fi); 2722 key.offset -= btrfs_file_extent_offset(buf, fi);
2641 ret = process_func(trans, root, bytenr, num_bytes, 2723 ret = process_func(trans, root, bytenr, num_bytes,
2642 parent, ref_root, key.objectid, 2724 parent, ref_root, key.objectid,
2643 key.offset); 2725 key.offset, for_cow);
2644 if (ret) 2726 if (ret)
2645 goto fail; 2727 goto fail;
2646 } else { 2728 } else {
2647 bytenr = btrfs_node_blockptr(buf, i); 2729 bytenr = btrfs_node_blockptr(buf, i);
2648 num_bytes = btrfs_level_size(root, level - 1); 2730 num_bytes = btrfs_level_size(root, level - 1);
2649 ret = process_func(trans, root, bytenr, num_bytes, 2731 ret = process_func(trans, root, bytenr, num_bytes,
2650 parent, ref_root, level - 1, 0); 2732 parent, ref_root, level - 1, 0,
2733 for_cow);
2651 if (ret) 2734 if (ret)
2652 goto fail; 2735 goto fail;
2653 } 2736 }
@@ -2659,15 +2742,15 @@ fail:
2659} 2742}
2660 2743
2661int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2744int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2662 struct extent_buffer *buf, int full_backref) 2745 struct extent_buffer *buf, int full_backref, int for_cow)
2663{ 2746{
2664 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 2747 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2665} 2748}
2666 2749
2667int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2750int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2668 struct extent_buffer *buf, int full_backref) 2751 struct extent_buffer *buf, int full_backref, int for_cow)
2669{ 2752{
2670 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 2753 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2671} 2754}
2672 2755
2673static int write_one_cache_group(struct btrfs_trans_handle *trans, 2756static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2993,9 +3076,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2993 INIT_LIST_HEAD(&found->block_groups[i]); 3076 INIT_LIST_HEAD(&found->block_groups[i]);
2994 init_rwsem(&found->groups_sem); 3077 init_rwsem(&found->groups_sem);
2995 spin_lock_init(&found->lock); 3078 spin_lock_init(&found->lock);
2996 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | 3079 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
2997 BTRFS_BLOCK_GROUP_SYSTEM |
2998 BTRFS_BLOCK_GROUP_METADATA);
2999 found->total_bytes = total_bytes; 3080 found->total_bytes = total_bytes;
3000 found->disk_total = total_bytes * factor; 3081 found->disk_total = total_bytes * factor;
3001 found->bytes_used = bytes_used; 3082 found->bytes_used = bytes_used;
@@ -3016,20 +3097,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3016 3097
3017static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3098static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3018{ 3099{
3019 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | 3100 u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3020 BTRFS_BLOCK_GROUP_RAID1 | 3101
3021 BTRFS_BLOCK_GROUP_RAID10 | 3102 /* chunk -> extended profile */
3022 BTRFS_BLOCK_GROUP_DUP); 3103 if (extra_flags == 0)
3023 if (extra_flags) { 3104 extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3024 if (flags & BTRFS_BLOCK_GROUP_DATA) 3105
3025 fs_info->avail_data_alloc_bits |= extra_flags; 3106 if (flags & BTRFS_BLOCK_GROUP_DATA)
3026 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3107 fs_info->avail_data_alloc_bits |= extra_flags;
3027 fs_info->avail_metadata_alloc_bits |= extra_flags; 3108 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3028 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3109 fs_info->avail_metadata_alloc_bits |= extra_flags;
3029 fs_info->avail_system_alloc_bits |= extra_flags; 3110 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3030 } 3111 fs_info->avail_system_alloc_bits |= extra_flags;
3031} 3112}
3032 3113
3114/*
3115 * @flags: available profiles in extended format (see ctree.h)
3116 *
3117 * Returns reduced profile in chunk format. If profile changing is in
3118 * progress (either running or paused) picks the target profile (if it's
3119 * already available), otherwise falls back to plain reducing.
3120 */
3033u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3121u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3034{ 3122{
3035 /* 3123 /*
@@ -3040,6 +3128,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3040 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3128 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3041 root->fs_info->fs_devices->missing_devices; 3129 root->fs_info->fs_devices->missing_devices;
3042 3130
3131 /* pick restriper's target profile if it's available */
3132 spin_lock(&root->fs_info->balance_lock);
3133 if (root->fs_info->balance_ctl) {
3134 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
3135 u64 tgt = 0;
3136
3137 if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
3138 (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3139 (flags & bctl->data.target)) {
3140 tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3141 } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
3142 (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3143 (flags & bctl->sys.target)) {
3144 tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3145 } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
3146 (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3147 (flags & bctl->meta.target)) {
3148 tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3149 }
3150
3151 if (tgt) {
3152 spin_unlock(&root->fs_info->balance_lock);
3153 flags = tgt;
3154 goto out;
3155 }
3156 }
3157 spin_unlock(&root->fs_info->balance_lock);
3158
3043 if (num_devices == 1) 3159 if (num_devices == 1)
3044 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3160 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3045 if (num_devices < 4) 3161 if (num_devices < 4)
@@ -3059,22 +3175,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3059 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3175 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3060 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3176 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3061 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3177 (flags & BTRFS_BLOCK_GROUP_RAID10) |
3062 (flags & BTRFS_BLOCK_GROUP_DUP))) 3178 (flags & BTRFS_BLOCK_GROUP_DUP))) {
3063 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3179 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3180 }
3181
3182out:
3183 /* extended -> chunk profile */
3184 flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3064 return flags; 3185 return flags;
3065} 3186}
3066 3187
3067static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3188static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3068{ 3189{
3069 if (flags & BTRFS_BLOCK_GROUP_DATA) 3190 if (flags & BTRFS_BLOCK_GROUP_DATA)
3070 flags |= root->fs_info->avail_data_alloc_bits & 3191 flags |= root->fs_info->avail_data_alloc_bits;
3071 root->fs_info->data_alloc_profile;
3072 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3192 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3073 flags |= root->fs_info->avail_system_alloc_bits & 3193 flags |= root->fs_info->avail_system_alloc_bits;
3074 root->fs_info->system_alloc_profile;
3075 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3194 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3076 flags |= root->fs_info->avail_metadata_alloc_bits & 3195 flags |= root->fs_info->avail_metadata_alloc_bits;
3077 root->fs_info->metadata_alloc_profile; 3196
3078 return btrfs_reduce_alloc_profile(root, flags); 3197 return btrfs_reduce_alloc_profile(root, flags);
3079} 3198}
3080 3199
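
The profile hunks above move the in-memory allocation bookkeeping to the extended format: the "single" profile gets an explicit bit (BTRFS_AVAIL_ALLOC_BIT_SINGLE) so it can be stored in the avail_*_alloc_bits fields, it is stripped again before anything touches the on-disk chunk format, and while a convert balance is running or paused the restriper's target profile is preferred over plain reducing. Below is a minimal userspace sketch of just the two format conversions; the flag values are illustrative stand-ins, not the real ctree.h definitions.

#include <stdint.h>
#include <stdio.h>

/* Illustrative values only; the real masks live in fs/btrfs/ctree.h. */
#define BG_RAID0               (1ULL << 3)
#define BG_RAID1               (1ULL << 4)
#define BG_DUP                 (1ULL << 5)
#define BG_RAID10              (1ULL << 6)
#define BG_PROFILE_MASK        (BG_RAID0 | BG_RAID1 | BG_DUP | BG_RAID10)
#define AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)   /* in-memory only */

/* chunk -> extended: a profile of 0 becomes the explicit "single" bit */
static uint64_t chunk_to_extended(uint64_t flags)
{
	if ((flags & BG_PROFILE_MASK) == 0)
		flags |= AVAIL_ALLOC_BIT_SINGLE;
	return flags;
}

/* extended -> chunk: drop the in-memory "single" bit again */
static uint64_t extended_to_chunk(uint64_t flags)
{
	return flags & ~AVAIL_ALLOC_BIT_SINGLE;
}

int main(void)
{
	uint64_t ext = chunk_to_extended(0);   /* "single" profile */
	printf("extended=0x%llx chunk=0x%llx\n",
	       (unsigned long long)ext,
	       (unsigned long long)extended_to_chunk(ext));
	return 0;
}
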
@@ -3191,6 +3310,8 @@ commit_trans:
3191 return -ENOSPC; 3310 return -ENOSPC;
3192 } 3311 }
3193 data_sinfo->bytes_may_use += bytes; 3312 data_sinfo->bytes_may_use += bytes;
3313 trace_btrfs_space_reservation(root->fs_info, "space_info",
3314 (u64)data_sinfo, bytes, 1);
3194 spin_unlock(&data_sinfo->lock); 3315 spin_unlock(&data_sinfo->lock);
3195 3316
3196 return 0; 3317 return 0;
@@ -3210,6 +3331,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3210 data_sinfo = BTRFS_I(inode)->space_info; 3331 data_sinfo = BTRFS_I(inode)->space_info;
3211 spin_lock(&data_sinfo->lock); 3332 spin_lock(&data_sinfo->lock);
3212 data_sinfo->bytes_may_use -= bytes; 3333 data_sinfo->bytes_may_use -= bytes;
3334 trace_btrfs_space_reservation(root->fs_info, "space_info",
3335 (u64)data_sinfo, bytes, 0);
3213 spin_unlock(&data_sinfo->lock); 3336 spin_unlock(&data_sinfo->lock);
3214} 3337}
3215 3338
@@ -3257,27 +3380,15 @@ static int should_alloc_chunk(struct btrfs_root *root,
3257 if (num_bytes - num_allocated < thresh) 3380 if (num_bytes - num_allocated < thresh)
3258 return 1; 3381 return 1;
3259 } 3382 }
3260
3261 /*
3262 * we have two similar checks here, one based on percentage
3263 * and one based on a hard number of 256MB. The idea
3264 * is that if we have a good amount of free
3265 * room, don't allocate a chunk. A good amount is
3266 * less than 80% utilized of the chunks we have allocated,
3267 * or more than 256MB free
3268 */
3269 if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3270 return 0;
3271
3272 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3273 return 0;
3274
3275 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3383 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3276 3384
3277 /* 256MB or 5% of the FS */ 3385 /* 256MB or 2% of the FS */
3278 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3386 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
3388 /* system chunks need a much smaller threshold */
3388 if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3389 thresh = 32 * 1024 * 1024;
3279 3390
3280 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) 3391 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
3281 return 0; 3392 return 0;
3282 return 1; 3393 return 1;
3283} 3394}
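
With the hunk above, should_alloc_chunk() refuses a new chunk only when the space_info is already larger than a small floor (2% of the filesystem or 256MB, whichever is larger, and just 32MB for system chunks) and is still under 80% used. A rough userspace model of that check, assuming div_factor() scales by n/10 and div_factor_fine() by n/100 as the kernel helpers do:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t div_factor(uint64_t num, int f)      { return num * f / 10; }
static uint64_t div_factor_fine(uint64_t num, int f) { return num * f / 100; }

static bool should_alloc_chunk(uint64_t fs_total_bytes,
			       uint64_t sinfo_total_bytes,
			       uint64_t sinfo_bytes_used,
			       bool is_system)
{
	/* 256MB or 2% of the filesystem, whichever is larger ... */
	uint64_t thresh = div_factor_fine(fs_total_bytes, 2);
	if (thresh < 256ULL * 1024 * 1024)
		thresh = 256ULL * 1024 * 1024;
	/* ... but system chunks use a much smaller threshold */
	if (is_system)
		thresh = 32ULL * 1024 * 1024;

	/* already large and less than 80% used: no new chunk needed */
	if (sinfo_total_bytes > thresh &&
	    sinfo_bytes_used < div_factor(sinfo_total_bytes, 8))
		return false;
	return true;
}

int main(void)
{
	/* 1TB fs, 100GB data space_info at 50% use: prints 0, no new chunk */
	printf("%d\n", should_alloc_chunk(1ULL << 40, 100ULL << 30,
					  50ULL << 30, false));
	return 0;
}
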
@@ -3291,7 +3402,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3291 int wait_for_alloc = 0; 3402 int wait_for_alloc = 0;
3292 int ret = 0; 3403 int ret = 0;
3293 3404
3294 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3405 BUG_ON(!profile_is_valid(flags, 0));
3295 3406
3296 space_info = __find_space_info(extent_root->fs_info, flags); 3407 space_info = __find_space_info(extent_root->fs_info, flags);
3297 if (!space_info) { 3408 if (!space_info) {
@@ -3582,6 +3693,10 @@ again:
3582 if (used <= space_info->total_bytes) { 3693 if (used <= space_info->total_bytes) {
3583 if (used + orig_bytes <= space_info->total_bytes) { 3694 if (used + orig_bytes <= space_info->total_bytes) {
3584 space_info->bytes_may_use += orig_bytes; 3695 space_info->bytes_may_use += orig_bytes;
3696 trace_btrfs_space_reservation(root->fs_info,
3697 "space_info",
3698 (u64)space_info,
3699 orig_bytes, 1);
3585 ret = 0; 3700 ret = 0;
3586 } else { 3701 } else {
3587 /* 3702 /*
@@ -3649,6 +3764,10 @@ again:
3649 3764
3650 if (used + num_bytes < space_info->total_bytes + avail) { 3765 if (used + num_bytes < space_info->total_bytes + avail) {
3651 space_info->bytes_may_use += orig_bytes; 3766 space_info->bytes_may_use += orig_bytes;
3767 trace_btrfs_space_reservation(root->fs_info,
3768 "space_info",
3769 (u64)space_info,
3770 orig_bytes, 1);
3652 ret = 0; 3771 ret = 0;
3653 } else { 3772 } else {
3654 wait_ordered = true; 3773 wait_ordered = true;
@@ -3755,7 +3874,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3755 spin_unlock(&block_rsv->lock); 3874 spin_unlock(&block_rsv->lock);
3756} 3875}
3757 3876
3758static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, 3877static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
3878 struct btrfs_block_rsv *block_rsv,
3759 struct btrfs_block_rsv *dest, u64 num_bytes) 3879 struct btrfs_block_rsv *dest, u64 num_bytes)
3760{ 3880{
3761 struct btrfs_space_info *space_info = block_rsv->space_info; 3881 struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3791,6 +3911,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3791 if (num_bytes) { 3911 if (num_bytes) {
3792 spin_lock(&space_info->lock); 3912 spin_lock(&space_info->lock);
3793 space_info->bytes_may_use -= num_bytes; 3913 space_info->bytes_may_use -= num_bytes;
3914 trace_btrfs_space_reservation(fs_info, "space_info",
3915 (u64)space_info,
3916 num_bytes, 0);
3794 space_info->reservation_progress++; 3917 space_info->reservation_progress++;
3795 spin_unlock(&space_info->lock); 3918 spin_unlock(&space_info->lock);
3796 } 3919 }
@@ -3947,7 +4070,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
3947 if (global_rsv->full || global_rsv == block_rsv || 4070 if (global_rsv->full || global_rsv == block_rsv ||
3948 block_rsv->space_info != global_rsv->space_info) 4071 block_rsv->space_info != global_rsv->space_info)
3949 global_rsv = NULL; 4072 global_rsv = NULL;
3950 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); 4073 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4074 num_bytes);
3951} 4075}
3952 4076
3953/* 4077/*
@@ -4006,11 +4130,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4006 num_bytes = sinfo->total_bytes - num_bytes; 4130 num_bytes = sinfo->total_bytes - num_bytes;
4007 block_rsv->reserved += num_bytes; 4131 block_rsv->reserved += num_bytes;
4008 sinfo->bytes_may_use += num_bytes; 4132 sinfo->bytes_may_use += num_bytes;
4133 trace_btrfs_space_reservation(fs_info, "space_info",
4134 (u64)sinfo, num_bytes, 1);
4009 } 4135 }
4010 4136
4011 if (block_rsv->reserved >= block_rsv->size) { 4137 if (block_rsv->reserved >= block_rsv->size) {
4012 num_bytes = block_rsv->reserved - block_rsv->size; 4138 num_bytes = block_rsv->reserved - block_rsv->size;
4013 sinfo->bytes_may_use -= num_bytes; 4139 sinfo->bytes_may_use -= num_bytes;
4140 trace_btrfs_space_reservation(fs_info, "space_info",
4141 (u64)sinfo, num_bytes, 0);
4014 sinfo->reservation_progress++; 4142 sinfo->reservation_progress++;
4015 block_rsv->reserved = block_rsv->size; 4143 block_rsv->reserved = block_rsv->size;
4016 block_rsv->full = 1; 4144 block_rsv->full = 1;
@@ -4045,7 +4173,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4045 4173
4046static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4174static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4047{ 4175{
4048 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); 4176 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4177 (u64)-1);
4049 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4178 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4050 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4179 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4051 WARN_ON(fs_info->trans_block_rsv.size > 0); 4180 WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4062,6 +4191,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4062 if (!trans->bytes_reserved) 4191 if (!trans->bytes_reserved)
4063 return; 4192 return;
4064 4193
4194 trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans,
4195 trans->bytes_reserved, 0);
4065 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4196 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4066 trans->bytes_reserved = 0; 4197 trans->bytes_reserved = 0;
4067} 4198}
@@ -4079,6 +4210,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4079 * when we are truly done with the orphan item. 4210 * when we are truly done with the orphan item.
4080 */ 4211 */
4081 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4212 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4213 trace_btrfs_space_reservation(root->fs_info, "orphan",
4214 btrfs_ino(inode), num_bytes, 1);
4082 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4215 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4083} 4216}
4084 4217
@@ -4086,6 +4219,8 @@ void btrfs_orphan_release_metadata(struct inode *inode)
4086{ 4219{
4087 struct btrfs_root *root = BTRFS_I(inode)->root; 4220 struct btrfs_root *root = BTRFS_I(inode)->root;
4088 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4221 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4222 trace_btrfs_space_reservation(root->fs_info, "orphan",
4223 btrfs_ino(inode), num_bytes, 0);
4089 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4224 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4090} 4225}
4091 4226
@@ -4213,12 +4348,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4213 /* Need to be holding the i_mutex here if we aren't free space cache */ 4348 /* Need to be holding the i_mutex here if we aren't free space cache */
4214 if (btrfs_is_free_space_inode(root, inode)) 4349 if (btrfs_is_free_space_inode(root, inode))
4215 flush = 0; 4350 flush = 0;
4216 else
4217 WARN_ON(!mutex_is_locked(&inode->i_mutex));
4218 4351
4219 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4352 if (flush && btrfs_transaction_in_commit(root->fs_info))
4220 schedule_timeout(1); 4353 schedule_timeout(1);
4221 4354
4355 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4222 num_bytes = ALIGN(num_bytes, root->sectorsize); 4356 num_bytes = ALIGN(num_bytes, root->sectorsize);
4223 4357
4224 spin_lock(&BTRFS_I(inode)->lock); 4358 spin_lock(&BTRFS_I(inode)->lock);
@@ -4266,8 +4400,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4266 if (dropped) 4400 if (dropped)
4267 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4401 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4268 4402
4269 if (to_free) 4403 if (to_free) {
4270 btrfs_block_rsv_release(root, block_rsv, to_free); 4404 btrfs_block_rsv_release(root, block_rsv, to_free);
4405 trace_btrfs_space_reservation(root->fs_info,
4406 "delalloc",
4407 btrfs_ino(inode),
4408 to_free, 0);
4409 }
4410 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4271 return ret; 4411 return ret;
4272 } 4412 }
4273 4413
@@ -4278,7 +4418,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4278 } 4418 }
4279 BTRFS_I(inode)->reserved_extents += nr_extents; 4419 BTRFS_I(inode)->reserved_extents += nr_extents;
4280 spin_unlock(&BTRFS_I(inode)->lock); 4420 spin_unlock(&BTRFS_I(inode)->lock);
4421 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4281 4422
4423 if (to_reserve)
4424 trace_btrfs_space_reservation(root->fs_info,"delalloc",
4425 btrfs_ino(inode), to_reserve, 1);
4282 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4426 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4283 4427
4284 return 0; 4428 return 0;
@@ -4308,6 +4452,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4308 if (dropped > 0) 4452 if (dropped > 0)
4309 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4453 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4310 4454
4455 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4456 btrfs_ino(inode), to_free, 0);
4311 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4457 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4312 to_free); 4458 to_free);
4313} 4459}
@@ -4562,7 +4708,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4562 cache->reserved += num_bytes; 4708 cache->reserved += num_bytes;
4563 space_info->bytes_reserved += num_bytes; 4709 space_info->bytes_reserved += num_bytes;
4564 if (reserve == RESERVE_ALLOC) { 4710 if (reserve == RESERVE_ALLOC) {
4565 BUG_ON(space_info->bytes_may_use < num_bytes); 4711 trace_btrfs_space_reservation(cache->fs_info,
4712 "space_info",
4713 (u64)space_info,
4714 num_bytes, 0);
4566 space_info->bytes_may_use -= num_bytes; 4715 space_info->bytes_may_use -= num_bytes;
4567 } 4716 }
4568 } 4717 }
@@ -4928,6 +5077,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4928 rb_erase(&head->node.rb_node, &delayed_refs->root); 5077 rb_erase(&head->node.rb_node, &delayed_refs->root);
4929 5078
4930 delayed_refs->num_entries--; 5079 delayed_refs->num_entries--;
5080 if (waitqueue_active(&delayed_refs->seq_wait))
5081 wake_up(&delayed_refs->seq_wait);
4931 5082
4932 /* 5083 /*
4933 * we don't take a ref on the node because we're removing it from the 5084 * we don't take a ref on the node because we're removing it from the
@@ -4955,16 +5106,17 @@ out:
4955void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5106void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4956 struct btrfs_root *root, 5107 struct btrfs_root *root,
4957 struct extent_buffer *buf, 5108 struct extent_buffer *buf,
4958 u64 parent, int last_ref) 5109 u64 parent, int last_ref, int for_cow)
4959{ 5110{
4960 struct btrfs_block_group_cache *cache = NULL; 5111 struct btrfs_block_group_cache *cache = NULL;
4961 int ret; 5112 int ret;
4962 5113
4963 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5114 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4964 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, 5115 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
4965 parent, root->root_key.objectid, 5116 buf->start, buf->len,
4966 btrfs_header_level(buf), 5117 parent, root->root_key.objectid,
4967 BTRFS_DROP_DELAYED_REF, NULL); 5118 btrfs_header_level(buf),
5119 BTRFS_DROP_DELAYED_REF, NULL, for_cow);
4968 BUG_ON(ret); 5120 BUG_ON(ret);
4969 } 5121 }
4970 5122
@@ -4999,12 +5151,12 @@ out:
4999 btrfs_put_block_group(cache); 5151 btrfs_put_block_group(cache);
5000} 5152}
5001 5153
5002int btrfs_free_extent(struct btrfs_trans_handle *trans, 5154int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5003 struct btrfs_root *root, 5155 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5004 u64 bytenr, u64 num_bytes, u64 parent, 5156 u64 owner, u64 offset, int for_cow)
5005 u64 root_objectid, u64 owner, u64 offset)
5006{ 5157{
5007 int ret; 5158 int ret;
5159 struct btrfs_fs_info *fs_info = root->fs_info;
5008 5160
5009 /* 5161 /*
5010 * tree log blocks never actually go into the extent allocation 5162 * tree log blocks never actually go into the extent allocation
@@ -5016,14 +5168,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
5016 btrfs_pin_extent(root, bytenr, num_bytes, 1); 5168 btrfs_pin_extent(root, bytenr, num_bytes, 1);
5017 ret = 0; 5169 ret = 0;
5018 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5170 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5019 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 5171 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5172 num_bytes,
5020 parent, root_objectid, (int)owner, 5173 parent, root_objectid, (int)owner,
5021 BTRFS_DROP_DELAYED_REF, NULL); 5174 BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5022 BUG_ON(ret); 5175 BUG_ON(ret);
5023 } else { 5176 } else {
5024 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 5177 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5025 parent, root_objectid, owner, 5178 num_bytes,
5026 offset, BTRFS_DROP_DELAYED_REF, NULL); 5179 parent, root_objectid, owner,
5180 offset, BTRFS_DROP_DELAYED_REF,
5181 NULL, for_cow);
5027 BUG_ON(ret); 5182 BUG_ON(ret);
5028 } 5183 }
5029 return ret; 5184 return ret;
@@ -5146,6 +5301,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5146 ins->objectid = 0; 5301 ins->objectid = 0;
5147 ins->offset = 0; 5302 ins->offset = 0;
5148 5303
5304 trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5305
5149 space_info = __find_space_info(root->fs_info, data); 5306 space_info = __find_space_info(root->fs_info, data);
5150 if (!space_info) { 5307 if (!space_info) {
5151 printk(KERN_ERR "No space info for %llu\n", data); 5308 printk(KERN_ERR "No space info for %llu\n", data);
@@ -5295,15 +5452,6 @@ alloc:
5295 if (unlikely(block_group->ro)) 5452 if (unlikely(block_group->ro))
5296 goto loop; 5453 goto loop;
5297 5454
5298 spin_lock(&block_group->free_space_ctl->tree_lock);
5299 if (cached &&
5300 block_group->free_space_ctl->free_space <
5301 num_bytes + empty_cluster + empty_size) {
5302 spin_unlock(&block_group->free_space_ctl->tree_lock);
5303 goto loop;
5304 }
5305 spin_unlock(&block_group->free_space_ctl->tree_lock);
5306
5307 /* 5455 /*
5308 * Ok we want to try and use the cluster allocator, so 5456 * Ok we want to try and use the cluster allocator, so
5309 * lets look there 5457 * lets look there
@@ -5331,6 +5479,8 @@ alloc:
5331 if (offset) { 5479 if (offset) {
5332 /* we have a block, we're done */ 5480 /* we have a block, we're done */
5333 spin_unlock(&last_ptr->refill_lock); 5481 spin_unlock(&last_ptr->refill_lock);
5482 trace_btrfs_reserve_extent_cluster(root,
5483 block_group, search_start, num_bytes);
5334 goto checks; 5484 goto checks;
5335 } 5485 }
5336 5486
@@ -5349,8 +5499,15 @@ refill_cluster:
5349 * plenty of times and not have found 5499 * plenty of times and not have found
5350 * anything, so we are likely way too 5500 * anything, so we are likely way too
5351 * fragmented for the clustering stuff to find 5501 * fragmented for the clustering stuff to find
5352 * anything. */ 5502 * anything.
5353 if (loop >= LOOP_NO_EMPTY_SIZE) { 5503 *
5504 * However, if the cluster is taken from the
5505 * current block group, release the cluster
5506 * first, so that we stand a better chance of
5507 * succeeding in the unclustered
5508 * allocation. */
5509 if (loop >= LOOP_NO_EMPTY_SIZE &&
5510 last_ptr->block_group != block_group) {
5354 spin_unlock(&last_ptr->refill_lock); 5511 spin_unlock(&last_ptr->refill_lock);
5355 goto unclustered_alloc; 5512 goto unclustered_alloc;
5356 } 5513 }
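
As the new comment explains, a failed cluster allocation now jumps straight to the unclustered path only when the stale cluster belongs to a different block group; a cluster taken from the current group is returned to the free-space tree first, so the unclustered scan can see that space. A condensed sketch of the decision order, with the locking and refill steps elided:

#include <stdbool.h>
#include <stdio.h>

enum next_step { TRY_REFILL_CLUSTER, UNCLUSTERED_ALLOC };

/* Simplified model of the fallback order around refill_cluster:. */
static enum next_step after_cluster_miss(int loop, int loop_no_empty_size,
					 bool cluster_in_this_group)
{
	/*
	 * Way too fragmented: give up on clustering, unless the stale
	 * cluster sits in this very block group, in which case it is
	 * released first so the unclustered scan has more to work with.
	 */
	if (loop >= loop_no_empty_size && !cluster_in_this_group)
		return UNCLUSTERED_ALLOC;

	/* btrfs_return_cluster_to_free_space() happens here in the kernel */

	if (loop >= loop_no_empty_size)
		return UNCLUSTERED_ALLOC;

	return TRY_REFILL_CLUSTER;
}

int main(void)
{
	/* release the local cluster, then fall back: prints 1 */
	printf("%d\n", after_cluster_miss(3, 3, true));
	return 0;
}
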
@@ -5361,6 +5518,11 @@ refill_cluster:
5361 */ 5518 */
5362 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5519 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5363 5520
5521 if (loop >= LOOP_NO_EMPTY_SIZE) {
5522 spin_unlock(&last_ptr->refill_lock);
5523 goto unclustered_alloc;
5524 }
5525
5364 /* allocate a cluster in this block group */ 5526 /* allocate a cluster in this block group */
5365 ret = btrfs_find_space_cluster(trans, root, 5527 ret = btrfs_find_space_cluster(trans, root,
5366 block_group, last_ptr, 5528 block_group, last_ptr,
@@ -5377,6 +5539,9 @@ refill_cluster:
5377 if (offset) { 5539 if (offset) {
5378 /* we found one, proceed */ 5540 /* we found one, proceed */
5379 spin_unlock(&last_ptr->refill_lock); 5541 spin_unlock(&last_ptr->refill_lock);
5542 trace_btrfs_reserve_extent_cluster(root,
5543 block_group, search_start,
5544 num_bytes);
5380 goto checks; 5545 goto checks;
5381 } 5546 }
5382 } else if (!cached && loop > LOOP_CACHING_NOWAIT 5547 } else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5401,6 +5566,15 @@ refill_cluster:
5401 } 5566 }
5402 5567
5403unclustered_alloc: 5568unclustered_alloc:
5569 spin_lock(&block_group->free_space_ctl->tree_lock);
5570 if (cached &&
5571 block_group->free_space_ctl->free_space <
5572 num_bytes + empty_cluster + empty_size) {
5573 spin_unlock(&block_group->free_space_ctl->tree_lock);
5574 goto loop;
5575 }
5576 spin_unlock(&block_group->free_space_ctl->tree_lock);
5577
5404 offset = btrfs_find_space_for_alloc(block_group, search_start, 5578 offset = btrfs_find_space_for_alloc(block_group, search_start,
5405 num_bytes, empty_size); 5579 num_bytes, empty_size);
5406 /* 5580 /*
@@ -5438,9 +5612,6 @@ checks:
5438 goto loop; 5612 goto loop;
5439 } 5613 }
5440 5614
5441 ins->objectid = search_start;
5442 ins->offset = num_bytes;
5443
5444 if (offset < search_start) 5615 if (offset < search_start)
5445 btrfs_add_free_space(used_block_group, offset, 5616 btrfs_add_free_space(used_block_group, offset,
5446 search_start - offset); 5617 search_start - offset);
@@ -5457,6 +5628,8 @@ checks:
5457 ins->objectid = search_start; 5628 ins->objectid = search_start;
5458 ins->offset = num_bytes; 5629 ins->offset = num_bytes;
5459 5630
5631 trace_btrfs_reserve_extent(orig_root, block_group,
5632 search_start, num_bytes);
5460 if (offset < search_start) 5633 if (offset < search_start)
5461 btrfs_add_free_space(used_block_group, offset, 5634 btrfs_add_free_space(used_block_group, offset,
5462 search_start - offset); 5635 search_start - offset);
@@ -5842,9 +6015,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5842 6015
5843 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 6016 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5844 6017
5845 ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, 6018 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
5846 0, root_objectid, owner, offset, 6019 ins->offset, 0,
5847 BTRFS_ADD_DELAYED_EXTENT, NULL); 6020 root_objectid, owner, offset,
6021 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
5848 return ret; 6022 return ret;
5849} 6023}
5850 6024
@@ -5997,10 +6171,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5997 return ERR_PTR(-ENOSPC); 6171 return ERR_PTR(-ENOSPC);
5998} 6172}
5999 6173
6000static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) 6174static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6175 struct btrfs_block_rsv *block_rsv, u32 blocksize)
6001{ 6176{
6002 block_rsv_add_bytes(block_rsv, blocksize, 0); 6177 block_rsv_add_bytes(block_rsv, blocksize, 0);
6003 block_rsv_release_bytes(block_rsv, NULL, 0); 6178 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6004} 6179}
6005 6180
6006/* 6181/*
@@ -6014,7 +6189,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6014 struct btrfs_root *root, u32 blocksize, 6189 struct btrfs_root *root, u32 blocksize,
6015 u64 parent, u64 root_objectid, 6190 u64 parent, u64 root_objectid,
6016 struct btrfs_disk_key *key, int level, 6191 struct btrfs_disk_key *key, int level,
6017 u64 hint, u64 empty_size) 6192 u64 hint, u64 empty_size, int for_cow)
6018{ 6193{
6019 struct btrfs_key ins; 6194 struct btrfs_key ins;
6020 struct btrfs_block_rsv *block_rsv; 6195 struct btrfs_block_rsv *block_rsv;
@@ -6030,7 +6205,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6030 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, 6205 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6031 empty_size, hint, (u64)-1, &ins, 0); 6206 empty_size, hint, (u64)-1, &ins, 0);
6032 if (ret) { 6207 if (ret) {
6033 unuse_block_rsv(block_rsv, blocksize); 6208 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6034 return ERR_PTR(ret); 6209 return ERR_PTR(ret);
6035 } 6210 }
6036 6211
@@ -6058,10 +6233,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6058 extent_op->update_flags = 1; 6233 extent_op->update_flags = 1;
6059 extent_op->is_data = 0; 6234 extent_op->is_data = 0;
6060 6235
6061 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, 6236 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6237 ins.objectid,
6062 ins.offset, parent, root_objectid, 6238 ins.offset, parent, root_objectid,
6063 level, BTRFS_ADD_DELAYED_EXTENT, 6239 level, BTRFS_ADD_DELAYED_EXTENT,
6064 extent_op); 6240 extent_op, for_cow);
6065 BUG_ON(ret); 6241 BUG_ON(ret);
6066 } 6242 }
6067 return buf; 6243 return buf;
@@ -6078,6 +6254,7 @@ struct walk_control {
6078 int keep_locks; 6254 int keep_locks;
6079 int reada_slot; 6255 int reada_slot;
6080 int reada_count; 6256 int reada_count;
6257 int for_reloc;
6081}; 6258};
6082 6259
6083#define DROP_REFERENCE 1 6260#define DROP_REFERENCE 1
@@ -6216,9 +6393,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6216 /* wc->stage == UPDATE_BACKREF */ 6393 /* wc->stage == UPDATE_BACKREF */
6217 if (!(wc->flags[level] & flag)) { 6394 if (!(wc->flags[level] & flag)) {
6218 BUG_ON(!path->locks[level]); 6395 BUG_ON(!path->locks[level]);
6219 ret = btrfs_inc_ref(trans, root, eb, 1); 6396 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6220 BUG_ON(ret); 6397 BUG_ON(ret);
6221 ret = btrfs_dec_ref(trans, root, eb, 0); 6398 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6222 BUG_ON(ret); 6399 BUG_ON(ret);
6223 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 6400 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6224 eb->len, flag, 0); 6401 eb->len, flag, 0);
@@ -6362,7 +6539,7 @@ skip:
6362 } 6539 }
6363 6540
6364 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 6541 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6365 root->root_key.objectid, level - 1, 0); 6542 root->root_key.objectid, level - 1, 0, 0);
6366 BUG_ON(ret); 6543 BUG_ON(ret);
6367 } 6544 }
6368 btrfs_tree_unlock(next); 6545 btrfs_tree_unlock(next);
@@ -6436,9 +6613,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6436 if (wc->refs[level] == 1) { 6613 if (wc->refs[level] == 1) {
6437 if (level == 0) { 6614 if (level == 0) {
6438 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 6615 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6439 ret = btrfs_dec_ref(trans, root, eb, 1); 6616 ret = btrfs_dec_ref(trans, root, eb, 1,
6617 wc->for_reloc);
6440 else 6618 else
6441 ret = btrfs_dec_ref(trans, root, eb, 0); 6619 ret = btrfs_dec_ref(trans, root, eb, 0,
6620 wc->for_reloc);
6442 BUG_ON(ret); 6621 BUG_ON(ret);
6443 } 6622 }
6444 /* make block locked assertion in clean_tree_block happy */ 6623 /* make block locked assertion in clean_tree_block happy */
@@ -6465,7 +6644,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6465 btrfs_header_owner(path->nodes[level + 1])); 6644 btrfs_header_owner(path->nodes[level + 1]));
6466 } 6645 }
6467 6646
6468 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 6647 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
6469out: 6648out:
6470 wc->refs[level] = 0; 6649 wc->refs[level] = 0;
6471 wc->flags[level] = 0; 6650 wc->flags[level] = 0;
@@ -6549,7 +6728,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6549 * blocks are properly updated. 6728 * blocks are properly updated.
6550 */ 6729 */
6551void btrfs_drop_snapshot(struct btrfs_root *root, 6730void btrfs_drop_snapshot(struct btrfs_root *root,
6552 struct btrfs_block_rsv *block_rsv, int update_ref) 6731 struct btrfs_block_rsv *block_rsv, int update_ref,
6732 int for_reloc)
6553{ 6733{
6554 struct btrfs_path *path; 6734 struct btrfs_path *path;
6555 struct btrfs_trans_handle *trans; 6735 struct btrfs_trans_handle *trans;
@@ -6637,6 +6817,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
6637 wc->stage = DROP_REFERENCE; 6817 wc->stage = DROP_REFERENCE;
6638 wc->update_ref = update_ref; 6818 wc->update_ref = update_ref;
6639 wc->keep_locks = 0; 6819 wc->keep_locks = 0;
6820 wc->for_reloc = for_reloc;
6640 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6821 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6641 6822
6642 while (1) { 6823 while (1) {
@@ -6721,6 +6902,7 @@ out:
6721 * drop subtree rooted at tree block 'node'. 6902 * drop subtree rooted at tree block 'node'.
6722 * 6903 *
6723 * NOTE: this function will unlock and release tree block 'node' 6904 * NOTE: this function will unlock and release tree block 'node'
6905 * only used by relocation code
6724 */ 6906 */
6725int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 6907int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6726 struct btrfs_root *root, 6908 struct btrfs_root *root,
@@ -6765,6 +6947,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6765 wc->stage = DROP_REFERENCE; 6947 wc->stage = DROP_REFERENCE;
6766 wc->update_ref = 0; 6948 wc->update_ref = 0;
6767 wc->keep_locks = 1; 6949 wc->keep_locks = 1;
6950 wc->for_reloc = 1;
6768 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6951 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6769 6952
6770 while (1) { 6953 while (1) {
@@ -6792,6 +6975,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6792 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 6975 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
6793 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 6976 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
6794 6977
6978 if (root->fs_info->balance_ctl) {
6979 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
6980 u64 tgt = 0;
6981
6982 /* pick restriper's target profile and return */
6983 if (flags & BTRFS_BLOCK_GROUP_DATA &&
6984 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6985 tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
6986 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
6987 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6988 tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
6989 } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
6990 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6991 tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
6992 }
6993
6994 if (tgt) {
6995 /* extended -> chunk profile */
6996 tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
6997 return tgt;
6998 }
6999 }
7000
6795 /* 7001 /*
6796 * we add in the count of missing devices because we want 7002 * we add in the count of missing devices because we want
6797 * to make sure that any RAID levels on a degraded FS 7003 * to make sure that any RAID levels on a degraded FS
@@ -7085,7 +7291,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7085 * space to fit our block group in. 7291 * space to fit our block group in.
7086 */ 7292 */
7087 if (device->total_bytes > device->bytes_used + min_free) { 7293 if (device->total_bytes > device->bytes_used + min_free) {
7088 ret = find_free_dev_extent(NULL, device, min_free, 7294 ret = find_free_dev_extent(device, min_free,
7089 &dev_offset, NULL); 7295 &dev_offset, NULL);
7090 if (!ret) 7296 if (!ret)
7091 dev_nr++; 7297 dev_nr++;
@@ -7447,6 +7653,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7447 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7653 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7448 &cache->space_info); 7654 &cache->space_info);
7449 BUG_ON(ret); 7655 BUG_ON(ret);
7656 update_global_block_rsv(root->fs_info);
7450 7657
7451 spin_lock(&cache->space_info->lock); 7658 spin_lock(&cache->space_info->lock);
7452 cache->space_info->bytes_readonly += cache->bytes_super; 7659 cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7466,6 +7673,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7466 return 0; 7673 return 0;
7467} 7674}
7468 7675
7676static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7677{
7678 u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
7679
7680 /* chunk -> extended profile */
7681 if (extra_flags == 0)
7682 extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
7683
7684 if (flags & BTRFS_BLOCK_GROUP_DATA)
7685 fs_info->avail_data_alloc_bits &= ~extra_flags;
7686 if (flags & BTRFS_BLOCK_GROUP_METADATA)
7687 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7688 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7689 fs_info->avail_system_alloc_bits &= ~extra_flags;
7690}
7691
7469int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 7692int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7470 struct btrfs_root *root, u64 group_start) 7693 struct btrfs_root *root, u64 group_start)
7471{ 7694{
@@ -7476,6 +7699,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7476 struct btrfs_key key; 7699 struct btrfs_key key;
7477 struct inode *inode; 7700 struct inode *inode;
7478 int ret; 7701 int ret;
7702 int index;
7479 int factor; 7703 int factor;
7480 7704
7481 root = root->fs_info->extent_root; 7705 root = root->fs_info->extent_root;
@@ -7491,6 +7715,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7491 free_excluded_extents(root, block_group); 7715 free_excluded_extents(root, block_group);
7492 7716
7493 memcpy(&key, &block_group->key, sizeof(key)); 7717 memcpy(&key, &block_group->key, sizeof(key));
7718 index = get_block_group_index(block_group);
7494 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 7719 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7495 BTRFS_BLOCK_GROUP_RAID1 | 7720 BTRFS_BLOCK_GROUP_RAID1 |
7496 BTRFS_BLOCK_GROUP_RAID10)) 7721 BTRFS_BLOCK_GROUP_RAID10))
@@ -7565,6 +7790,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7565 * are still on the list after taking the semaphore 7790 * are still on the list after taking the semaphore
7566 */ 7791 */
7567 list_del_init(&block_group->list); 7792 list_del_init(&block_group->list);
7793 if (list_empty(&block_group->space_info->block_groups[index]))
7794 clear_avail_alloc_bits(root->fs_info, block_group->flags);
7568 up_write(&block_group->space_info->groups_sem); 7795 up_write(&block_group->space_info->groups_sem);
7569 7796
7570 if (block_group->cached == BTRFS_CACHE_STARTED) 7797 if (block_group->cached == BTRFS_CACHE_STARTED)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 49f3c9dc09f4..9d09a4f81875 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -18,6 +18,7 @@
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h" 20#include "volumes.h"
21#include "check-integrity.h"
21 22
22static struct kmem_cache *extent_state_cache; 23static struct kmem_cache *extent_state_cache;
23static struct kmem_cache *extent_buffer_cache; 24static struct kmem_cache *extent_buffer_cache;
@@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1895 } 1896 }
1896 bio->bi_bdev = dev->bdev; 1897 bio->bi_bdev = dev->bdev;
1897 bio_add_page(bio, page, length, start-page_offset(page)); 1898 bio_add_page(bio, page, length, start-page_offset(page));
1898 submit_bio(WRITE_SYNC, bio); 1899 btrfsic_submit_bio(WRITE_SYNC, bio);
1899 wait_for_completion(&compl); 1900 wait_for_completion(&compl);
1900 1901
1901 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 1902 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
2393 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 2394 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2394 mirror_num, bio_flags, start); 2395 mirror_num, bio_flags, start);
2395 else 2396 else
2396 submit_bio(rw, bio); 2397 btrfsic_submit_bio(rw, bio);
2397 2398
2398 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2399 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2399 ret = -EOPNOTSUPP; 2400 ret = -EOPNOTSUPP;
@@ -3579,6 +3580,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3579 atomic_set(&eb->blocking_writers, 0); 3580 atomic_set(&eb->blocking_writers, 0);
3580 atomic_set(&eb->spinning_readers, 0); 3581 atomic_set(&eb->spinning_readers, 0);
3581 atomic_set(&eb->spinning_writers, 0); 3582 atomic_set(&eb->spinning_writers, 0);
3583 eb->lock_nested = 0;
3582 init_waitqueue_head(&eb->write_lock_wq); 3584 init_waitqueue_head(&eb->write_lock_wq);
3583 init_waitqueue_head(&eb->read_lock_wq); 3585 init_waitqueue_head(&eb->read_lock_wq);
3584 3586
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7604c3001322..bc6a042cb6fc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -129,6 +129,7 @@ struct extent_buffer {
129 struct list_head leak_list; 129 struct list_head leak_list;
130 struct rcu_head rcu_head; 130 struct rcu_head rcu_head;
131 atomic_t refs; 131 atomic_t refs;
132 pid_t lock_owner;
132 133
133 /* count of read lock holders on the extent buffer */ 134 /* count of read lock holders on the extent buffer */
134 atomic_t write_locks; 135 atomic_t write_locks;
@@ -137,6 +138,7 @@ struct extent_buffer {
137 atomic_t blocking_readers; 138 atomic_t blocking_readers;
138 atomic_t spinning_readers; 139 atomic_t spinning_readers;
139 atomic_t spinning_writers; 140 atomic_t spinning_writers;
141 int lock_nested;
140 142
141 /* protects write locks */ 143 /* protects write locks */
142 rwlock_t lock; 144 rwlock_t lock;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 034d98503229..859ba2dd8890 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -678,7 +678,7 @@ next_slot:
678 disk_bytenr, num_bytes, 0, 678 disk_bytenr, num_bytes, 0,
679 root->root_key.objectid, 679 root->root_key.objectid,
680 new_key.objectid, 680 new_key.objectid,
681 start - extent_offset); 681 start - extent_offset, 0);
682 BUG_ON(ret); 682 BUG_ON(ret);
683 *hint_byte = disk_bytenr; 683 *hint_byte = disk_bytenr;
684 } 684 }
@@ -753,7 +753,7 @@ next_slot:
753 disk_bytenr, num_bytes, 0, 753 disk_bytenr, num_bytes, 0,
754 root->root_key.objectid, 754 root->root_key.objectid,
755 key.objectid, key.offset - 755 key.objectid, key.offset -
756 extent_offset); 756 extent_offset, 0);
757 BUG_ON(ret); 757 BUG_ON(ret);
758 inode_sub_bytes(inode, 758 inode_sub_bytes(inode,
759 extent_end - key.offset); 759 extent_end - key.offset);
@@ -962,7 +962,7 @@ again:
962 962
963 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 963 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
964 root->root_key.objectid, 964 root->root_key.objectid,
965 ino, orig_offset); 965 ino, orig_offset, 0);
966 BUG_ON(ret); 966 BUG_ON(ret);
967 967
968 if (split == start) { 968 if (split == start) {
@@ -989,7 +989,7 @@ again:
989 del_nr++; 989 del_nr++;
990 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 990 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
991 0, root->root_key.objectid, 991 0, root->root_key.objectid,
992 ino, orig_offset); 992 ino, orig_offset, 0);
993 BUG_ON(ret); 993 BUG_ON(ret);
994 } 994 }
995 other_start = 0; 995 other_start = 0;
@@ -1006,7 +1006,7 @@ again:
1006 del_nr++; 1006 del_nr++;
1007 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1007 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1008 0, root->root_key.objectid, 1008 0, root->root_key.objectid,
1009 ino, orig_offset); 1009 ino, orig_offset, 0);
1010 BUG_ON(ret); 1010 BUG_ON(ret);
1011 } 1011 }
1012 if (del_nr == 0) { 1012 if (del_nr == 0) {
@@ -1274,7 +1274,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1274 dirty_pages); 1274 dirty_pages);
1275 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1275 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1276 btrfs_btree_balance_dirty(root, 1); 1276 btrfs_btree_balance_dirty(root, 1);
1277 btrfs_throttle(root);
1278 1277
1279 pos += copied; 1278 pos += copied;
1280 num_written += copied; 1279 num_written += copied;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9a897bf79538..d20ff87ca603 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
319 io_ctl_unmap_page(io_ctl); 319 io_ctl_unmap_page(io_ctl);
320 320
321 for (i = 0; i < io_ctl->num_pages; i++) { 321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]); 322 if (io_ctl->pages[i]) {
323 unlock_page(io_ctl->pages[i]); 323 ClearPageChecked(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]); 324 unlock_page(io_ctl->pages[i]);
325 page_cache_release(io_ctl->pages[i]);
326 }
325 } 327 }
326} 328}
327 329
@@ -635,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
635 if (!num_entries) 637 if (!num_entries)
636 return 0; 638 return 0;
637 639
638 io_ctl_init(&io_ctl, inode, root); 640 ret = io_ctl_init(&io_ctl, inode, root);
641 if (ret)
642 return ret;
643
639 ret = readahead_cache(inode); 644 ret = readahead_cache(inode);
640 if (ret) 645 if (ret)
641 goto out; 646 goto out;
@@ -838,7 +843,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
838 struct io_ctl io_ctl; 843 struct io_ctl io_ctl;
839 struct list_head bitmap_list; 844 struct list_head bitmap_list;
840 struct btrfs_key key; 845 struct btrfs_key key;
841 u64 start, end, len; 846 u64 start, extent_start, extent_end, len;
842 int entries = 0; 847 int entries = 0;
843 int bitmaps = 0; 848 int bitmaps = 0;
844 int ret; 849 int ret;
@@ -849,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
849 if (!i_size_read(inode)) 854 if (!i_size_read(inode))
850 return -1; 855 return -1;
851 856
852 io_ctl_init(&io_ctl, inode, root); 857 ret = io_ctl_init(&io_ctl, inode, root);
858 if (ret)
859 return -1;
853 860
854 /* Get the cluster for this block_group if it exists */ 861 /* Get the cluster for this block_group if it exists */
855 if (block_group && !list_empty(&block_group->cluster_list)) 862 if (block_group && !list_empty(&block_group->cluster_list))
@@ -857,25 +864,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
857 struct btrfs_free_cluster, 864 struct btrfs_free_cluster,
858 block_group_list); 865 block_group_list);
859 866
860 /*
861 * We shouldn't have switched the pinned extents yet so this is the
862 * right one
863 */
864 unpin = root->fs_info->pinned_extents;
865
866 /* Lock all pages first so we can lock the extent safely. */ 867 /* Lock all pages first so we can lock the extent safely. */
867 io_ctl_prepare_pages(&io_ctl, inode, 0); 868 io_ctl_prepare_pages(&io_ctl, inode, 0);
868 869
869 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 870 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
870 0, &cached_state, GFP_NOFS); 871 0, &cached_state, GFP_NOFS);
871 872
872 /*
873 * When searching for pinned extents, we need to start at our start
874 * offset.
875 */
876 if (block_group)
877 start = block_group->key.objectid;
878
879 node = rb_first(&ctl->free_space_offset); 873 node = rb_first(&ctl->free_space_offset);
880 if (!node && cluster) { 874 if (!node && cluster) {
881 node = rb_first(&cluster->root); 875 node = rb_first(&cluster->root);
@@ -918,9 +912,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
918 * We want to add any pinned extents to our free space cache 912 * We want to add any pinned extents to our free space cache
919 * so we don't leak the space 913 * so we don't leak the space
920 */ 914 */
915
916 /*
917 * We shouldn't have switched the pinned extents yet so this is the
918 * right one
919 */
920 unpin = root->fs_info->pinned_extents;
921
922 if (block_group)
923 start = block_group->key.objectid;
924
921 while (block_group && (start < block_group->key.objectid + 925 while (block_group && (start < block_group->key.objectid +
922 block_group->key.offset)) { 926 block_group->key.offset)) {
923 ret = find_first_extent_bit(unpin, start, &start, &end, 927 ret = find_first_extent_bit(unpin, start,
928 &extent_start, &extent_end,
924 EXTENT_DIRTY); 929 EXTENT_DIRTY);
925 if (ret) { 930 if (ret) {
926 ret = 0; 931 ret = 0;
@@ -928,20 +933,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
928 } 933 }
929 934
930 /* This pinned extent is out of our range */ 935 /* This pinned extent is out of our range */
931 if (start >= block_group->key.objectid + 936 if (extent_start >= block_group->key.objectid +
932 block_group->key.offset) 937 block_group->key.offset)
933 break; 938 break;
934 939
935 len = block_group->key.objectid + 940 extent_start = max(extent_start, start);
936 block_group->key.offset - start; 941 extent_end = min(block_group->key.objectid +
937 len = min(len, end + 1 - start); 942 block_group->key.offset, extent_end + 1);
943 len = extent_end - extent_start;
938 944
939 entries++; 945 entries++;
940 ret = io_ctl_add_entry(&io_ctl, start, len, NULL); 946 ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
941 if (ret) 947 if (ret)
942 goto out_nospc; 948 goto out_nospc;
943 949
944 start = end + 1; 950 start = extent_end;
945 } 951 }
946 952
947 /* Write out the bitmaps */ 953 /* Write out the bitmaps */
@@ -2283,23 +2289,23 @@ out:
2283static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, 2289static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2284 struct btrfs_free_space *entry, 2290 struct btrfs_free_space *entry,
2285 struct btrfs_free_cluster *cluster, 2291 struct btrfs_free_cluster *cluster,
2286 u64 offset, u64 bytes, u64 min_bytes) 2292 u64 offset, u64 bytes,
2293 u64 cont1_bytes, u64 min_bytes)
2287{ 2294{
2288 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2295 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2289 unsigned long next_zero; 2296 unsigned long next_zero;
2290 unsigned long i; 2297 unsigned long i;
2291 unsigned long search_bits; 2298 unsigned long want_bits;
2292 unsigned long total_bits; 2299 unsigned long min_bits;
2293 unsigned long found_bits; 2300 unsigned long found_bits;
2294 unsigned long start = 0; 2301 unsigned long start = 0;
2295 unsigned long total_found = 0; 2302 unsigned long total_found = 0;
2296 int ret; 2303 int ret;
2297 bool found = false;
2298 2304
2299 i = offset_to_bit(entry->offset, block_group->sectorsize, 2305 i = offset_to_bit(entry->offset, block_group->sectorsize,
2300 max_t(u64, offset, entry->offset)); 2306 max_t(u64, offset, entry->offset));
2301 search_bits = bytes_to_bits(bytes, block_group->sectorsize); 2307 want_bits = bytes_to_bits(bytes, block_group->sectorsize);
2302 total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2308 min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
2303 2309
2304again: 2310again:
2305 found_bits = 0; 2311 found_bits = 0;
@@ -2308,7 +2314,7 @@ again:
2308 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { 2314 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
2309 next_zero = find_next_zero_bit(entry->bitmap, 2315 next_zero = find_next_zero_bit(entry->bitmap,
2310 BITS_PER_BITMAP, i); 2316 BITS_PER_BITMAP, i);
2311 if (next_zero - i >= search_bits) { 2317 if (next_zero - i >= min_bits) {
2312 found_bits = next_zero - i; 2318 found_bits = next_zero - i;
2313 break; 2319 break;
2314 } 2320 }
@@ -2318,10 +2324,9 @@ again:
2318 if (!found_bits) 2324 if (!found_bits)
2319 return -ENOSPC; 2325 return -ENOSPC;
2320 2326
2321 if (!found) { 2327 if (!total_found) {
2322 start = i; 2328 start = i;
2323 cluster->max_size = 0; 2329 cluster->max_size = 0;
2324 found = true;
2325 } 2330 }
2326 2331
2327 total_found += found_bits; 2332 total_found += found_bits;
@@ -2329,13 +2334,8 @@ again:
2329 if (cluster->max_size < found_bits * block_group->sectorsize) 2334 if (cluster->max_size < found_bits * block_group->sectorsize)
2330 cluster->max_size = found_bits * block_group->sectorsize; 2335 cluster->max_size = found_bits * block_group->sectorsize;
2331 2336
2332 if (total_found < total_bits) { 2337 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2333 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); 2338 i = next_zero + 1;
2334 if (i - start > total_bits * 2) {
2335 total_found = 0;
2336 cluster->max_size = 0;
2337 found = false;
2338 }
2339 goto again; 2339 goto again;
2340 } 2340 }
2341 2341
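
btrfs_bitmap_cluster() now scans for runs of set bits that are at least min_bits long and keeps accumulating them until it has want_bits worth of space and the largest single run covers cont1_bytes. A toy, simplified version of that accept/reject scan over a small bitmap (the kernel version also records the window boundaries and restarts from next_zero + 1):

#include <stdbool.h>
#include <stdio.h>

#define NBITS 64

static bool bitmap_cluster_ok(const unsigned char *bits, unsigned min_bits,
			      unsigned want_bits, unsigned cont1_bits)
{
	unsigned total = 0, max_run = 0, i = 0;

	while (i < NBITS) {
		unsigned run = 0;

		/* measure the run of set bits starting at i */
		while (i < NBITS && (bits[i / 8] & (1u << (i % 8)))) {
			run++;
			i++;
		}
		if (run >= min_bits) {
			total += run;
			if (run > max_run)
				max_run = run;
		}
		i++;	/* step past the zero bit that ended the run */
	}
	return total >= want_bits && max_run >= cont1_bits;
}

int main(void)
{
	/* runs of 12 and 4 set bits: 16 bits total, longest run 12 -> 1 */
	unsigned char bits[8] = { 0xff, 0x0f, 0x00, 0xf0, 0, 0, 0, 0 };
	printf("%d\n", bitmap_cluster_ok(bits, 4, 16, 12));
	return 0;
}
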
@@ -2346,28 +2346,31 @@ again:
2346 &entry->offset_index, 1); 2346 &entry->offset_index, 1);
2347 BUG_ON(ret); 2347 BUG_ON(ret);
2348 2348
2349 trace_btrfs_setup_cluster(block_group, cluster,
2350 total_found * block_group->sectorsize, 1);
2349 return 0; 2351 return 0;
2350} 2352}
2351 2353
2352/* 2354/*
2353 * This searches the block group for just extents to fill the cluster with. 2355 * This searches the block group for just extents to fill the cluster with.
2356 * Try to find a cluster with at least bytes total bytes, at least one
2357 * extent of cont1_bytes, and other extents of at least min_bytes.
2354 */ 2358 */
2355static noinline int 2359static noinline int
2356setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, 2360setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2357 struct btrfs_free_cluster *cluster, 2361 struct btrfs_free_cluster *cluster,
2358 struct list_head *bitmaps, u64 offset, u64 bytes, 2362 struct list_head *bitmaps, u64 offset, u64 bytes,
2359 u64 min_bytes) 2363 u64 cont1_bytes, u64 min_bytes)
2360{ 2364{
2361 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2365 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2362 struct btrfs_free_space *first = NULL; 2366 struct btrfs_free_space *first = NULL;
2363 struct btrfs_free_space *entry = NULL; 2367 struct btrfs_free_space *entry = NULL;
2364 struct btrfs_free_space *prev = NULL;
2365 struct btrfs_free_space *last; 2368 struct btrfs_free_space *last;
2366 struct rb_node *node; 2369 struct rb_node *node;
2367 u64 window_start; 2370 u64 window_start;
2368 u64 window_free; 2371 u64 window_free;
2369 u64 max_extent; 2372 u64 max_extent;
2370 u64 max_gap = 128 * 1024; 2373 u64 total_size = 0;
2371 2374
2372 entry = tree_search_offset(ctl, offset, 0, 1); 2375 entry = tree_search_offset(ctl, offset, 0, 1);
2373 if (!entry) 2376 if (!entry)
@@ -2377,8 +2380,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2377 * We don't want bitmaps, so just move along until we find a normal 2380 * We don't want bitmaps, so just move along until we find a normal
2378 * extent entry. 2381 * extent entry.
2379 */ 2382 */
2380 while (entry->bitmap) { 2383 while (entry->bitmap || entry->bytes < min_bytes) {
2381 if (list_empty(&entry->list)) 2384 if (entry->bitmap && list_empty(&entry->list))
2382 list_add_tail(&entry->list, bitmaps); 2385 list_add_tail(&entry->list, bitmaps);
2383 node = rb_next(&entry->offset_index); 2386 node = rb_next(&entry->offset_index);
2384 if (!node) 2387 if (!node)
@@ -2391,12 +2394,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2391 max_extent = entry->bytes; 2394 max_extent = entry->bytes;
2392 first = entry; 2395 first = entry;
2393 last = entry; 2396 last = entry;
2394 prev = entry;
2395 2397
2396 while (window_free <= min_bytes) { 2398 for (node = rb_next(&entry->offset_index); node;
2397 node = rb_next(&entry->offset_index); 2399 node = rb_next(&entry->offset_index)) {
2398 if (!node)
2399 return -ENOSPC;
2400 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2400 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2401 2401
2402 if (entry->bitmap) { 2402 if (entry->bitmap) {
@@ -2405,26 +2405,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2405 continue; 2405 continue;
2406 } 2406 }
2407 2407
2408 /* 2408 if (entry->bytes < min_bytes)
2409 * we haven't filled the empty size and the window is 2409 continue;
2410 * very large. reset and try again 2410
2411 */ 2411 last = entry;
2412 if (entry->offset - (prev->offset + prev->bytes) > max_gap || 2412 window_free += entry->bytes;
2413 entry->offset - window_start > (min_bytes * 2)) { 2413 if (entry->bytes > max_extent)
2414 first = entry;
2415 window_start = entry->offset;
2416 window_free = entry->bytes;
2417 last = entry;
2418 max_extent = entry->bytes; 2414 max_extent = entry->bytes;
2419 } else {
2420 last = entry;
2421 window_free += entry->bytes;
2422 if (entry->bytes > max_extent)
2423 max_extent = entry->bytes;
2424 }
2425 prev = entry;
2426 } 2415 }
2427 2416
2417 if (window_free < bytes || max_extent < cont1_bytes)
2418 return -ENOSPC;
2419
2428 cluster->window_start = first->offset; 2420 cluster->window_start = first->offset;
2429 2421
2430 node = &first->offset_index; 2422 node = &first->offset_index;
@@ -2438,17 +2430,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2438 2430
2439 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2431 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2440 node = rb_next(&entry->offset_index); 2432 node = rb_next(&entry->offset_index);
2441 if (entry->bitmap) 2433 if (entry->bitmap || entry->bytes < min_bytes)
2442 continue; 2434 continue;
2443 2435
2444 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2436 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2445 ret = tree_insert_offset(&cluster->root, entry->offset, 2437 ret = tree_insert_offset(&cluster->root, entry->offset,
2446 &entry->offset_index, 0); 2438 &entry->offset_index, 0);
2439 total_size += entry->bytes;
2447 BUG_ON(ret); 2440 BUG_ON(ret);
2448 } while (node && entry != last); 2441 } while (node && entry != last);
2449 2442
2450 cluster->max_size = max_extent; 2443 cluster->max_size = max_extent;
2451 2444 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2452 return 0; 2445 return 0;
2453} 2446}
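
For plain extents, setup_cluster_no_bitmap() no longer resets the window on large gaps; it makes one pass over the remaining entries, counts every extent of at least min_bytes toward the window, and succeeds only if the window reaches the requested bytes with one extent of at least cont1_bytes. A compact sketch of that acceptance test over a list of free-extent sizes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* One pass over already-sorted free extents: count every extent that is at
 * least min_bytes toward the window and remember the largest one. */
static bool window_ok(const uint64_t *sizes, int n, uint64_t bytes,
		      uint64_t cont1_bytes, uint64_t min_bytes)
{
	uint64_t window_free = 0, max_extent = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (sizes[i] < min_bytes)
			continue;
		window_free += sizes[i];
		if (sizes[i] > max_extent)
			max_extent = sizes[i];
	}
	return window_free >= bytes && max_extent >= cont1_bytes;
}

int main(void)
{
	uint64_t sizes[] = { 4096, 65536, 16384, 131072 };
	/* need 192KB total plus one 128KB piece: the last entry covers
	 * cont1_bytes and the sum is 212KB -> prints 1 */
	printf("%d\n", window_ok(sizes, 4, 196608, 131072, 4096));
	return 0;
}
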
2454 2447
@@ -2460,7 +2453,7 @@ static noinline int
2460setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, 2453setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2461 struct btrfs_free_cluster *cluster, 2454 struct btrfs_free_cluster *cluster,
2462 struct list_head *bitmaps, u64 offset, u64 bytes, 2455 struct list_head *bitmaps, u64 offset, u64 bytes,
2463 u64 min_bytes) 2456 u64 cont1_bytes, u64 min_bytes)
2464{ 2457{
2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2458 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2466 struct btrfs_free_space *entry; 2459 struct btrfs_free_space *entry;
@@ -2485,7 +2478,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2485 if (entry->bytes < min_bytes) 2478 if (entry->bytes < min_bytes)
2486 continue; 2479 continue;
2487 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 2480 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2488 bytes, min_bytes); 2481 bytes, cont1_bytes, min_bytes);
2489 if (!ret) 2482 if (!ret)
2490 return 0; 2483 return 0;
2491 } 2484 }
@@ -2499,7 +2492,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2499 2492
2500/* 2493/*
2501 * here we try to find a cluster of blocks in a block group. The goal 2494 * here we try to find a cluster of blocks in a block group. The goal
2502 * is to find at least bytes free and up to empty_size + bytes free. 2495 * is to find at least bytes+empty_size.
2503 * We might not find them all in one contiguous area. 2496 * We might not find them all in one contiguous area.
2504 * 2497 *
2505 * returns zero and sets up cluster if things worked out, otherwise 2498 * returns zero and sets up cluster if things worked out, otherwise
@@ -2515,23 +2508,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2515 struct btrfs_free_space *entry, *tmp; 2508 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps); 2509 LIST_HEAD(bitmaps);
2517 u64 min_bytes; 2510 u64 min_bytes;
2511 u64 cont1_bytes;
2518 int ret; 2512 int ret;
2519 2513
2520 /* for metadata, allow allocates with more holes */ 2514 /*
2515 * Choose the minimum extent size we'll require for this
2516 * cluster. For SSD_SPREAD, don't allow any fragmentation.
 2517 * For metadata, allow allocations with smaller extents. For
2518 * data, keep it dense.
2519 */
2521 if (btrfs_test_opt(root, SSD_SPREAD)) { 2520 if (btrfs_test_opt(root, SSD_SPREAD)) {
2522 min_bytes = bytes + empty_size; 2521 cont1_bytes = min_bytes = bytes + empty_size;
2523 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 2522 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2524 /* 2523 cont1_bytes = bytes;
2525 * we want to do larger allocations when we are 2524 min_bytes = block_group->sectorsize;
2526 * flushing out the delayed refs, it helps prevent 2525 } else {
2527 * making more work as we go along. 2526 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
2528 */ 2527 min_bytes = block_group->sectorsize;
2529 if (trans->transaction->delayed_refs.flushing) 2528 }
2530 min_bytes = max(bytes, (bytes + empty_size) >> 1);
2531 else
2532 min_bytes = max(bytes, (bytes + empty_size) >> 4);
2533 } else
2534 min_bytes = max(bytes, (bytes + empty_size) >> 2);
2535 2529
2536 spin_lock(&ctl->tree_lock); 2530 spin_lock(&ctl->tree_lock);
2537 2531
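
The hunk above replaces the single min_bytes heuristic with two thresholds: cont1_bytes, the size that at least one extent in the cluster must reach, and min_bytes, the smallest free extent considered worth adding to the cluster. A minimal user-space sketch of that policy follows; the helper name and the u64 typedef are illustrative only, not the kernel code path:

/* Hypothetical restatement of the cluster sizing policy above. */
typedef unsigned long long u64;

static u64 max_u64(u64 a, u64 b) { return a > b ? a : b; }

static void pick_cluster_sizes(int ssd_spread, int is_metadata,
                               u64 bytes, u64 empty_size, u64 sectorsize,
                               u64 *cont1_bytes, u64 *min_bytes)
{
        if (ssd_spread) {
                /* SSD_SPREAD: no fragmentation allowed at all */
                *cont1_bytes = *min_bytes = bytes + empty_size;
        } else if (is_metadata) {
                /* metadata: one extent must cover the request,
                 * smaller leftovers may still join the cluster */
                *cont1_bytes = bytes;
                *min_bytes = sectorsize;
        } else {
                /* data: keep the cluster reasonably dense */
                *cont1_bytes = max_u64(bytes, (bytes + empty_size) >> 2);
                *min_bytes = sectorsize;
        }
}

/* Worked example: a 256 KiB data request with empty_size = 768 KiB and
 * 4 KiB sectors gives cont1_bytes = max(256 KiB, 1 MiB >> 2) = 256 KiB
 * and min_bytes = 4 KiB. */
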
@@ -2539,7 +2533,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2539 * If we know we don't have enough space to make a cluster don't even 2533 * If we know we don't have enough space to make a cluster don't even
2540 * bother doing all the work to try and find one. 2534 * bother doing all the work to try and find one.
2541 */ 2535 */
2542 if (ctl->free_space < min_bytes) { 2536 if (ctl->free_space < bytes) {
2543 spin_unlock(&ctl->tree_lock); 2537 spin_unlock(&ctl->tree_lock);
2544 return -ENOSPC; 2538 return -ENOSPC;
2545 } 2539 }
@@ -2552,11 +2546,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2552 goto out; 2546 goto out;
2553 } 2547 }
2554 2548
2549 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
2550 min_bytes);
2551
2552 INIT_LIST_HEAD(&bitmaps);
2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2553 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2556 bytes, min_bytes); 2554 bytes + empty_size,
2555 cont1_bytes, min_bytes);
2557 if (ret) 2556 if (ret)
2558 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 2557 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
2559 offset, bytes, min_bytes); 2558 offset, bytes + empty_size,
2559 cont1_bytes, min_bytes);
2560 2560
2561 /* Clear our temporary list */ 2561 /* Clear our temporary list */
2562 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 2562 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
@@ -2567,6 +2567,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2567 list_add_tail(&cluster->block_group_list, 2567 list_add_tail(&cluster->block_group_list,
2568 &block_group->cluster_list); 2568 &block_group->cluster_list);
2569 cluster->block_group = block_group; 2569 cluster->block_group = block_group;
2570 } else {
2571 trace_btrfs_failed_cluster_setup(block_group);
2570 } 2572 }
2571out: 2573out:
2572 spin_unlock(&cluster->lock); 2574 spin_unlock(&cluster->lock);
@@ -2588,17 +2590,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2588 cluster->block_group = NULL; 2590 cluster->block_group = NULL;
2589} 2591}
2590 2592
2591int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, 2593static int do_trimming(struct btrfs_block_group_cache *block_group,
2592 u64 *trimmed, u64 start, u64 end, u64 minlen) 2594 u64 *total_trimmed, u64 start, u64 bytes,
2595 u64 reserved_start, u64 reserved_bytes)
2593{ 2596{
2594 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2597 struct btrfs_space_info *space_info = block_group->space_info;
2595 struct btrfs_free_space *entry = NULL;
2596 struct btrfs_fs_info *fs_info = block_group->fs_info; 2598 struct btrfs_fs_info *fs_info = block_group->fs_info;
2597 u64 bytes = 0; 2599 int ret;
2598 u64 actually_trimmed; 2600 int update = 0;
2599 int ret = 0; 2601 u64 trimmed = 0;
2600 2602
2601 *trimmed = 0; 2603 spin_lock(&space_info->lock);
2604 spin_lock(&block_group->lock);
2605 if (!block_group->ro) {
2606 block_group->reserved += reserved_bytes;
2607 space_info->bytes_reserved += reserved_bytes;
2608 update = 1;
2609 }
2610 spin_unlock(&block_group->lock);
2611 spin_unlock(&space_info->lock);
2612
2613 ret = btrfs_error_discard_extent(fs_info->extent_root,
2614 start, bytes, &trimmed);
2615 if (!ret)
2616 *total_trimmed += trimmed;
2617
2618 btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
2619
2620 if (update) {
2621 spin_lock(&space_info->lock);
2622 spin_lock(&block_group->lock);
2623 if (block_group->ro)
2624 space_info->bytes_readonly += reserved_bytes;
2625 block_group->reserved -= reserved_bytes;
2626 space_info->bytes_reserved -= reserved_bytes;
2627 spin_unlock(&space_info->lock);
2628 spin_unlock(&block_group->lock);
2629 }
2630
2631 return ret;
2632}
2633
2634static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2635 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
2636{
2637 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2638 struct btrfs_free_space *entry;
2639 struct rb_node *node;
2640 int ret = 0;
2641 u64 extent_start;
2642 u64 extent_bytes;
2643 u64 bytes;
2602 2644
2603 while (start < end) { 2645 while (start < end) {
2604 spin_lock(&ctl->tree_lock); 2646 spin_lock(&ctl->tree_lock);
@@ -2609,81 +2651,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2609 } 2651 }
2610 2652
2611 entry = tree_search_offset(ctl, start, 0, 1); 2653 entry = tree_search_offset(ctl, start, 0, 1);
2612 if (!entry) 2654 if (!entry) {
2613 entry = tree_search_offset(ctl,
2614 offset_to_bitmap(ctl, start),
2615 1, 1);
2616
2617 if (!entry || entry->offset >= end) {
2618 spin_unlock(&ctl->tree_lock); 2655 spin_unlock(&ctl->tree_lock);
2619 break; 2656 break;
2620 } 2657 }
2621 2658
2622 if (entry->bitmap) { 2659 /* skip bitmaps */
2623 ret = search_bitmap(ctl, entry, &start, &bytes); 2660 while (entry->bitmap) {
2624 if (!ret) { 2661 node = rb_next(&entry->offset_index);
2625 if (start >= end) { 2662 if (!node) {
2626 spin_unlock(&ctl->tree_lock);
2627 break;
2628 }
2629 bytes = min(bytes, end - start);
2630 bitmap_clear_bits(ctl, entry, start, bytes);
2631 if (entry->bytes == 0)
2632 free_bitmap(ctl, entry);
2633 } else {
2634 start = entry->offset + BITS_PER_BITMAP *
2635 block_group->sectorsize;
2636 spin_unlock(&ctl->tree_lock); 2663 spin_unlock(&ctl->tree_lock);
2637 ret = 0; 2664 goto out;
2638 continue;
2639 } 2665 }
2640 } else { 2666 entry = rb_entry(node, struct btrfs_free_space,
2641 start = entry->offset; 2667 offset_index);
2642 bytes = min(entry->bytes, end - start);
2643 unlink_free_space(ctl, entry);
2644 kmem_cache_free(btrfs_free_space_cachep, entry);
2645 } 2668 }
2646 2669
2670 if (entry->offset >= end) {
2671 spin_unlock(&ctl->tree_lock);
2672 break;
2673 }
2674
2675 extent_start = entry->offset;
2676 extent_bytes = entry->bytes;
2677 start = max(start, extent_start);
2678 bytes = min(extent_start + extent_bytes, end) - start;
2679 if (bytes < minlen) {
2680 spin_unlock(&ctl->tree_lock);
2681 goto next;
2682 }
2683
2684 unlink_free_space(ctl, entry);
2685 kmem_cache_free(btrfs_free_space_cachep, entry);
2686
2647 spin_unlock(&ctl->tree_lock); 2687 spin_unlock(&ctl->tree_lock);
2648 2688
2649 if (bytes >= minlen) { 2689 ret = do_trimming(block_group, total_trimmed, start, bytes,
2650 struct btrfs_space_info *space_info; 2690 extent_start, extent_bytes);
2651 int update = 0; 2691 if (ret)
2652 2692 break;
2653 space_info = block_group->space_info; 2693next:
2654 spin_lock(&space_info->lock); 2694 start += bytes;
2655 spin_lock(&block_group->lock);
2656 if (!block_group->ro) {
2657 block_group->reserved += bytes;
2658 space_info->bytes_reserved += bytes;
2659 update = 1;
2660 }
2661 spin_unlock(&block_group->lock);
2662 spin_unlock(&space_info->lock);
2663
2664 ret = btrfs_error_discard_extent(fs_info->extent_root,
2665 start,
2666 bytes,
2667 &actually_trimmed);
2668
2669 btrfs_add_free_space(block_group, start, bytes);
2670 if (update) {
2671 spin_lock(&space_info->lock);
2672 spin_lock(&block_group->lock);
2673 if (block_group->ro)
2674 space_info->bytes_readonly += bytes;
2675 block_group->reserved -= bytes;
2676 space_info->bytes_reserved -= bytes;
2677 spin_unlock(&space_info->lock);
2678 spin_unlock(&block_group->lock);
2679 }
2680 2695
2681 if (ret) 2696 if (fatal_signal_pending(current)) {
2682 break; 2697 ret = -ERESTARTSYS;
2683 *trimmed += actually_trimmed; 2698 break;
2699 }
2700
2701 cond_resched();
2702 }
2703out:
2704 return ret;
2705}
2706
2707static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
2708 u64 *total_trimmed, u64 start, u64 end, u64 minlen)
2709{
2710 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2711 struct btrfs_free_space *entry;
2712 int ret = 0;
2713 int ret2;
2714 u64 bytes;
2715 u64 offset = offset_to_bitmap(ctl, start);
2716
2717 while (offset < end) {
2718 bool next_bitmap = false;
2719
2720 spin_lock(&ctl->tree_lock);
2721
2722 if (ctl->free_space < minlen) {
2723 spin_unlock(&ctl->tree_lock);
2724 break;
2725 }
2726
2727 entry = tree_search_offset(ctl, offset, 1, 0);
2728 if (!entry) {
2729 spin_unlock(&ctl->tree_lock);
2730 next_bitmap = true;
2731 goto next;
2732 }
2733
2734 bytes = minlen;
2735 ret2 = search_bitmap(ctl, entry, &start, &bytes);
2736 if (ret2 || start >= end) {
2737 spin_unlock(&ctl->tree_lock);
2738 next_bitmap = true;
2739 goto next;
2740 }
2741
2742 bytes = min(bytes, end - start);
2743 if (bytes < minlen) {
2744 spin_unlock(&ctl->tree_lock);
2745 goto next;
2746 }
2747
2748 bitmap_clear_bits(ctl, entry, start, bytes);
2749 if (entry->bytes == 0)
2750 free_bitmap(ctl, entry);
2751
2752 spin_unlock(&ctl->tree_lock);
2753
2754 ret = do_trimming(block_group, total_trimmed, start, bytes,
2755 start, bytes);
2756 if (ret)
2757 break;
2758next:
2759 if (next_bitmap) {
2760 offset += BITS_PER_BITMAP * ctl->unit;
2761 } else {
2762 start += bytes;
2763 if (start >= offset + BITS_PER_BITMAP * ctl->unit)
2764 offset += BITS_PER_BITMAP * ctl->unit;
2684 } 2765 }
2685 start += bytes;
2686 bytes = 0;
2687 2766
2688 if (fatal_signal_pending(current)) { 2767 if (fatal_signal_pending(current)) {
2689 ret = -ERESTARTSYS; 2768 ret = -ERESTARTSYS;
@@ -2696,6 +2775,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2696 return ret; 2775 return ret;
2697} 2776}
2698 2777
2778int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2779 u64 *trimmed, u64 start, u64 end, u64 minlen)
2780{
2781 int ret;
2782
2783 *trimmed = 0;
2784
2785 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
2786 if (ret)
2787 return ret;
2788
2789 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
2790
2791 return ret;
2792}
2793
2699/* 2794/*
2700 * Find the left-most item in the cache tree, and then return the 2795 * Find the left-most item in the cache tree, and then return the
2701 * smallest inode number in the item. 2796 * smallest inode number in the item.
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f8962a957d65..213ffa86ce1b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
438 trans->bytes_reserved); 438 trans->bytes_reserved);
439 if (ret) 439 if (ret)
440 goto out; 440 goto out;
441 trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
442 trans->bytes_reserved, 1);
441again: 443again:
442 inode = lookup_free_ino_inode(root, path); 444 inode = lookup_free_ino_inode(root, path);
443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 445 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -498,6 +500,8 @@ again:
498out_put: 500out_put:
499 iput(inode); 501 iput(inode);
500out_release: 502out_release:
503 trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
504 trans->bytes_reserved, 0);
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 505 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
502out: 506out:
503 trans->block_rsv = rsv; 507 trans->block_rsv = rsv;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81b235a61f8c..0da19a0ea00d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1951,12 +1951,28 @@ enum btrfs_orphan_cleanup_state {
1951void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 1951void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
1952 struct btrfs_root *root) 1952 struct btrfs_root *root)
1953{ 1953{
1954 struct btrfs_block_rsv *block_rsv;
1954 int ret; 1955 int ret;
1955 1956
1956 if (!list_empty(&root->orphan_list) || 1957 if (!list_empty(&root->orphan_list) ||
1957 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 1958 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
1958 return; 1959 return;
1959 1960
1961 spin_lock(&root->orphan_lock);
1962 if (!list_empty(&root->orphan_list)) {
1963 spin_unlock(&root->orphan_lock);
1964 return;
1965 }
1966
1967 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
1968 spin_unlock(&root->orphan_lock);
1969 return;
1970 }
1971
1972 block_rsv = root->orphan_block_rsv;
1973 root->orphan_block_rsv = NULL;
1974 spin_unlock(&root->orphan_lock);
1975
1960 if (root->orphan_item_inserted && 1976 if (root->orphan_item_inserted &&
1961 btrfs_root_refs(&root->root_item) > 0) { 1977 btrfs_root_refs(&root->root_item) > 0) {
1962 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 1978 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
@@ -1965,10 +1981,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
1965 root->orphan_item_inserted = 0; 1981 root->orphan_item_inserted = 0;
1966 } 1982 }
1967 1983
1968 if (root->orphan_block_rsv) { 1984 if (block_rsv) {
1969 WARN_ON(root->orphan_block_rsv->size > 0); 1985 WARN_ON(block_rsv->size > 0);
1970 btrfs_free_block_rsv(root, root->orphan_block_rsv); 1986 btrfs_free_block_rsv(root, block_rsv);
1971 root->orphan_block_rsv = NULL;
1972 } 1987 }
1973} 1988}
1974 1989
@@ -2224,14 +2239,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2224 continue; 2239 continue;
2225 } 2240 }
2226 nr_truncate++; 2241 nr_truncate++;
2227 /*
 2228 * Need to hold the i_mutex for reservation purposes, not
2229 * a huge deal here but I have a WARN_ON in
2230 * btrfs_delalloc_reserve_space to catch offenders.
2231 */
2232 mutex_lock(&inode->i_mutex);
2233 ret = btrfs_truncate(inode); 2242 ret = btrfs_truncate(inode);
2234 mutex_unlock(&inode->i_mutex);
2235 } else { 2243 } else {
2236 nr_unlink++; 2244 nr_unlink++;
2237 } 2245 }
@@ -2845,7 +2853,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2845 BUG_ON(!root->fs_info->enospc_unlink); 2853 BUG_ON(!root->fs_info->enospc_unlink);
2846 root->fs_info->enospc_unlink = 0; 2854 root->fs_info->enospc_unlink = 0;
2847 } 2855 }
2848 btrfs_end_transaction_throttle(trans, root); 2856 btrfs_end_transaction(trans, root);
2849} 2857}
2850 2858
2851static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2859static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3009,7 +3017,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3009 int pending_del_nr = 0; 3017 int pending_del_nr = 0;
3010 int pending_del_slot = 0; 3018 int pending_del_slot = 0;
3011 int extent_type = -1; 3019 int extent_type = -1;
3012 int encoding;
3013 int ret; 3020 int ret;
3014 int err = 0; 3021 int err = 0;
3015 u64 ino = btrfs_ino(inode); 3022 u64 ino = btrfs_ino(inode);
@@ -3059,7 +3066,6 @@ search_again:
3059 leaf = path->nodes[0]; 3066 leaf = path->nodes[0];
3060 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3067 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3061 found_type = btrfs_key_type(&found_key); 3068 found_type = btrfs_key_type(&found_key);
3062 encoding = 0;
3063 3069
3064 if (found_key.objectid != ino) 3070 if (found_key.objectid != ino)
3065 break; 3071 break;
@@ -3072,10 +3078,6 @@ search_again:
3072 fi = btrfs_item_ptr(leaf, path->slots[0], 3078 fi = btrfs_item_ptr(leaf, path->slots[0],
3073 struct btrfs_file_extent_item); 3079 struct btrfs_file_extent_item);
3074 extent_type = btrfs_file_extent_type(leaf, fi); 3080 extent_type = btrfs_file_extent_type(leaf, fi);
3075 encoding = btrfs_file_extent_compression(leaf, fi);
3076 encoding |= btrfs_file_extent_encryption(leaf, fi);
3077 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
3078
3079 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3081 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3080 item_end += 3082 item_end +=
3081 btrfs_file_extent_num_bytes(leaf, fi); 3083 btrfs_file_extent_num_bytes(leaf, fi);
@@ -3103,7 +3105,7 @@ search_again:
3103 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3105 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3104 u64 num_dec; 3106 u64 num_dec;
3105 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3107 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
3106 if (!del_item && !encoding) { 3108 if (!del_item) {
3107 u64 orig_num_bytes = 3109 u64 orig_num_bytes =
3108 btrfs_file_extent_num_bytes(leaf, fi); 3110 btrfs_file_extent_num_bytes(leaf, fi);
3109 extent_num_bytes = new_size - 3111 extent_num_bytes = new_size -
@@ -3179,7 +3181,7 @@ delete:
3179 ret = btrfs_free_extent(trans, root, extent_start, 3181 ret = btrfs_free_extent(trans, root, extent_start,
3180 extent_num_bytes, 0, 3182 extent_num_bytes, 0,
3181 btrfs_header_owner(leaf), 3183 btrfs_header_owner(leaf),
3182 ino, extent_offset); 3184 ino, extent_offset, 0);
3183 BUG_ON(ret); 3185 BUG_ON(ret);
3184 } 3186 }
3185 3187
@@ -3434,7 +3436,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3434 i_size_write(inode, newsize); 3436 i_size_write(inode, newsize);
3435 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 3437 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3436 ret = btrfs_update_inode(trans, root, inode); 3438 ret = btrfs_update_inode(trans, root, inode);
3437 btrfs_end_transaction_throttle(trans, root); 3439 btrfs_end_transaction(trans, root);
3438 } else { 3440 } else {
3439 3441
3440 /* 3442 /*
@@ -4655,7 +4657,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4655 } 4657 }
4656out_unlock: 4658out_unlock:
4657 nr = trans->blocks_used; 4659 nr = trans->blocks_used;
4658 btrfs_end_transaction_throttle(trans, root); 4660 btrfs_end_transaction(trans, root);
4659 btrfs_btree_balance_dirty(root, nr); 4661 btrfs_btree_balance_dirty(root, nr);
4660 if (drop_inode) { 4662 if (drop_inode) {
4661 inode_dec_link_count(inode); 4663 inode_dec_link_count(inode);
@@ -4723,7 +4725,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4723 } 4725 }
4724out_unlock: 4726out_unlock:
4725 nr = trans->blocks_used; 4727 nr = trans->blocks_used;
4726 btrfs_end_transaction_throttle(trans, root); 4728 btrfs_end_transaction(trans, root);
4727 if (drop_inode) { 4729 if (drop_inode) {
4728 inode_dec_link_count(inode); 4730 inode_dec_link_count(inode);
4729 iput(inode); 4731 iput(inode);
@@ -4782,7 +4784,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4782 } 4784 }
4783 4785
4784 nr = trans->blocks_used; 4786 nr = trans->blocks_used;
4785 btrfs_end_transaction_throttle(trans, root); 4787 btrfs_end_transaction(trans, root);
4786fail: 4788fail:
4787 if (drop_inode) { 4789 if (drop_inode) {
4788 inode_dec_link_count(inode); 4790 inode_dec_link_count(inode);
@@ -4848,7 +4850,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4848 4850
4849out_fail: 4851out_fail:
4850 nr = trans->blocks_used; 4852 nr = trans->blocks_used;
4851 btrfs_end_transaction_throttle(trans, root); 4853 btrfs_end_transaction(trans, root);
4852 if (drop_on_err) 4854 if (drop_on_err)
4853 iput(inode); 4855 iput(inode);
4854 btrfs_btree_balance_dirty(root, nr); 4856 btrfs_btree_balance_dirty(root, nr);
@@ -5121,7 +5123,7 @@ again:
5121 } 5123 }
5122 flush_dcache_page(page); 5124 flush_dcache_page(page);
5123 } else if (create && PageUptodate(page)) { 5125 } else if (create && PageUptodate(page)) {
5124 WARN_ON(1); 5126 BUG();
5125 if (!trans) { 5127 if (!trans) {
5126 kunmap(page); 5128 kunmap(page);
5127 free_extent_map(em); 5129 free_extent_map(em);
@@ -6402,10 +6404,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6402 u64 page_start; 6404 u64 page_start;
6403 u64 page_end; 6405 u64 page_end;
6404 6406
6405 /* Need this to keep space reservations serialized */
6406 mutex_lock(&inode->i_mutex);
6407 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6407 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6408 mutex_unlock(&inode->i_mutex);
6409 if (!ret) 6408 if (!ret)
6410 ret = btrfs_update_time(vma->vm_file); 6409 ret = btrfs_update_time(vma->vm_file);
6411 if (ret) { 6410 if (ret) {
@@ -6494,8 +6493,8 @@ out_unlock:
6494 if (!ret) 6493 if (!ret)
6495 return VM_FAULT_LOCKED; 6494 return VM_FAULT_LOCKED;
6496 unlock_page(page); 6495 unlock_page(page);
6497 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6498out: 6496out:
6497 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6499 return ret; 6498 return ret;
6500} 6499}
6501 6500
@@ -6668,7 +6667,7 @@ end_trans:
6668 err = ret; 6667 err = ret;
6669 6668
6670 nr = trans->blocks_used; 6669 nr = trans->blocks_used;
6671 ret = btrfs_end_transaction_throttle(trans, root); 6670 ret = btrfs_end_transaction(trans, root);
6672 btrfs_btree_balance_dirty(root, nr); 6671 btrfs_btree_balance_dirty(root, nr);
6673 } 6672 }
6674 6673
@@ -6749,6 +6748,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6749 extent_io_tree_init(&ei->io_tree, &inode->i_data); 6748 extent_io_tree_init(&ei->io_tree, &inode->i_data);
6750 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 6749 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
6751 mutex_init(&ei->log_mutex); 6750 mutex_init(&ei->log_mutex);
6751 mutex_init(&ei->delalloc_mutex);
6752 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6752 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
6753 INIT_LIST_HEAD(&ei->i_orphan); 6753 INIT_LIST_HEAD(&ei->i_orphan);
6754 INIT_LIST_HEAD(&ei->delalloc_inodes); 6754 INIT_LIST_HEAD(&ei->delalloc_inodes);
@@ -7074,7 +7074,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7074 btrfs_end_log_trans(root); 7074 btrfs_end_log_trans(root);
7075 } 7075 }
7076out_fail: 7076out_fail:
7077 btrfs_end_transaction_throttle(trans, root); 7077 btrfs_end_transaction(trans, root);
7078out_notrans: 7078out_notrans:
7079 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 7079 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
7080 up_read(&root->fs_info->subvol_sem); 7080 up_read(&root->fs_info->subvol_sem);
@@ -7246,7 +7246,7 @@ out_unlock:
7246 if (!err) 7246 if (!err)
7247 d_instantiate(dentry, inode); 7247 d_instantiate(dentry, inode);
7248 nr = trans->blocks_used; 7248 nr = trans->blocks_used;
7249 btrfs_end_transaction_throttle(trans, root); 7249 btrfs_end_transaction(trans, root);
7250 if (drop_inode) { 7250 if (drop_inode) {
7251 inode_dec_link_count(inode); 7251 inode_dec_link_count(inode);
7252 iput(inode); 7252 iput(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5441ff1480fd..ab620014bcc3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
176 struct btrfs_trans_handle *trans; 176 struct btrfs_trans_handle *trans;
177 unsigned int flags, oldflags; 177 unsigned int flags, oldflags;
178 int ret; 178 int ret;
179 u64 ip_oldflags;
180 unsigned int i_oldflags;
179 181
180 if (btrfs_root_readonly(root)) 182 if (btrfs_root_readonly(root))
181 return -EROFS; 183 return -EROFS;
@@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
192 194
193 mutex_lock(&inode->i_mutex); 195 mutex_lock(&inode->i_mutex);
194 196
197 ip_oldflags = ip->flags;
198 i_oldflags = inode->i_flags;
199
195 flags = btrfs_mask_flags(inode->i_mode, flags); 200 flags = btrfs_mask_flags(inode->i_mode, flags);
196 oldflags = btrfs_flags_to_ioctl(ip->flags); 201 oldflags = btrfs_flags_to_ioctl(ip->flags);
197 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 202 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -249,19 +254,24 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
249 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 254 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
250 } 255 }
251 256
252 trans = btrfs_join_transaction(root); 257 trans = btrfs_start_transaction(root, 1);
253 BUG_ON(IS_ERR(trans)); 258 if (IS_ERR(trans)) {
259 ret = PTR_ERR(trans);
260 goto out_drop;
261 }
254 262
255 btrfs_update_iflags(inode); 263 btrfs_update_iflags(inode);
256 inode->i_ctime = CURRENT_TIME; 264 inode->i_ctime = CURRENT_TIME;
257 ret = btrfs_update_inode(trans, root, inode); 265 ret = btrfs_update_inode(trans, root, inode);
258 BUG_ON(ret);
259 266
260 btrfs_end_transaction(trans, root); 267 btrfs_end_transaction(trans, root);
268 out_drop:
269 if (ret) {
270 ip->flags = ip_oldflags;
271 inode->i_flags = i_oldflags;
272 }
261 273
262 mnt_drop_write_file(file); 274 mnt_drop_write_file(file);
263
264 ret = 0;
265 out_unlock: 275 out_unlock:
266 mutex_unlock(&inode->i_mutex); 276 mutex_unlock(&inode->i_mutex);
267 return ret; 277 return ret;
@@ -276,14 +286,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
276 286
277static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) 287static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
278{ 288{
279 struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; 289 struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
280 struct btrfs_fs_info *fs_info = root->fs_info;
281 struct btrfs_device *device; 290 struct btrfs_device *device;
282 struct request_queue *q; 291 struct request_queue *q;
283 struct fstrim_range range; 292 struct fstrim_range range;
284 u64 minlen = ULLONG_MAX; 293 u64 minlen = ULLONG_MAX;
285 u64 num_devices = 0; 294 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 295 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
287 int ret; 296 int ret;
288 297
289 if (!capable(CAP_SYS_ADMIN)) 298 if (!capable(CAP_SYS_ADMIN))
@@ -312,7 +321,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
312 321
313 range.len = min(range.len, total_bytes - range.start); 322 range.len = min(range.len, total_bytes - range.start);
314 range.minlen = max(range.minlen, minlen); 323 range.minlen = max(range.minlen, minlen);
315 ret = btrfs_trim_fs(root, &range); 324 ret = btrfs_trim_fs(fs_info->tree_root, &range);
316 if (ret < 0) 325 if (ret < 0)
317 return ret; 326 return ret;
318 327
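
The fitrim handler above is reached through the generic FITRIM ioctl: the kernel clamps range.len to the filesystem size, raises range.minlen to the largest discard granularity among the devices, and on success reports the number of bytes trimmed back through range.len. A minimal user-space sketch of issuing the request; the mount point path is an assumption, while FITRIM and struct fstrim_range come from linux/fs.h:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>          /* FITRIM, struct fstrim_range */

int main(void)
{
        struct fstrim_range range;
        int fd = open("/mnt/btrfs", O_RDONLY);  /* assumed mount point */

        if (fd < 0) {
                perror("open");
                return 1;
        }

        memset(&range, 0, sizeof(range));
        range.start = 0;
        range.len = UINT64_MAX;   /* whole filesystem */
        range.minlen = 0;         /* kernel raises this to the device minimum */

        if (ioctl(fd, FITRIM, &range) < 0) {
                perror("FITRIM");
                close(fd);
                return 1;
        }

        printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        close(fd);
        return 0;
}
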
@@ -358,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root,
358 return PTR_ERR(trans); 367 return PTR_ERR(trans);
359 368
360 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 369 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
361 0, objectid, NULL, 0, 0, 0); 370 0, objectid, NULL, 0, 0, 0, 0);
362 if (IS_ERR(leaf)) { 371 if (IS_ERR(leaf)) {
363 ret = PTR_ERR(leaf); 372 ret = PTR_ERR(leaf);
364 goto fail; 373 goto fail;
@@ -858,10 +867,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
858 return 0; 867 return 0;
859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 868 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
860 869
861 mutex_lock(&inode->i_mutex);
862 ret = btrfs_delalloc_reserve_space(inode, 870 ret = btrfs_delalloc_reserve_space(inode,
863 num_pages << PAGE_CACHE_SHIFT); 871 num_pages << PAGE_CACHE_SHIFT);
864 mutex_unlock(&inode->i_mutex);
865 if (ret) 872 if (ret)
866 return ret; 873 return ret;
867again: 874again:
@@ -1203,13 +1210,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1203 if (!capable(CAP_SYS_ADMIN)) 1210 if (!capable(CAP_SYS_ADMIN))
1204 return -EPERM; 1211 return -EPERM;
1205 1212
1213 mutex_lock(&root->fs_info->volume_mutex);
1214 if (root->fs_info->balance_ctl) {
1215 printk(KERN_INFO "btrfs: balance in progress\n");
1216 ret = -EINVAL;
1217 goto out;
1218 }
1219
1206 vol_args = memdup_user(arg, sizeof(*vol_args)); 1220 vol_args = memdup_user(arg, sizeof(*vol_args));
1207 if (IS_ERR(vol_args)) 1221 if (IS_ERR(vol_args)) {
1208 return PTR_ERR(vol_args); 1222 ret = PTR_ERR(vol_args);
1223 goto out;
1224 }
1209 1225
1210 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 1226 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1211 1227
1212 mutex_lock(&root->fs_info->volume_mutex);
1213 sizestr = vol_args->name; 1228 sizestr = vol_args->name;
1214 devstr = strchr(sizestr, ':'); 1229 devstr = strchr(sizestr, ':');
1215 if (devstr) { 1230 if (devstr) {
@@ -1226,7 +1241,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1226 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1241 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1227 (unsigned long long)devid); 1242 (unsigned long long)devid);
1228 ret = -EINVAL; 1243 ret = -EINVAL;
1229 goto out_unlock; 1244 goto out_free;
1230 } 1245 }
1231 if (!strcmp(sizestr, "max")) 1246 if (!strcmp(sizestr, "max"))
1232 new_size = device->bdev->bd_inode->i_size; 1247 new_size = device->bdev->bd_inode->i_size;
@@ -1241,7 +1256,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1241 new_size = memparse(sizestr, NULL); 1256 new_size = memparse(sizestr, NULL);
1242 if (new_size == 0) { 1257 if (new_size == 0) {
1243 ret = -EINVAL; 1258 ret = -EINVAL;
1244 goto out_unlock; 1259 goto out_free;
1245 } 1260 }
1246 } 1261 }
1247 1262
@@ -1250,7 +1265,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1250 if (mod < 0) { 1265 if (mod < 0) {
1251 if (new_size > old_size) { 1266 if (new_size > old_size) {
1252 ret = -EINVAL; 1267 ret = -EINVAL;
1253 goto out_unlock; 1268 goto out_free;
1254 } 1269 }
1255 new_size = old_size - new_size; 1270 new_size = old_size - new_size;
1256 } else if (mod > 0) { 1271 } else if (mod > 0) {
@@ -1259,11 +1274,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1259 1274
1260 if (new_size < 256 * 1024 * 1024) { 1275 if (new_size < 256 * 1024 * 1024) {
1261 ret = -EINVAL; 1276 ret = -EINVAL;
1262 goto out_unlock; 1277 goto out_free;
1263 } 1278 }
1264 if (new_size > device->bdev->bd_inode->i_size) { 1279 if (new_size > device->bdev->bd_inode->i_size) {
1265 ret = -EFBIG; 1280 ret = -EFBIG;
1266 goto out_unlock; 1281 goto out_free;
1267 } 1282 }
1268 1283
1269 do_div(new_size, root->sectorsize); 1284 do_div(new_size, root->sectorsize);
@@ -1276,7 +1291,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1276 trans = btrfs_start_transaction(root, 0); 1291 trans = btrfs_start_transaction(root, 0);
1277 if (IS_ERR(trans)) { 1292 if (IS_ERR(trans)) {
1278 ret = PTR_ERR(trans); 1293 ret = PTR_ERR(trans);
1279 goto out_unlock; 1294 goto out_free;
1280 } 1295 }
1281 ret = btrfs_grow_device(trans, device, new_size); 1296 ret = btrfs_grow_device(trans, device, new_size);
1282 btrfs_commit_transaction(trans, root); 1297 btrfs_commit_transaction(trans, root);
@@ -1284,9 +1299,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1284 ret = btrfs_shrink_device(device, new_size); 1299 ret = btrfs_shrink_device(device, new_size);
1285 } 1300 }
1286 1301
1287out_unlock: 1302out_free:
1288 mutex_unlock(&root->fs_info->volume_mutex);
1289 kfree(vol_args); 1303 kfree(vol_args);
1304out:
1305 mutex_unlock(&root->fs_info->volume_mutex);
1290 return ret; 1306 return ret;
1291} 1307}
1292 1308
@@ -2052,14 +2068,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2052 if (!capable(CAP_SYS_ADMIN)) 2068 if (!capable(CAP_SYS_ADMIN))
2053 return -EPERM; 2069 return -EPERM;
2054 2070
2071 mutex_lock(&root->fs_info->volume_mutex);
2072 if (root->fs_info->balance_ctl) {
2073 printk(KERN_INFO "btrfs: balance in progress\n");
2074 ret = -EINVAL;
2075 goto out;
2076 }
2077
2055 vol_args = memdup_user(arg, sizeof(*vol_args)); 2078 vol_args = memdup_user(arg, sizeof(*vol_args));
2056 if (IS_ERR(vol_args)) 2079 if (IS_ERR(vol_args)) {
2057 return PTR_ERR(vol_args); 2080 ret = PTR_ERR(vol_args);
2081 goto out;
2082 }
2058 2083
2059 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2084 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2060 ret = btrfs_init_new_device(root, vol_args->name); 2085 ret = btrfs_init_new_device(root, vol_args->name);
2061 2086
2062 kfree(vol_args); 2087 kfree(vol_args);
2088out:
2089 mutex_unlock(&root->fs_info->volume_mutex);
2063 return ret; 2090 return ret;
2064} 2091}
2065 2092
@@ -2074,14 +2101,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2074 if (root->fs_info->sb->s_flags & MS_RDONLY) 2101 if (root->fs_info->sb->s_flags & MS_RDONLY)
2075 return -EROFS; 2102 return -EROFS;
2076 2103
2104 mutex_lock(&root->fs_info->volume_mutex);
2105 if (root->fs_info->balance_ctl) {
2106 printk(KERN_INFO "btrfs: balance in progress\n");
2107 ret = -EINVAL;
2108 goto out;
2109 }
2110
2077 vol_args = memdup_user(arg, sizeof(*vol_args)); 2111 vol_args = memdup_user(arg, sizeof(*vol_args));
2078 if (IS_ERR(vol_args)) 2112 if (IS_ERR(vol_args)) {
2079 return PTR_ERR(vol_args); 2113 ret = PTR_ERR(vol_args);
2114 goto out;
2115 }
2080 2116
2081 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2117 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2082 ret = btrfs_rm_device(root, vol_args->name); 2118 ret = btrfs_rm_device(root, vol_args->name);
2083 2119
2084 kfree(vol_args); 2120 kfree(vol_args);
2121out:
2122 mutex_unlock(&root->fs_info->volume_mutex);
2085 return ret; 2123 return ret;
2086} 2124}
2087 2125
@@ -2427,7 +2465,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2427 disko, diskl, 0, 2465 disko, diskl, 0,
2428 root->root_key.objectid, 2466 root->root_key.objectid,
2429 btrfs_ino(inode), 2467 btrfs_ino(inode),
2430 new_key.offset - datao); 2468 new_key.offset - datao,
2469 0);
2431 BUG_ON(ret); 2470 BUG_ON(ret);
2432 } 2471 }
2433 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 2472 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -2977,7 +3016,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2977{ 3016{
2978 int ret = 0; 3017 int ret = 0;
2979 int size; 3018 int size;
2980 u64 extent_offset; 3019 u64 extent_item_pos;
2981 struct btrfs_ioctl_logical_ino_args *loi; 3020 struct btrfs_ioctl_logical_ino_args *loi;
2982 struct btrfs_data_container *inodes = NULL; 3021 struct btrfs_data_container *inodes = NULL;
2983 struct btrfs_path *path = NULL; 3022 struct btrfs_path *path = NULL;
@@ -3008,15 +3047,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3008 } 3047 }
3009 3048
3010 ret = extent_from_logical(root->fs_info, loi->logical, path, &key); 3049 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3050 btrfs_release_path(path);
3011 3051
3012 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) 3052 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3013 ret = -ENOENT; 3053 ret = -ENOENT;
3014 if (ret < 0) 3054 if (ret < 0)
3015 goto out; 3055 goto out;
3016 3056
3017 extent_offset = loi->logical - key.objectid; 3057 extent_item_pos = loi->logical - key.objectid;
3018 ret = iterate_extent_inodes(root->fs_info, path, key.objectid, 3058 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3019 extent_offset, build_ino_list, inodes); 3059 extent_item_pos, build_ino_list,
3060 inodes);
3020 3061
3021 if (ret < 0) 3062 if (ret < 0)
3022 goto out; 3063 goto out;
@@ -3034,6 +3075,163 @@ out:
3034 return ret; 3075 return ret;
3035} 3076}
3036 3077
3078void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
3079 struct btrfs_ioctl_balance_args *bargs)
3080{
3081 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3082
3083 bargs->flags = bctl->flags;
3084
3085 if (atomic_read(&fs_info->balance_running))
3086 bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
3087 if (atomic_read(&fs_info->balance_pause_req))
3088 bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
3089 if (atomic_read(&fs_info->balance_cancel_req))
3090 bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
3091
3092 memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
3093 memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
3094 memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
3095
3096 if (lock) {
3097 spin_lock(&fs_info->balance_lock);
3098 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3099 spin_unlock(&fs_info->balance_lock);
3100 } else {
3101 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3102 }
3103}
3104
3105static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
3106{
3107 struct btrfs_fs_info *fs_info = root->fs_info;
3108 struct btrfs_ioctl_balance_args *bargs;
3109 struct btrfs_balance_control *bctl;
3110 int ret;
3111
3112 if (!capable(CAP_SYS_ADMIN))
3113 return -EPERM;
3114
3115 if (fs_info->sb->s_flags & MS_RDONLY)
3116 return -EROFS;
3117
3118 mutex_lock(&fs_info->volume_mutex);
3119 mutex_lock(&fs_info->balance_mutex);
3120
3121 if (arg) {
3122 bargs = memdup_user(arg, sizeof(*bargs));
3123 if (IS_ERR(bargs)) {
3124 ret = PTR_ERR(bargs);
3125 goto out;
3126 }
3127
3128 if (bargs->flags & BTRFS_BALANCE_RESUME) {
3129 if (!fs_info->balance_ctl) {
3130 ret = -ENOTCONN;
3131 goto out_bargs;
3132 }
3133
3134 bctl = fs_info->balance_ctl;
3135 spin_lock(&fs_info->balance_lock);
3136 bctl->flags |= BTRFS_BALANCE_RESUME;
3137 spin_unlock(&fs_info->balance_lock);
3138
3139 goto do_balance;
3140 }
3141 } else {
3142 bargs = NULL;
3143 }
3144
3145 if (fs_info->balance_ctl) {
3146 ret = -EINPROGRESS;
3147 goto out_bargs;
3148 }
3149
3150 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3151 if (!bctl) {
3152 ret = -ENOMEM;
3153 goto out_bargs;
3154 }
3155
3156 bctl->fs_info = fs_info;
3157 if (arg) {
3158 memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
3159 memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
3160 memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
3161
3162 bctl->flags = bargs->flags;
3163 } else {
3164 /* balance everything - no filters */
3165 bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
3166 }
3167
3168do_balance:
3169 ret = btrfs_balance(bctl, bargs);
3170 /*
3171 * bctl is freed in __cancel_balance or in free_fs_info if
3172 * restriper was paused all the way until unmount
3173 */
3174 if (arg) {
3175 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3176 ret = -EFAULT;
3177 }
3178
3179out_bargs:
3180 kfree(bargs);
3181out:
3182 mutex_unlock(&fs_info->balance_mutex);
3183 mutex_unlock(&fs_info->volume_mutex);
3184 return ret;
3185}
3186
3187static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
3188{
3189 if (!capable(CAP_SYS_ADMIN))
3190 return -EPERM;
3191
3192 switch (cmd) {
3193 case BTRFS_BALANCE_CTL_PAUSE:
3194 return btrfs_pause_balance(root->fs_info);
3195 case BTRFS_BALANCE_CTL_CANCEL:
3196 return btrfs_cancel_balance(root->fs_info);
3197 }
3198
3199 return -EINVAL;
3200}
3201
3202static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
3203 void __user *arg)
3204{
3205 struct btrfs_fs_info *fs_info = root->fs_info;
3206 struct btrfs_ioctl_balance_args *bargs;
3207 int ret = 0;
3208
3209 if (!capable(CAP_SYS_ADMIN))
3210 return -EPERM;
3211
3212 mutex_lock(&fs_info->balance_mutex);
3213 if (!fs_info->balance_ctl) {
3214 ret = -ENOTCONN;
3215 goto out;
3216 }
3217
3218 bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
3219 if (!bargs) {
3220 ret = -ENOMEM;
3221 goto out;
3222 }
3223
3224 update_ioctl_balance_args(fs_info, 1, bargs);
3225
3226 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3227 ret = -EFAULT;
3228
3229 kfree(bargs);
3230out:
3231 mutex_unlock(&fs_info->balance_mutex);
3232 return ret;
3233}
3234
3037long btrfs_ioctl(struct file *file, unsigned int 3235long btrfs_ioctl(struct file *file, unsigned int
3038 cmd, unsigned long arg) 3236 cmd, unsigned long arg)
3039{ 3237{
@@ -3078,7 +3276,7 @@ long btrfs_ioctl(struct file *file, unsigned int
3078 case BTRFS_IOC_DEV_INFO: 3276 case BTRFS_IOC_DEV_INFO:
3079 return btrfs_ioctl_dev_info(root, argp); 3277 return btrfs_ioctl_dev_info(root, argp);
3080 case BTRFS_IOC_BALANCE: 3278 case BTRFS_IOC_BALANCE:
3081 return btrfs_balance(root->fs_info->dev_root); 3279 return btrfs_ioctl_balance(root, NULL);
3082 case BTRFS_IOC_CLONE: 3280 case BTRFS_IOC_CLONE:
3083 return btrfs_ioctl_clone(file, arg, 0, 0, 0); 3281 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
3084 case BTRFS_IOC_CLONE_RANGE: 3282 case BTRFS_IOC_CLONE_RANGE:
@@ -3110,6 +3308,12 @@ long btrfs_ioctl(struct file *file, unsigned int
3110 return btrfs_ioctl_scrub_cancel(root, argp); 3308 return btrfs_ioctl_scrub_cancel(root, argp);
3111 case BTRFS_IOC_SCRUB_PROGRESS: 3309 case BTRFS_IOC_SCRUB_PROGRESS:
3112 return btrfs_ioctl_scrub_progress(root, argp); 3310 return btrfs_ioctl_scrub_progress(root, argp);
3311 case BTRFS_IOC_BALANCE_V2:
3312 return btrfs_ioctl_balance(root, argp);
3313 case BTRFS_IOC_BALANCE_CTL:
3314 return btrfs_ioctl_balance_ctl(root, arg);
3315 case BTRFS_IOC_BALANCE_PROGRESS:
3316 return btrfs_ioctl_balance_progress(root, argp);
3113 } 3317 }
3114 3318
3115 return -ENOTTY; 3319 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 252ae9915de8..4f69028a68c4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args {
109 __u64 reserved[124]; /* pad to 1k */ 109 __u64 reserved[124]; /* pad to 1k */
110}; 110};
111 111
112/* balance control ioctl modes */
113#define BTRFS_BALANCE_CTL_PAUSE 1
114#define BTRFS_BALANCE_CTL_CANCEL 2
115
116/*
117 * this is packed, because it should be exactly the same as its disk
118 * byte order counterpart (struct btrfs_disk_balance_args)
119 */
120struct btrfs_balance_args {
121 __u64 profiles;
122 __u64 usage;
123 __u64 devid;
124 __u64 pstart;
125 __u64 pend;
126 __u64 vstart;
127 __u64 vend;
128
129 __u64 target;
130
131 __u64 flags;
132
133 __u64 unused[8];
134} __attribute__ ((__packed__));
135
136/* report balance progress to userspace */
137struct btrfs_balance_progress {
138 __u64 expected; /* estimated # of chunks that will be
139 * relocated to fulfill the request */
140 __u64 considered; /* # of chunks we have considered so far */
141 __u64 completed; /* # of chunks relocated so far */
142};
143
144#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
145#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
146#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
147
148struct btrfs_ioctl_balance_args {
149 __u64 flags; /* in/out */
150 __u64 state; /* out */
151
152 struct btrfs_balance_args data; /* in/out */
153 struct btrfs_balance_args meta; /* in/out */
154 struct btrfs_balance_args sys; /* in/out */
155
156 struct btrfs_balance_progress stat; /* out */
157
158 __u64 unused[72]; /* pad to 1k */
159};
160
112#define BTRFS_INO_LOOKUP_PATH_MAX 4080 161#define BTRFS_INO_LOOKUP_PATH_MAX 4080
113struct btrfs_ioctl_ino_lookup_args { 162struct btrfs_ioctl_ino_lookup_args {
114 __u64 treeid; 163 __u64 treeid;
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args {
272 struct btrfs_ioctl_dev_info_args) 321 struct btrfs_ioctl_dev_info_args)
273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 322#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
274 struct btrfs_ioctl_fs_info_args) 323 struct btrfs_ioctl_fs_info_args)
324#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
325 struct btrfs_ioctl_balance_args)
326#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
327#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
328 struct btrfs_ioctl_balance_args)
275#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ 329#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
276 struct btrfs_ioctl_ino_path_args) 330 struct btrfs_ioctl_ino_path_args)
277#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ 331#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
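
The structures and ioctl numbers added above make the restriper observable from user space: BTRFS_IOC_BALANCE_V2 starts or resumes a balance, BTRFS_IOC_BALANCE_CTL pauses or cancels it, and BTRFS_IOC_BALANCE_PROGRESS copies out the current stat counters, returning ENOTCONN when no balance is loaded. A hedged sketch of a progress poller, assuming the header above is installed as btrfs/ioctl.h and that the opened path is a btrfs mount point (CAP_SYS_ADMIN is required):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "btrfs/ioctl.h"       /* assumed install path of the header above */

int main(void)
{
        struct btrfs_ioctl_balance_args args;
        int fd = open("/mnt/btrfs", O_RDONLY);  /* assumed mount point */

        if (fd < 0) {
                perror("open");
                return 1;
        }

        memset(&args, 0, sizeof(args));
        if (ioctl(fd, BTRFS_IOC_BALANCE_PROGRESS, &args) < 0) {
                if (errno == ENOTCONN)
                        fprintf(stderr, "no balance in progress\n");
                else
                        perror("BTRFS_IOC_BALANCE_PROGRESS");
                close(fd);
                return 1;
        }

        printf("balance %s: %llu chunks considered, %llu of ~%llu relocated\n",
               (args.state & BTRFS_BALANCE_STATE_RUNNING) ? "running" : "paused",
               (unsigned long long)args.stat.considered,
               (unsigned long long)args.stat.completed,
               (unsigned long long)args.stat.expected);
        close(fd);
        return 0;
}
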
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d77b67c4b275..5e178d8f7167 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
33 */ 33 */
34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) 34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
35{ 35{
36 if (eb->lock_nested) {
37 read_lock(&eb->lock);
38 if (eb->lock_nested && current->pid == eb->lock_owner) {
39 read_unlock(&eb->lock);
40 return;
41 }
42 read_unlock(&eb->lock);
43 }
36 if (rw == BTRFS_WRITE_LOCK) { 44 if (rw == BTRFS_WRITE_LOCK) {
37 if (atomic_read(&eb->blocking_writers) == 0) { 45 if (atomic_read(&eb->blocking_writers) == 0) {
38 WARN_ON(atomic_read(&eb->spinning_writers) != 1); 46 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
57 */ 65 */
58void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) 66void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
59{ 67{
68 if (eb->lock_nested) {
69 read_lock(&eb->lock);
 69 if (eb->lock_nested && current->pid == eb->lock_owner) {
71 read_unlock(&eb->lock);
72 return;
73 }
74 read_unlock(&eb->lock);
75 }
60 if (rw == BTRFS_WRITE_LOCK_BLOCKING) { 76 if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
61 BUG_ON(atomic_read(&eb->blocking_writers) != 1); 77 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
62 write_lock(&eb->lock); 78 write_lock(&eb->lock);
@@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
81void btrfs_tree_read_lock(struct extent_buffer *eb) 97void btrfs_tree_read_lock(struct extent_buffer *eb)
82{ 98{
83again: 99again:
100 read_lock(&eb->lock);
101 if (atomic_read(&eb->blocking_writers) &&
102 current->pid == eb->lock_owner) {
103 /*
104 * This extent is already write-locked by our thread. We allow
105 * an additional read lock to be added because it's for the same
106 * thread. btrfs_find_all_roots() depends on this as it may be
107 * called on a partly (write-)locked tree.
108 */
109 BUG_ON(eb->lock_nested);
110 eb->lock_nested = 1;
111 read_unlock(&eb->lock);
112 return;
113 }
114 read_unlock(&eb->lock);
84 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); 115 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
85 read_lock(&eb->lock); 116 read_lock(&eb->lock);
86 if (atomic_read(&eb->blocking_writers)) { 117 if (atomic_read(&eb->blocking_writers)) {
87 read_unlock(&eb->lock); 118 read_unlock(&eb->lock);
88 wait_event(eb->write_lock_wq,
89 atomic_read(&eb->blocking_writers) == 0);
90 goto again; 119 goto again;
91 } 120 }
92 atomic_inc(&eb->read_locks); 121 atomic_inc(&eb->read_locks);
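
The comment in the hunk above covers the one case where a read lock is granted while a blocking writer is outstanding: the requester is the same thread that holds the write lock, so btrfs_find_all_roots() can walk onto a tree it has already partly write-locked. A simplified single-nesting user-space model of that policy (pthread-based; the names and types are illustrative, not the btrfs locking API):

#include <pthread.h>

/* Toy stand-in for an extent buffer's lock state. */
struct eb_model {
        pthread_rwlock_t lock;
        pthread_t lock_owner;   /* thread holding the outstanding write lock */
        int blocking_writers;   /* that write lock has gone "blocking" */
        int lock_nested;        /* owner re-entered as a reader */
};

static void model_read_lock(struct eb_model *eb)
{
        if (eb->blocking_writers &&
            pthread_equal(eb->lock_owner, pthread_self())) {
                /* the calling thread already write-locks this buffer:
                 * record a nested read lock and return instead of blocking */
                eb->lock_nested = 1;
                return;
        }
        pthread_rwlock_rdlock(&eb->lock);
}

static void model_read_unlock(struct eb_model *eb)
{
        if (eb->lock_nested &&
            pthread_equal(eb->lock_owner, pthread_self())) {
                eb->lock_nested = 0;    /* nothing was actually taken */
                return;
        }
        pthread_rwlock_unlock(&eb->lock);
}
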
@@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
129 } 158 }
130 atomic_inc(&eb->write_locks); 159 atomic_inc(&eb->write_locks);
131 atomic_inc(&eb->spinning_writers); 160 atomic_inc(&eb->spinning_writers);
161 eb->lock_owner = current->pid;
132 return 1; 162 return 1;
133} 163}
134 164
@@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
137 */ 167 */
138void btrfs_tree_read_unlock(struct extent_buffer *eb) 168void btrfs_tree_read_unlock(struct extent_buffer *eb)
139{ 169{
170 if (eb->lock_nested) {
171 read_lock(&eb->lock);
172 if (eb->lock_nested && current->pid == eb->lock_owner) {
173 eb->lock_nested = 0;
174 read_unlock(&eb->lock);
175 return;
176 }
177 read_unlock(&eb->lock);
178 }
140 btrfs_assert_tree_read_locked(eb); 179 btrfs_assert_tree_read_locked(eb);
141 WARN_ON(atomic_read(&eb->spinning_readers) == 0); 180 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
142 atomic_dec(&eb->spinning_readers); 181 atomic_dec(&eb->spinning_readers);
@@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
149 */ 188 */
150void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) 189void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
151{ 190{
191 if (eb->lock_nested) {
192 read_lock(&eb->lock);
193 if (eb->lock_nested && current->pid == eb->lock_owner) {
194 eb->lock_nested = 0;
195 read_unlock(&eb->lock);
196 return;
197 }
198 read_unlock(&eb->lock);
199 }
152 btrfs_assert_tree_read_locked(eb); 200 btrfs_assert_tree_read_locked(eb);
153 WARN_ON(atomic_read(&eb->blocking_readers) == 0); 201 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
154 if (atomic_dec_and_test(&eb->blocking_readers)) 202 if (atomic_dec_and_test(&eb->blocking_readers))
@@ -181,6 +229,7 @@ again:
181 WARN_ON(atomic_read(&eb->spinning_writers)); 229 WARN_ON(atomic_read(&eb->spinning_writers));
182 atomic_inc(&eb->spinning_writers); 230 atomic_inc(&eb->spinning_writers);
183 atomic_inc(&eb->write_locks); 231 atomic_inc(&eb->write_locks);
232 eb->lock_owner = current->pid;
184 return 0; 233 return 0;
185} 234}
186 235
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cfb55434a469..8c1aae2c845d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
1604 ret = btrfs_inc_extent_ref(trans, root, new_bytenr, 1604 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1605 num_bytes, parent, 1605 num_bytes, parent,
1606 btrfs_header_owner(leaf), 1606 btrfs_header_owner(leaf),
1607 key.objectid, key.offset); 1607 key.objectid, key.offset, 1);
1608 BUG_ON(ret); 1608 BUG_ON(ret);
1609 1609
1610 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1610 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1611 parent, btrfs_header_owner(leaf), 1611 parent, btrfs_header_owner(leaf),
1612 key.objectid, key.offset); 1612 key.objectid, key.offset, 1);
1613 BUG_ON(ret); 1613 BUG_ON(ret);
1614 } 1614 }
1615 if (dirty) 1615 if (dirty)
@@ -1778,21 +1778,23 @@ again:
1778 1778
1779 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, 1779 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1780 path->nodes[level]->start, 1780 path->nodes[level]->start,
1781 src->root_key.objectid, level - 1, 0); 1781 src->root_key.objectid, level - 1, 0,
1782 1);
1782 BUG_ON(ret); 1783 BUG_ON(ret);
1783 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 1784 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1784 0, dest->root_key.objectid, level - 1, 1785 0, dest->root_key.objectid, level - 1,
1785 0); 1786 0, 1);
1786 BUG_ON(ret); 1787 BUG_ON(ret);
1787 1788
1788 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, 1789 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1789 path->nodes[level]->start, 1790 path->nodes[level]->start,
1790 src->root_key.objectid, level - 1, 0); 1791 src->root_key.objectid, level - 1, 0,
1792 1);
1791 BUG_ON(ret); 1793 BUG_ON(ret);
1792 1794
1793 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 1795 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1794 0, dest->root_key.objectid, level - 1, 1796 0, dest->root_key.objectid, level - 1,
1795 0); 1797 0, 1);
1796 BUG_ON(ret); 1798 BUG_ON(ret);
1797 1799
1798 btrfs_unlock_up_safe(path, 0); 1800 btrfs_unlock_up_safe(path, 0);
@@ -2244,7 +2246,7 @@ again:
2244 } else { 2246 } else {
2245 list_del_init(&reloc_root->root_list); 2247 list_del_init(&reloc_root->root_list);
2246 } 2248 }
2247 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); 2249 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2248 } 2250 }
2249 2251
2250 if (found) { 2252 if (found) {
@@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2558 node->eb->start, blocksize, 2560 node->eb->start, blocksize,
2559 upper->eb->start, 2561 upper->eb->start,
2560 btrfs_header_owner(upper->eb), 2562 btrfs_header_owner(upper->eb),
2561 node->level, 0); 2563 node->level, 0, 1);
2562 BUG_ON(ret); 2564 BUG_ON(ret);
2563 2565
2564 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2566 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2947,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2949 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2950 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2949 while (index <= last_index) { 2951 while (index <= last_index) {
2950 mutex_lock(&inode->i_mutex);
2951 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); 2952 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2952 mutex_unlock(&inode->i_mutex);
2953 if (ret) 2953 if (ret)
2954 goto out; 2954 goto out;
2955 2955
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ddf2c90d3fc0..9770cc5bfb76 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "backref.h" 26#include "backref.h"
27#include "extent_io.h" 27#include "extent_io.h"
28#include "check-integrity.h"
28 29
29/* 30/*
30 * This is only the first step towards a full-features scrub. It reads all 31 * This is only the first step towards a full-features scrub. It reads all
@@ -309,7 +310,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
309 u8 ref_level; 310 u8 ref_level;
310 unsigned long ptr = 0; 311 unsigned long ptr = 0;
311 const int bufsize = 4096; 312 const int bufsize = 4096;
312 u64 extent_offset; 313 u64 extent_item_pos;
313 314
314 path = btrfs_alloc_path(); 315 path = btrfs_alloc_path();
315 316
@@ -329,12 +330,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
329 if (ret < 0) 330 if (ret < 0)
330 goto out; 331 goto out;
331 332
332 extent_offset = swarn.logical - found_key.objectid; 333 extent_item_pos = swarn.logical - found_key.objectid;
333 swarn.extent_item_size = found_key.offset; 334 swarn.extent_item_size = found_key.offset;
334 335
335 eb = path->nodes[0]; 336 eb = path->nodes[0];
336 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 337 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
337 item_size = btrfs_item_size_nr(eb, path->slots[0]); 338 item_size = btrfs_item_size_nr(eb, path->slots[0]);
339 btrfs_release_path(path);
338 340
339 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 341 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
340 do { 342 do {
@@ -351,7 +353,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
351 } else { 353 } else {
352 swarn.path = path; 354 swarn.path = path;
353 iterate_extent_inodes(fs_info, path, found_key.objectid, 355 iterate_extent_inodes(fs_info, path, found_key.objectid,
354 extent_offset, 356 extent_item_pos,
355 scrub_print_warning_inode, &swarn); 357 scrub_print_warning_inode, &swarn);
356 } 358 }
357 359
@@ -732,7 +734,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
732 bio_add_page(bio, page, PAGE_SIZE, 0); 734 bio_add_page(bio, page, PAGE_SIZE, 0);
733 bio->bi_end_io = scrub_fixup_end_io; 735 bio->bi_end_io = scrub_fixup_end_io;
734 bio->bi_private = &complete; 736 bio->bi_private = &complete;
735 submit_bio(rw, bio); 737 btrfsic_submit_bio(rw, bio);
736 738
737 /* this will also unplug the queue */ 739 /* this will also unplug the queue */
738 wait_for_completion(&complete); 740 wait_for_completion(&complete);
@@ -958,7 +960,7 @@ static int scrub_submit(struct scrub_dev *sdev)
958 sdev->curr = -1; 960 sdev->curr = -1;
959 atomic_inc(&sdev->in_flight); 961 atomic_inc(&sdev->in_flight);
960 962
961 submit_bio(READ, sbio->bio); 963 btrfsic_submit_bio(READ, sbio->bio);
962 964
963 return 0; 965 return 0;
964} 966}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ae488aa1966a..3ce97b217cbe 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -147,13 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
147 147
148static void btrfs_put_super(struct super_block *sb) 148static void btrfs_put_super(struct super_block *sb)
149{ 149{
150 struct btrfs_root *root = btrfs_sb(sb); 150 (void)close_ctree(btrfs_sb(sb)->tree_root);
151 int ret; 151 /* FIXME: need to fix VFS to return error? */
152 152 /* AV: return it _where_? ->put_super() can be triggered by any number
153 ret = close_ctree(root); 153 * of async events, up to and including delivery of SIGKILL to the
154 sb->s_fs_info = NULL; 154 * last process that kept it busy. Or segfault in the aforementioned
155 155 * process... Whom would you report that to?
156 (void)ret; /* FIXME: need to fix VFS to return error? */ 156 */
157} 157}
158 158
159enum { 159enum {
@@ -163,8 +163,11 @@ enum {
163 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 163 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, 167 Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
168 Opt_check_integrity, Opt_check_integrity_including_extent_data,
169 Opt_check_integrity_print_mask,
170 Opt_err,
168}; 171};
169 172
170static match_table_t tokens = { 173static match_table_t tokens = {
@@ -199,6 +202,10 @@ static match_table_t tokens = {
199 {Opt_inode_cache, "inode_cache"}, 202 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "nospace_cache"}, 203 {Opt_no_space_cache, "nospace_cache"},
201 {Opt_recovery, "recovery"}, 204 {Opt_recovery, "recovery"},
205 {Opt_skip_balance, "skip_balance"},
206 {Opt_check_integrity, "check_int"},
207 {Opt_check_integrity_including_extent_data, "check_int_data"},
208 {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
202 {Opt_err, NULL}, 209 {Opt_err, NULL},
203}; 210};
204 211
@@ -397,6 +404,40 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
397 printk(KERN_INFO "btrfs: enabling auto recovery"); 404 printk(KERN_INFO "btrfs: enabling auto recovery");
398 btrfs_set_opt(info->mount_opt, RECOVERY); 405 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break; 406 break;
407 case Opt_skip_balance:
408 btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
409 break;
410#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
411 case Opt_check_integrity_including_extent_data:
412 printk(KERN_INFO "btrfs: enabling check integrity"
413 " including extent data\n");
414 btrfs_set_opt(info->mount_opt,
415 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
416 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
417 break;
418 case Opt_check_integrity:
419 printk(KERN_INFO "btrfs: enabling check integrity\n");
420 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
421 break;
422 case Opt_check_integrity_print_mask:
423 intarg = 0;
424 match_int(&args[0], &intarg);
425 if (intarg) {
426 info->check_integrity_print_mask = intarg;
427 printk(KERN_INFO "btrfs:"
428 " check_integrity_print_mask 0x%x\n",
429 info->check_integrity_print_mask);
430 }
431 break;
432#else
433 case Opt_check_integrity_including_extent_data:
434 case Opt_check_integrity:
435 case Opt_check_integrity_print_mask:
436 printk(KERN_ERR "btrfs: support for check_integrity*"
437 " not compiled in!\n");
438 ret = -EINVAL;
439 goto out;
440#endif
400 case Opt_err: 441 case Opt_err:
401 printk(KERN_INFO "btrfs: unrecognized mount option " 442 printk(KERN_INFO "btrfs: unrecognized mount option "
402 "'%s'\n", p); 443 "'%s'\n", p);
@@ -500,7 +541,8 @@ out:
500static struct dentry *get_default_root(struct super_block *sb, 541static struct dentry *get_default_root(struct super_block *sb,
501 u64 subvol_objectid) 542 u64 subvol_objectid)
502{ 543{
503 struct btrfs_root *root = sb->s_fs_info; 544 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
545 struct btrfs_root *root = fs_info->tree_root;
504 struct btrfs_root *new_root; 546 struct btrfs_root *new_root;
505 struct btrfs_dir_item *di; 547 struct btrfs_dir_item *di;
506 struct btrfs_path *path; 548 struct btrfs_path *path;
@@ -530,7 +572,7 @@ static struct dentry *get_default_root(struct super_block *sb,
530 * will mount by default if we haven't been given a specific subvolume 572 * will mount by default if we haven't been given a specific subvolume
531 * to mount. 573 * to mount.
532 */ 574 */
533 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 575 dir_id = btrfs_super_root_dir(fs_info->super_copy);
534 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 576 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
535 if (IS_ERR(di)) { 577 if (IS_ERR(di)) {
536 btrfs_free_path(path); 578 btrfs_free_path(path);
@@ -544,7 +586,7 @@ static struct dentry *get_default_root(struct super_block *sb,
544 */ 586 */
545 btrfs_free_path(path); 587 btrfs_free_path(path);
546 dir_id = BTRFS_FIRST_FREE_OBJECTID; 588 dir_id = BTRFS_FIRST_FREE_OBJECTID;
547 new_root = root->fs_info->fs_root; 589 new_root = fs_info->fs_root;
548 goto setup_root; 590 goto setup_root;
549 } 591 }
550 592
@@ -552,7 +594,7 @@ static struct dentry *get_default_root(struct super_block *sb,
552 btrfs_free_path(path); 594 btrfs_free_path(path);
553 595
554find_root: 596find_root:
555 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 597 new_root = btrfs_read_fs_root_no_name(fs_info, &location);
556 if (IS_ERR(new_root)) 598 if (IS_ERR(new_root))
557 return ERR_CAST(new_root); 599 return ERR_CAST(new_root);
558 600
@@ -588,7 +630,7 @@ static int btrfs_fill_super(struct super_block *sb,
588{ 630{
589 struct inode *inode; 631 struct inode *inode;
590 struct dentry *root_dentry; 632 struct dentry *root_dentry;
591 struct btrfs_root *tree_root; 633 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
592 struct btrfs_key key; 634 struct btrfs_key key;
593 int err; 635 int err;
594 636
@@ -603,18 +645,16 @@ static int btrfs_fill_super(struct super_block *sb,
603 sb->s_flags |= MS_POSIXACL; 645 sb->s_flags |= MS_POSIXACL;
604#endif 646#endif
605 647
606 tree_root = open_ctree(sb, fs_devices, (char *)data); 648 err = open_ctree(sb, fs_devices, (char *)data);
607 649 if (err) {
608 if (IS_ERR(tree_root)) {
609 printk("btrfs: open_ctree failed\n"); 650 printk("btrfs: open_ctree failed\n");
610 return PTR_ERR(tree_root); 651 return err;
611 } 652 }
612 sb->s_fs_info = tree_root;
613 653
614 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 654 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
615 key.type = BTRFS_INODE_ITEM_KEY; 655 key.type = BTRFS_INODE_ITEM_KEY;
616 key.offset = 0; 656 key.offset = 0;
617 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); 657 inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
618 if (IS_ERR(inode)) { 658 if (IS_ERR(inode)) {
619 err = PTR_ERR(inode); 659 err = PTR_ERR(inode);
620 goto fail_close; 660 goto fail_close;
@@ -631,23 +671,25 @@ static int btrfs_fill_super(struct super_block *sb,
631 671
632 save_mount_options(sb, data); 672 save_mount_options(sb, data);
633 cleancache_init_fs(sb); 673 cleancache_init_fs(sb);
674 sb->s_flags |= MS_ACTIVE;
634 return 0; 675 return 0;
635 676
636fail_close: 677fail_close:
637 close_ctree(tree_root); 678 close_ctree(fs_info->tree_root);
638 return err; 679 return err;
639} 680}
640 681
641int btrfs_sync_fs(struct super_block *sb, int wait) 682int btrfs_sync_fs(struct super_block *sb, int wait)
642{ 683{
643 struct btrfs_trans_handle *trans; 684 struct btrfs_trans_handle *trans;
644 struct btrfs_root *root = btrfs_sb(sb); 685 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
686 struct btrfs_root *root = fs_info->tree_root;
645 int ret; 687 int ret;
646 688
647 trace_btrfs_sync_fs(wait); 689 trace_btrfs_sync_fs(wait);
648 690
649 if (!wait) { 691 if (!wait) {
650 filemap_flush(root->fs_info->btree_inode->i_mapping); 692 filemap_flush(fs_info->btree_inode->i_mapping);
651 return 0; 693 return 0;
652 } 694 }
653 695
@@ -663,8 +705,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
663 705
664static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) 706static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
665{ 707{
666 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 708 struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
667 struct btrfs_fs_info *info = root->fs_info; 709 struct btrfs_root *root = info->tree_root;
668 char *compress_type; 710 char *compress_type;
669 711
670 if (btrfs_test_opt(root, DEGRADED)) 712 if (btrfs_test_opt(root, DEGRADED))
@@ -722,28 +764,25 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
722 seq_puts(seq, ",autodefrag"); 764 seq_puts(seq, ",autodefrag");
723 if (btrfs_test_opt(root, INODE_MAP_CACHE)) 765 if (btrfs_test_opt(root, INODE_MAP_CACHE))
724 seq_puts(seq, ",inode_cache"); 766 seq_puts(seq, ",inode_cache");
767 if (btrfs_test_opt(root, SKIP_BALANCE))
768 seq_puts(seq, ",skip_balance");
725 return 0; 769 return 0;
726} 770}
727 771
728static int btrfs_test_super(struct super_block *s, void *data) 772static int btrfs_test_super(struct super_block *s, void *data)
729{ 773{
730 struct btrfs_root *test_root = data; 774 struct btrfs_fs_info *p = data;
731 struct btrfs_root *root = btrfs_sb(s); 775 struct btrfs_fs_info *fs_info = btrfs_sb(s);
732 776
733 /* 777 return fs_info->fs_devices == p->fs_devices;
734 * If this super block is going away, return false as it
735 * can't match as an existing super block.
736 */
737 if (!atomic_read(&s->s_active))
738 return 0;
739 return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
740} 778}
741 779
742static int btrfs_set_super(struct super_block *s, void *data) 780static int btrfs_set_super(struct super_block *s, void *data)
743{ 781{
744 s->s_fs_info = data; 782 int err = set_anon_super(s, data);
745 783 if (!err)
746 return set_anon_super(s, data); 784 s->s_fs_info = data;
785 return err;
747} 786}
748 787
749/* 788/*
@@ -903,12 +942,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
903 if (!fs_info) 942 if (!fs_info)
904 return ERR_PTR(-ENOMEM); 943 return ERR_PTR(-ENOMEM);
905 944
906 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
907 if (!fs_info->tree_root) {
908 error = -ENOMEM;
909 goto error_fs_info;
910 }
911 fs_info->tree_root->fs_info = fs_info;
912 fs_info->fs_devices = fs_devices; 945 fs_info->fs_devices = fs_devices;
913 946
914 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 947 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
@@ -928,43 +961,30 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
928 } 961 }
929 962
930 bdev = fs_devices->latest_bdev; 963 bdev = fs_devices->latest_bdev;
931 s = sget(fs_type, btrfs_test_super, btrfs_set_super, 964 s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
932 fs_info->tree_root);
933 if (IS_ERR(s)) { 965 if (IS_ERR(s)) {
934 error = PTR_ERR(s); 966 error = PTR_ERR(s);
935 goto error_close_devices; 967 goto error_close_devices;
936 } 968 }
937 969
938 if (s->s_root) { 970 if (s->s_root) {
939 if ((flags ^ s->s_flags) & MS_RDONLY) {
940 deactivate_locked_super(s);
941 error = -EBUSY;
942 goto error_close_devices;
943 }
944
945 btrfs_close_devices(fs_devices); 971 btrfs_close_devices(fs_devices);
946 free_fs_info(fs_info); 972 free_fs_info(fs_info);
973 if ((flags ^ s->s_flags) & MS_RDONLY)
974 error = -EBUSY;
947 } else { 975 } else {
948 char b[BDEVNAME_SIZE]; 976 char b[BDEVNAME_SIZE];
949 977
950 s->s_flags = flags | MS_NOSEC; 978 s->s_flags = flags | MS_NOSEC;
951 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 979 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
952 btrfs_sb(s)->fs_info->bdev_holder = fs_type; 980 btrfs_sb(s)->bdev_holder = fs_type;
953 error = btrfs_fill_super(s, fs_devices, data, 981 error = btrfs_fill_super(s, fs_devices, data,
954 flags & MS_SILENT ? 1 : 0); 982 flags & MS_SILENT ? 1 : 0);
955 if (error) {
956 deactivate_locked_super(s);
957 return ERR_PTR(error);
958 }
959
960 s->s_flags |= MS_ACTIVE;
961 } 983 }
962 984
963 root = get_default_root(s, subvol_objectid); 985 root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
964 if (IS_ERR(root)) { 986 if (IS_ERR(root))
965 deactivate_locked_super(s); 987 deactivate_locked_super(s);
966 return root;
967 }
968 988
969 return root; 989 return root;
970 990
@@ -977,7 +997,8 @@ error_fs_info:
977 997
978static int btrfs_remount(struct super_block *sb, int *flags, char *data) 998static int btrfs_remount(struct super_block *sb, int *flags, char *data)
979{ 999{
980 struct btrfs_root *root = btrfs_sb(sb); 1000 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1001 struct btrfs_root *root = fs_info->tree_root;
981 int ret; 1002 int ret;
982 1003
983 ret = btrfs_parse_options(root, data); 1004 ret = btrfs_parse_options(root, data);
@@ -993,13 +1014,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
993 ret = btrfs_commit_super(root); 1014 ret = btrfs_commit_super(root);
994 WARN_ON(ret); 1015 WARN_ON(ret);
995 } else { 1016 } else {
996 if (root->fs_info->fs_devices->rw_devices == 0) 1017 if (fs_info->fs_devices->rw_devices == 0)
997 return -EACCES; 1018 return -EACCES;
998 1019
999 if (btrfs_super_log_root(root->fs_info->super_copy) != 0) 1020 if (btrfs_super_log_root(fs_info->super_copy) != 0)
1000 return -EINVAL; 1021 return -EINVAL;
1001 1022
1002 ret = btrfs_cleanup_fs_roots(root->fs_info); 1023 ret = btrfs_cleanup_fs_roots(fs_info);
1003 WARN_ON(ret); 1024 WARN_ON(ret);
1004 1025
1005 /* recover relocation */ 1026 /* recover relocation */
@@ -1168,18 +1189,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1168 1189
1169static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1190static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1170{ 1191{
1171 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1192 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
1172 struct btrfs_super_block *disk_super = root->fs_info->super_copy; 1193 struct btrfs_super_block *disk_super = fs_info->super_copy;
1173 struct list_head *head = &root->fs_info->space_info; 1194 struct list_head *head = &fs_info->space_info;
1174 struct btrfs_space_info *found; 1195 struct btrfs_space_info *found;
1175 u64 total_used = 0; 1196 u64 total_used = 0;
1176 u64 total_free_data = 0; 1197 u64 total_free_data = 0;
1177 int bits = dentry->d_sb->s_blocksize_bits; 1198 int bits = dentry->d_sb->s_blocksize_bits;
1178 __be32 *fsid = (__be32 *)root->fs_info->fsid; 1199 __be32 *fsid = (__be32 *)fs_info->fsid;
1179 int ret; 1200 int ret;
1180 1201
1181 /* holding chunk_muext to avoid allocating new chunks */ 1202 /* holding chunk_muext to avoid allocating new chunks */
1182 mutex_lock(&root->fs_info->chunk_mutex); 1203 mutex_lock(&fs_info->chunk_mutex);
1183 rcu_read_lock(); 1204 rcu_read_lock();
1184 list_for_each_entry_rcu(found, head, list) { 1205 list_for_each_entry_rcu(found, head, list) {
1185 if (found->flags & BTRFS_BLOCK_GROUP_DATA) { 1206 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1198,14 +1219,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1198 buf->f_bsize = dentry->d_sb->s_blocksize; 1219 buf->f_bsize = dentry->d_sb->s_blocksize;
1199 buf->f_type = BTRFS_SUPER_MAGIC; 1220 buf->f_type = BTRFS_SUPER_MAGIC;
1200 buf->f_bavail = total_free_data; 1221 buf->f_bavail = total_free_data;
1201 ret = btrfs_calc_avail_data_space(root, &total_free_data); 1222 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1202 if (ret) { 1223 if (ret) {
1203 mutex_unlock(&root->fs_info->chunk_mutex); 1224 mutex_unlock(&fs_info->chunk_mutex);
1204 return ret; 1225 return ret;
1205 } 1226 }
1206 buf->f_bavail += total_free_data; 1227 buf->f_bavail += total_free_data;
1207 buf->f_bavail = buf->f_bavail >> bits; 1228 buf->f_bavail = buf->f_bavail >> bits;
1208 mutex_unlock(&root->fs_info->chunk_mutex); 1229 mutex_unlock(&fs_info->chunk_mutex);
1209 1230
1210 /* We treat it as constant endianness (it doesn't matter _which_) 1231 /* We treat it as constant endianness (it doesn't matter _which_)
1211 because we want the fsid to come out the same whether mounted 1232 because we want the fsid to come out the same whether mounted
@@ -1219,11 +1240,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1219 return 0; 1240 return 0;
1220} 1241}
1221 1242
1243static void btrfs_kill_super(struct super_block *sb)
1244{
1245 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1246 kill_anon_super(sb);
1247 free_fs_info(fs_info);
1248}
1249
1222static struct file_system_type btrfs_fs_type = { 1250static struct file_system_type btrfs_fs_type = {
1223 .owner = THIS_MODULE, 1251 .owner = THIS_MODULE,
1224 .name = "btrfs", 1252 .name = "btrfs",
1225 .mount = btrfs_mount, 1253 .mount = btrfs_mount,
1226 .kill_sb = kill_anon_super, 1254 .kill_sb = btrfs_kill_super,
1227 .fs_flags = FS_REQUIRES_DEV, 1255 .fs_flags = FS_REQUIRES_DEV,
1228}; 1256};
1229 1257
@@ -1257,17 +1285,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1257 1285
1258static int btrfs_freeze(struct super_block *sb) 1286static int btrfs_freeze(struct super_block *sb)
1259{ 1287{
1260 struct btrfs_root *root = btrfs_sb(sb); 1288 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1261 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1289 mutex_lock(&fs_info->transaction_kthread_mutex);
1262 mutex_lock(&root->fs_info->cleaner_mutex); 1290 mutex_lock(&fs_info->cleaner_mutex);
1263 return 0; 1291 return 0;
1264} 1292}
1265 1293
1266static int btrfs_unfreeze(struct super_block *sb) 1294static int btrfs_unfreeze(struct super_block *sb)
1267{ 1295{
1268 struct btrfs_root *root = btrfs_sb(sb); 1296 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1269 mutex_unlock(&root->fs_info->cleaner_mutex); 1297 mutex_unlock(&fs_info->cleaner_mutex);
1270 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1298 mutex_unlock(&fs_info->transaction_kthread_mutex);
1271 return 0; 1299 return 0;
1272} 1300}
1273 1301
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 81376d94cd3c..287a6728b1ad 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
36 WARN_ON(atomic_read(&transaction->use_count) == 0); 36 WARN_ON(atomic_read(&transaction->use_count) == 0);
37 if (atomic_dec_and_test(&transaction->use_count)) { 37 if (atomic_dec_and_test(&transaction->use_count)) {
38 BUG_ON(!list_empty(&transaction->list)); 38 BUG_ON(!list_empty(&transaction->list));
39 WARN_ON(transaction->delayed_refs.root.rb_node);
40 WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
39 memset(transaction, 0, sizeof(*transaction)); 41 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction); 42 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 } 43 }
@@ -108,8 +110,11 @@ loop:
108 cur_trans->delayed_refs.num_heads = 0; 110 cur_trans->delayed_refs.num_heads = 0;
109 cur_trans->delayed_refs.flushing = 0; 111 cur_trans->delayed_refs.flushing = 0;
110 cur_trans->delayed_refs.run_delayed_start = 0; 112 cur_trans->delayed_refs.run_delayed_start = 0;
113 cur_trans->delayed_refs.seq = 1;
114 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
111 spin_lock_init(&cur_trans->commit_lock); 115 spin_lock_init(&cur_trans->commit_lock);
112 spin_lock_init(&cur_trans->delayed_refs.lock); 116 spin_lock_init(&cur_trans->delayed_refs.lock);
117 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
113 118
114 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 119 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
115 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 120 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
@@ -321,6 +326,8 @@ again:
321 } 326 }
322 327
323 if (num_bytes) { 328 if (num_bytes) {
329 trace_btrfs_space_reservation(root->fs_info, "transaction",
330 (u64)h, num_bytes, 1);
324 h->block_rsv = &root->fs_info->trans_block_rsv; 331 h->block_rsv = &root->fs_info->trans_block_rsv;
325 h->bytes_reserved = num_bytes; 332 h->bytes_reserved = num_bytes;
326 } 333 }
@@ -467,19 +474,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
467 474
468 btrfs_trans_release_metadata(trans, root); 475 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL; 476 trans->block_rsv = NULL;
470 while (count < 4) { 477 while (count < 2) {
471 unsigned long cur = trans->delayed_ref_updates; 478 unsigned long cur = trans->delayed_ref_updates;
472 trans->delayed_ref_updates = 0; 479 trans->delayed_ref_updates = 0;
473 if (cur && 480 if (cur &&
474 trans->transaction->delayed_refs.num_heads_ready > 64) { 481 trans->transaction->delayed_refs.num_heads_ready > 64) {
475 trans->delayed_ref_updates = 0; 482 trans->delayed_ref_updates = 0;
476
477 /*
478 * do a full flush if the transaction is trying
479 * to close
480 */
481 if (trans->transaction->delayed_refs.flushing)
482 cur = 0;
483 btrfs_run_delayed_refs(trans, root, cur); 483 btrfs_run_delayed_refs(trans, root, cur);
484 } else { 484 } else {
485 break; 485 break;
@@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1393 1393
1394 if (btrfs_header_backref_rev(root->node) < 1394 if (btrfs_header_backref_rev(root->node) <
1395 BTRFS_MIXED_BACKREF_REV) 1395 BTRFS_MIXED_BACKREF_REV)
1396 btrfs_drop_snapshot(root, NULL, 0); 1396 btrfs_drop_snapshot(root, NULL, 0, 0);
1397 else 1397 else
1398 btrfs_drop_snapshot(root, NULL, 1); 1398 btrfs_drop_snapshot(root, NULL, 1, 0);
1399 } 1399 }
1400 return 0; 1400 return 0;
1401} 1401}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3568374d419d..cb877e0886a7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
589 ret = btrfs_inc_extent_ref(trans, root, 589 ret = btrfs_inc_extent_ref(trans, root,
590 ins.objectid, ins.offset, 590 ins.objectid, ins.offset,
591 0, root->root_key.objectid, 591 0, root->root_key.objectid,
592 key->objectid, offset); 592 key->objectid, offset, 0);
593 BUG_ON(ret); 593 BUG_ON(ret);
594 } else { 594 } else {
595 /* 595 /*
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644
index 000000000000..12f5147bd2b1
--- /dev/null
+++ b/fs/btrfs/ulist.c
@@ -0,0 +1,220 @@
1/*
2 * Copyright (C) 2011 STRATO AG
3 * written by Arne Jansen <sensille@gmx.net>
4 * Distributed under the GNU GPL license version 2.
5 */
6
7#include <linux/slab.h>
8#include <linux/module.h>
9#include "ulist.h"
10
11/*
12 * ulist is a generic data structure to hold a collection of unique u64
13 * values. The only operations it supports are adding to the list and
14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key.
16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 *
21 * A sample usage for ulists is the enumeration of directed graphs without
22 * visiting a node twice. The pseudo-code could look like this:
23 *
24 * ulist = ulist_alloc();
25 * ulist_add(ulist, root);
26 * elem = NULL;
27 *
28 * while ((elem = ulist_next(ulist, elem))) {
29 * for (all child nodes n in elem)
30 * ulist_add(ulist, n);
31 * do something useful with the node;
32 * }
33 * ulist_free(ulist);
34 *
35 * This assumes the graph nodes are addressable by u64. This stems from the
36 * usage for tree enumeration in btrfs, where the logical addresses are
37 * 64 bit.
38 *
39 * It is also useful for tree enumeration, which could otherwise be done
40 * elegantly with recursion but is ruled out by kernel stack limitations. The
41 * loop would be similar to the above.
42 */
43
44/**
45 * ulist_init - freshly initialize a ulist
46 * @ulist: the ulist to initialize
47 *
48 * Note: don't use this function to init an already used ulist, use
49 * ulist_reinit instead.
50 */
51void ulist_init(struct ulist *ulist)
52{
53 ulist->nnodes = 0;
54 ulist->nodes = ulist->int_nodes;
55 ulist->nodes_alloced = ULIST_SIZE;
56}
57EXPORT_SYMBOL(ulist_init);
58
59/**
60 * ulist_fini - free up additionally allocated memory for the ulist
61 * @ulist: the ulist from which to free the additional memory
62 *
63 * This is useful in cases where the base 'struct ulist' has been statically
64 * allocated.
65 */
66void ulist_fini(struct ulist *ulist)
67{
68 /*
69 * The first ULIST_SIZE elements are stored inline in struct ulist.
70 * Only if more elements were allocated do they need to be freed.
71 */
72 if (ulist->nodes_alloced > ULIST_SIZE)
73 kfree(ulist->nodes);
74 ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */
75}
76EXPORT_SYMBOL(ulist_fini);
77
78/**
79 * ulist_reinit - prepare a ulist for reuse
80 * @ulist: ulist to be reused
81 *
82 * Free up all additional memory allocated for the list elements and reinit
83 * the ulist.
84 */
85void ulist_reinit(struct ulist *ulist)
86{
87 ulist_fini(ulist);
88 ulist_init(ulist);
89}
90EXPORT_SYMBOL(ulist_reinit);
91
92/**
93 * ulist_alloc - dynamically allocate a ulist
94 * @gfp_mask: allocation flags for the base allocation
95 *
96 * The allocated ulist will be returned in an initialized state.
97 */
98struct ulist *ulist_alloc(unsigned long gfp_mask)
99{
100 struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
101
102 if (!ulist)
103 return NULL;
104
105 ulist_init(ulist);
106
107 return ulist;
108}
109EXPORT_SYMBOL(ulist_alloc);
110
111/**
112 * ulist_free - free dynamically allocated ulist
113 * @ulist: ulist to free
114 *
115 * It is not necessary to call ulist_fini before.
116 */
117void ulist_free(struct ulist *ulist)
118{
119 if (!ulist)
120 return;
121 ulist_fini(ulist);
122 kfree(ulist);
123}
124EXPORT_SYMBOL(ulist_free);
125
126/**
127 * ulist_add - add an element to the ulist
128 * @ulist: ulist to add the element to
129 * @val: value to add to ulist
130 * @aux: auxiliary value to store along with val
131 * @gfp_mask: flags to use for allocation
132 *
133 * Note: locking must be provided by the caller. In case of rwlocks write
134 * locking is needed
135 *
136 * Add an element to a ulist. The @val will only be added if it doesn't
137 * already exist. If it is added, the auxiliary value @aux is stored along with
138 * it. In case @val already exists in the ulist, @aux is ignored, even if
139 * it differs from the already stored value.
140 *
141 * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
142 * inserted.
143 * In case of allocation failure -ENOMEM is returned and the ulist stays
144 * unaltered.
145 */
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
147 unsigned long gfp_mask)
148{
149 int i;
150
151 for (i = 0; i < ulist->nnodes; ++i) {
152 if (ulist->nodes[i].val == val)
153 return 0;
154 }
155
156 if (ulist->nnodes >= ulist->nodes_alloced) {
157 u64 new_alloced = ulist->nodes_alloced + 128;
158 struct ulist_node *new_nodes;
159 void *old = NULL;
160
161 /*
162 * if nodes_alloced == ULIST_SIZE no memory has been allocated
163 * yet, so pass NULL to krealloc
164 */
165 if (ulist->nodes_alloced > ULIST_SIZE)
166 old = ulist->nodes;
167
168 new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced,
169 gfp_mask);
170 if (!new_nodes)
171 return -ENOMEM;
172
173 if (!old)
174 memcpy(new_nodes, ulist->int_nodes,
175 sizeof(ulist->int_nodes));
176
177 ulist->nodes = new_nodes;
178 ulist->nodes_alloced = new_alloced;
179 }
180 ulist->nodes[ulist->nnodes].val = val;
181 ulist->nodes[ulist->nnodes].aux = aux;
182 ++ulist->nnodes;
183
184 return 1;
185}
186EXPORT_SYMBOL(ulist_add);
187
188/**
189 * ulist_next - iterate ulist
190 * @ulist: ulist to iterate
191 * @prev: previously returned element or %NULL to start iteration
192 *
193 * Note: locking must be provided by the caller. In case of rwlocks only read
194 * locking is needed
195 *
196 * This function is used to iterate an ulist. The iteration is started with
197 * @prev = %NULL. It returns the next element from the ulist or %NULL when the
198 * end is reached. No guarantee is made with respect to the order in which
199 * the elements are returned. They might neither be returned in order of
200 * addition nor in ascending order.
201 * It is allowed to call ulist_add during an enumeration. Newly added items
202 * are guaranteed to show up in the running enumeration.
203 */
204struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
205{
206 int next;
207
208 if (ulist->nnodes == 0)
209 return NULL;
210
211 if (!prev)
212 return &ulist->nodes[0];
213
214 next = (prev - ulist->nodes) + 1;
215 if (next < 0 || next >= ulist->nnodes)
216 return NULL;
217
218 return &ulist->nodes[next];
219}
220EXPORT_SYMBOL(ulist_next);
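Taken together, the ulist API above supports the enumeration pattern described in the file header. The following is a minimal sketch of a caller, written against the declarations in ulist.h; the graph_children() helper and the node addresses are hypothetical, and locking is left to the caller as the function comments require. It is an illustration of the documented usage, not code from this patch set.

#include <linux/slab.h>
#include "ulist.h"

/* hypothetical helper: returns the i-th child address of @node, 0 if none */
extern u64 graph_children(u64 node, int i);

static int walk_graph(u64 root_node)
{
	struct ulist *seen;
	struct ulist_node *elem = NULL;
	int ret;

	seen = ulist_alloc(GFP_NOFS);
	if (!seen)
		return -ENOMEM;

	ret = ulist_add(seen, root_node, 0, GFP_NOFS);
	if (ret < 0)
		goto out;

	/* items added during the enumeration show up later in the same loop */
	while ((elem = ulist_next(seen, elem))) {
		int i = 0;
		u64 child;

		while ((child = graph_children(elem->val, i++))) {
			/* ulist_add returns 0 if the child was already seen */
			ret = ulist_add(seen, child, 0, GFP_NOFS);
			if (ret < 0)
				goto out;
		}
		/* do something useful with elem->val here */
	}
	ret = 0;
out:
	ulist_free(seen);
	return ret;
}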
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644
index 000000000000..2e25dec58ec0
--- /dev/null
+++ b/fs/btrfs/ulist.h
@@ -0,0 +1,68 @@
1/*
2 * Copyright (C) 2011 STRATO AG
3 * written by Arne Jansen <sensille@gmx.net>
4 * Distributed under the GNU GPL license version 2.
5 *
6 */
7
8#ifndef __ULIST__
9#define __ULIST__
10
11/*
12 * ulist is a generic data structure to hold a collection of unique u64
13 * values. The only operations it supports are adding to the list and
14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key.
16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 */
21
22/*
23 * number of elements statically allocated inside struct ulist
24 */
25#define ULIST_SIZE 16
26
27/*
28 * element of the list
29 */
30struct ulist_node {
31 u64 val; /* value to store */
32 unsigned long aux; /* auxiliary value saved along with the val */
33};
34
35struct ulist {
36 /*
37 * number of elements stored in list
38 */
39 unsigned long nnodes;
40
41 /*
42 * number of nodes we already have room for
43 */
44 unsigned long nodes_alloced;
45
46 /*
47 * pointer to the array storing the elements. The first ULIST_SIZE
48 * elements are stored inline. In this case it points to int_nodes.
49 * After exceeding ULIST_SIZE, dynamic memory is allocated.
50 */
51 struct ulist_node *nodes;
52
53 /*
54 * inline storage space for the first ULIST_SIZE entries
55 */
56 struct ulist_node int_nodes[ULIST_SIZE];
57};
58
59void ulist_init(struct ulist *ulist);
60void ulist_fini(struct ulist *ulist);
61void ulist_reinit(struct ulist *ulist);
62struct ulist *ulist_alloc(unsigned long gfp_mask);
63void ulist_free(struct ulist *ulist);
64int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
65 unsigned long gfp_mask);
66struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
67
68#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4b839fd3c9d..0b4e2af7954d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/kthread.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27#include "compat.h" 28#include "compat.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -32,6 +33,7 @@
32#include "print-tree.h" 33#include "print-tree.h"
33#include "volumes.h" 34#include "volumes.h"
34#include "async-thread.h" 35#include "async-thread.h"
36#include "check-integrity.h"
35 37
36static int init_first_rw_device(struct btrfs_trans_handle *trans, 38static int init_first_rw_device(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 39 struct btrfs_root *root,
@@ -246,7 +248,7 @@ loop_lock:
246 sync_pending = 0; 248 sync_pending = 0;
247 } 249 }
248 250
249 submit_bio(cur->bi_rw, cur); 251 btrfsic_submit_bio(cur->bi_rw, cur);
250 num_run++; 252 num_run++;
251 batch_run++; 253 batch_run++;
252 if (need_resched()) 254 if (need_resched())
@@ -706,8 +708,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
706 u64 devid; 708 u64 devid;
707 u64 transid; 709 u64 transid;
708 710
709 mutex_lock(&uuid_mutex);
710
711 flags |= FMODE_EXCL; 711 flags |= FMODE_EXCL;
712 bdev = blkdev_get_by_path(path, flags, holder); 712 bdev = blkdev_get_by_path(path, flags, holder);
713 713
@@ -716,6 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
716 goto error; 716 goto error;
717 } 717 }
718 718
719 mutex_lock(&uuid_mutex);
719 ret = set_blocksize(bdev, 4096); 720 ret = set_blocksize(bdev, 4096);
720 if (ret) 721 if (ret)
721 goto error_close; 722 goto error_close;
@@ -737,9 +738,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
737 738
738 brelse(bh); 739 brelse(bh);
739error_close: 740error_close:
741 mutex_unlock(&uuid_mutex);
740 blkdev_put(bdev, flags); 742 blkdev_put(bdev, flags);
741error: 743error:
742 mutex_unlock(&uuid_mutex);
743 return ret; 744 return ret;
744} 745}
745 746
@@ -829,7 +830,6 @@ out:
829 830
830/* 831/*
831 * find_free_dev_extent - find free space in the specified device 832 * find_free_dev_extent - find free space in the specified device
832 * @trans: transaction handler
833 * @device: the device which we search the free space in 833 * @device: the device which we search the free space in
834 * @num_bytes: the size of the free space that we need 834 * @num_bytes: the size of the free space that we need
835 * @start: store the start of the free space. 835 * @start: store the start of the free space.
@@ -848,8 +848,7 @@ out:
848 * But if we don't find suitable free space, it is used to store the size of 848 * But if we don't find suitable free space, it is used to store the size of
849 * the max free space. 849 * the max free space.
850 */ 850 */
851int find_free_dev_extent(struct btrfs_trans_handle *trans, 851int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
852 struct btrfs_device *device, u64 num_bytes,
853 u64 *start, u64 *len) 852 u64 *start, u64 *len)
854{ 853{
855 struct btrfs_key key; 854 struct btrfs_key key;
@@ -893,7 +892,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
893 key.offset = search_start; 892 key.offset = search_start;
894 key.type = BTRFS_DEV_EXTENT_KEY; 893 key.type = BTRFS_DEV_EXTENT_KEY;
895 894
896 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 895 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
897 if (ret < 0) 896 if (ret < 0)
898 goto out; 897 goto out;
899 if (ret > 0) { 898 if (ret > 0) {
@@ -1282,7 +1281,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1282 bool clear_super = false; 1281 bool clear_super = false;
1283 1282
1284 mutex_lock(&uuid_mutex); 1283 mutex_lock(&uuid_mutex);
1285 mutex_lock(&root->fs_info->volume_mutex);
1286 1284
1287 all_avail = root->fs_info->avail_data_alloc_bits | 1285 all_avail = root->fs_info->avail_data_alloc_bits |
1288 root->fs_info->avail_system_alloc_bits | 1286 root->fs_info->avail_system_alloc_bits |
@@ -1452,7 +1450,6 @@ error_close:
1452 if (bdev) 1450 if (bdev)
1453 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1451 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1454out: 1452out:
1455 mutex_unlock(&root->fs_info->volume_mutex);
1456 mutex_unlock(&uuid_mutex); 1453 mutex_unlock(&uuid_mutex);
1457 return ret; 1454 return ret;
1458error_undo: 1455error_undo:
@@ -1469,8 +1466,7 @@ error_undo:
1469/* 1466/*
1470 * does all the dirty work required for changing file system's UUID. 1467 * does all the dirty work required for changing file system's UUID.
1471 */ 1468 */
1472static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, 1469static int btrfs_prepare_sprout(struct btrfs_root *root)
1473 struct btrfs_root *root)
1474{ 1470{
1475 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1471 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1476 struct btrfs_fs_devices *old_devices; 1472 struct btrfs_fs_devices *old_devices;
@@ -1629,7 +1625,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1629 } 1625 }
1630 1626
1631 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1627 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1632 mutex_lock(&root->fs_info->volume_mutex);
1633 1628
1634 devices = &root->fs_info->fs_devices->devices; 1629 devices = &root->fs_info->fs_devices->devices;
1635 /* 1630 /*
@@ -1695,7 +1690,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1695 1690
1696 if (seeding_dev) { 1691 if (seeding_dev) {
1697 sb->s_flags &= ~MS_RDONLY; 1692 sb->s_flags &= ~MS_RDONLY;
1698 ret = btrfs_prepare_sprout(trans, root); 1693 ret = btrfs_prepare_sprout(root);
1699 BUG_ON(ret); 1694 BUG_ON(ret);
1700 } 1695 }
1701 1696
@@ -1757,8 +1752,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1757 ret = btrfs_relocate_sys_chunks(root); 1752 ret = btrfs_relocate_sys_chunks(root);
1758 BUG_ON(ret); 1753 BUG_ON(ret);
1759 } 1754 }
1760out: 1755
1761 mutex_unlock(&root->fs_info->volume_mutex);
1762 return ret; 1756 return ret;
1763error: 1757error:
1764 blkdev_put(bdev, FMODE_EXCL); 1758 blkdev_put(bdev, FMODE_EXCL);
@@ -1766,7 +1760,7 @@ error:
1766 mutex_unlock(&uuid_mutex); 1760 mutex_unlock(&uuid_mutex);
1767 up_write(&sb->s_umount); 1761 up_write(&sb->s_umount);
1768 } 1762 }
1769 goto out; 1763 return ret;
1770} 1764}
1771 1765
1772static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1766static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2077,6 +2071,362 @@ error:
2077 return ret; 2071 return ret;
2078} 2072}
2079 2073
2074static int insert_balance_item(struct btrfs_root *root,
2075 struct btrfs_balance_control *bctl)
2076{
2077 struct btrfs_trans_handle *trans;
2078 struct btrfs_balance_item *item;
2079 struct btrfs_disk_balance_args disk_bargs;
2080 struct btrfs_path *path;
2081 struct extent_buffer *leaf;
2082 struct btrfs_key key;
2083 int ret, err;
2084
2085 path = btrfs_alloc_path();
2086 if (!path)
2087 return -ENOMEM;
2088
2089 trans = btrfs_start_transaction(root, 0);
2090 if (IS_ERR(trans)) {
2091 btrfs_free_path(path);
2092 return PTR_ERR(trans);
2093 }
2094
2095 key.objectid = BTRFS_BALANCE_OBJECTID;
2096 key.type = BTRFS_BALANCE_ITEM_KEY;
2097 key.offset = 0;
2098
2099 ret = btrfs_insert_empty_item(trans, root, path, &key,
2100 sizeof(*item));
2101 if (ret)
2102 goto out;
2103
2104 leaf = path->nodes[0];
2105 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2106
2107 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2108
2109 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2110 btrfs_set_balance_data(leaf, item, &disk_bargs);
2111 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2112 btrfs_set_balance_meta(leaf, item, &disk_bargs);
2113 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2114 btrfs_set_balance_sys(leaf, item, &disk_bargs);
2115
2116 btrfs_set_balance_flags(leaf, item, bctl->flags);
2117
2118 btrfs_mark_buffer_dirty(leaf);
2119out:
2120 btrfs_free_path(path);
2121 err = btrfs_commit_transaction(trans, root);
2122 if (err && !ret)
2123 ret = err;
2124 return ret;
2125}
2126
2127static int del_balance_item(struct btrfs_root *root)
2128{
2129 struct btrfs_trans_handle *trans;
2130 struct btrfs_path *path;
2131 struct btrfs_key key;
2132 int ret, err;
2133
2134 path = btrfs_alloc_path();
2135 if (!path)
2136 return -ENOMEM;
2137
2138 trans = btrfs_start_transaction(root, 0);
2139 if (IS_ERR(trans)) {
2140 btrfs_free_path(path);
2141 return PTR_ERR(trans);
2142 }
2143
2144 key.objectid = BTRFS_BALANCE_OBJECTID;
2145 key.type = BTRFS_BALANCE_ITEM_KEY;
2146 key.offset = 0;
2147
2148 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2149 if (ret < 0)
2150 goto out;
2151 if (ret > 0) {
2152 ret = -ENOENT;
2153 goto out;
2154 }
2155
2156 ret = btrfs_del_item(trans, root, path);
2157out:
2158 btrfs_free_path(path);
2159 err = btrfs_commit_transaction(trans, root);
2160 if (err && !ret)
2161 ret = err;
2162 return ret;
2163}
2164
2165/*
2166 * This is a heuristic used to reduce the number of chunks balanced on
2167 * resume after balance was interrupted.
2168 */
2169static void update_balance_args(struct btrfs_balance_control *bctl)
2170{
2171 /*
2172 * Turn on soft mode for chunk types that were being converted.
2173 */
2174 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2175 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2176 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2177 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2178 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2179 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2180
2181 /*
2182 * Turn on usage filter if it is not already used. The idea is
2183 * that chunks that we have already balanced should be
2184 * reasonably full. Don't do it for chunks that are being
2185 * converted - that will keep us from relocating unconverted
2186 * (albeit full) chunks.
2187 */
2188 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2189 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2190 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2191 bctl->data.usage = 90;
2192 }
2193 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2194 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2195 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2196 bctl->sys.usage = 90;
2197 }
2198 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2199 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2200 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2201 bctl->meta.usage = 90;
2202 }
2203}
2204
2205/*
2206 * Should be called with both balance and volume mutexes held to
2207 * serialize other volume operations (add_dev/rm_dev/resize) with
2208 * restriper. Same goes for unset_balance_control.
2209 */
2210static void set_balance_control(struct btrfs_balance_control *bctl)
2211{
2212 struct btrfs_fs_info *fs_info = bctl->fs_info;
2213
2214 BUG_ON(fs_info->balance_ctl);
2215
2216 spin_lock(&fs_info->balance_lock);
2217 fs_info->balance_ctl = bctl;
2218 spin_unlock(&fs_info->balance_lock);
2219}
2220
2221static void unset_balance_control(struct btrfs_fs_info *fs_info)
2222{
2223 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2224
2225 BUG_ON(!fs_info->balance_ctl);
2226
2227 spin_lock(&fs_info->balance_lock);
2228 fs_info->balance_ctl = NULL;
2229 spin_unlock(&fs_info->balance_lock);
2230
2231 kfree(bctl);
2232}
2233
2234/*
2235 * Balance filters. Return 1 if chunk should be filtered out
2236 * (should not be balanced).
2237 */
2238static int chunk_profiles_filter(u64 chunk_profile,
2239 struct btrfs_balance_args *bargs)
2240{
2241 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2242
2243 if (chunk_profile == 0)
2244 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2245
2246 if (bargs->profiles & chunk_profile)
2247 return 0;
2248
2249 return 1;
2250}
2251
2252static u64 div_factor_fine(u64 num, int factor)
2253{
2254 if (factor <= 0)
2255 return 0;
2256 if (factor >= 100)
2257 return num;
2258
2259 num *= factor;
2260 do_div(num, 100);
2261 return num;
2262}
2263
2264static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2265 struct btrfs_balance_args *bargs)
2266{
2267 struct btrfs_block_group_cache *cache;
2268 u64 chunk_used, user_thresh;
2269 int ret = 1;
2270
2271 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2272 chunk_used = btrfs_block_group_used(&cache->item);
2273
2274 user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2275 if (chunk_used < user_thresh)
2276 ret = 0;
2277
2278 btrfs_put_block_group(cache);
2279 return ret;
2280}
2281
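The usage filter above comes down to simple integer arithmetic: div_factor_fine() computes bargs->usage percent of the chunk size, and the chunk stays eligible for relocation only while its used bytes are below that threshold. Below is a minimal userspace sketch of that calculation; the 1 GiB chunk, the 800 MiB usage and the 90 value are hypothetical, and this mirrors the arithmetic only, it is not the kernel code.

#include <stdio.h>
#include <stdint.h>

/* same arithmetic as div_factor_fine(): num * factor / 100 */
static uint64_t div_factor_fine(uint64_t num, int factor)
{
	if (factor <= 0)
		return 0;
	if (factor >= 100)
		return num;
	return num * factor / 100;
}

int main(void)
{
	/* hypothetical numbers: a 1 GiB chunk with 800 MiB used */
	uint64_t chunk_size = 1024ULL * 1024 * 1024;
	uint64_t chunk_used = 800ULL * 1024 * 1024;
	int usage = 90;		/* bargs->usage, a usage filter of 90 */
	uint64_t thresh = div_factor_fine(chunk_size, usage);

	/* chunk_usage_filter(): balance the chunk only while used < threshold */
	printf("threshold=%llu used=%llu -> %s\n",
	       (unsigned long long)thresh,
	       (unsigned long long)chunk_used,
	       chunk_used < thresh ? "balance this chunk" : "filter it out");
	return 0;
}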
2282static int chunk_devid_filter(struct extent_buffer *leaf,
2283 struct btrfs_chunk *chunk,
2284 struct btrfs_balance_args *bargs)
2285{
2286 struct btrfs_stripe *stripe;
2287 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2288 int i;
2289
2290 for (i = 0; i < num_stripes; i++) {
2291 stripe = btrfs_stripe_nr(chunk, i);
2292 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2293 return 0;
2294 }
2295
2296 return 1;
2297}
2298
2299/* [pstart, pend) */
2300static int chunk_drange_filter(struct extent_buffer *leaf,
2301 struct btrfs_chunk *chunk,
2302 u64 chunk_offset,
2303 struct btrfs_balance_args *bargs)
2304{
2305 struct btrfs_stripe *stripe;
2306 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2307 u64 stripe_offset;
2308 u64 stripe_length;
2309 int factor;
2310 int i;
2311
2312 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2313 return 0;
2314
2315 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2316 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2317 factor = 2;
2318 else
2319 factor = 1;
2320 factor = num_stripes / factor;
2321
2322 for (i = 0; i < num_stripes; i++) {
2323 stripe = btrfs_stripe_nr(chunk, i);
2324 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2325 continue;
2326
2327 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2328 stripe_length = btrfs_chunk_length(leaf, chunk);
2329 do_div(stripe_length, factor);
2330
2331 if (stripe_offset < bargs->pend &&
2332 stripe_offset + stripe_length > bargs->pstart)
2333 return 0;
2334 }
2335
2336 return 1;
2337}
2338
2339/* [vstart, vend) */
2340static int chunk_vrange_filter(struct extent_buffer *leaf,
2341 struct btrfs_chunk *chunk,
2342 u64 chunk_offset,
2343 struct btrfs_balance_args *bargs)
2344{
2345 if (chunk_offset < bargs->vend &&
2346 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2347 /* at least part of the chunk is inside this vrange */
2348 return 0;
2349
2350 return 1;
2351}
2352
2353static int chunk_soft_convert_filter(u64 chunk_profile,
2354 struct btrfs_balance_args *bargs)
2355{
2356 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2357 return 0;
2358
2359 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2360
2361 if (chunk_profile == 0)
2362 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2363
2364 if (bargs->target & chunk_profile)
2365 return 1;
2366
2367 return 0;
2368}
2369
2370static int should_balance_chunk(struct btrfs_root *root,
2371 struct extent_buffer *leaf,
2372 struct btrfs_chunk *chunk, u64 chunk_offset)
2373{
2374 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2375 struct btrfs_balance_args *bargs = NULL;
2376 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2377
2378 /* type filter */
2379 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2380 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2381 return 0;
2382 }
2383
2384 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2385 bargs = &bctl->data;
2386 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2387 bargs = &bctl->sys;
2388 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2389 bargs = &bctl->meta;
2390
2391 /* profiles filter */
2392 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2393 chunk_profiles_filter(chunk_type, bargs)) {
2394 return 0;
2395 }
2396
2397 /* usage filter */
2398 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2399 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2400 return 0;
2401 }
2402
2403 /* devid filter */
2404 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2405 chunk_devid_filter(leaf, chunk, bargs)) {
2406 return 0;
2407 }
2408
2409 /* drange filter, makes sense only with devid filter */
2410 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2411 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2412 return 0;
2413 }
2414
2415 /* vrange filter */
2416 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2417 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2418 return 0;
2419 }
2420
2421 /* soft profile changing mode */
2422 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2423 chunk_soft_convert_filter(chunk_type, bargs)) {
2424 return 0;
2425 }
2426
2427 return 1;
2428}
2429
2080static u64 div_factor(u64 num, int factor) 2430static u64 div_factor(u64 num, int factor)
2081{ 2431{
2082 if (factor == 10) 2432 if (factor == 10)
@@ -2086,29 +2436,28 @@ static u64 div_factor(u64 num, int factor)
2086 return num; 2436 return num;
2087} 2437}
2088 2438
2089int btrfs_balance(struct btrfs_root *dev_root) 2439static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2090{ 2440{
2091 int ret; 2441 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2092 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 2442 struct btrfs_root *chunk_root = fs_info->chunk_root;
2443 struct btrfs_root *dev_root = fs_info->dev_root;
2444 struct list_head *devices;
2093 struct btrfs_device *device; 2445 struct btrfs_device *device;
2094 u64 old_size; 2446 u64 old_size;
2095 u64 size_to_free; 2447 u64 size_to_free;
2448 struct btrfs_chunk *chunk;
2096 struct btrfs_path *path; 2449 struct btrfs_path *path;
2097 struct btrfs_key key; 2450 struct btrfs_key key;
2098 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
2099 struct btrfs_trans_handle *trans;
2100 struct btrfs_key found_key; 2451 struct btrfs_key found_key;
2101 2452 struct btrfs_trans_handle *trans;
2102 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2453 struct extent_buffer *leaf;
2103 return -EROFS; 2454 int slot;
2104 2455 int ret;
2105 if (!capable(CAP_SYS_ADMIN)) 2456 int enospc_errors = 0;
2106 return -EPERM; 2457 bool counting = true;
2107
2108 mutex_lock(&dev_root->fs_info->volume_mutex);
2109 dev_root = dev_root->fs_info->dev_root;
2110 2458
2111 /* step one make some room on all the devices */ 2459 /* step one make some room on all the devices */
2460 devices = &fs_info->fs_devices->devices;
2112 list_for_each_entry(device, devices, dev_list) { 2461 list_for_each_entry(device, devices, dev_list) {
2113 old_size = device->total_bytes; 2462 old_size = device->total_bytes;
2114 size_to_free = div_factor(old_size, 1); 2463 size_to_free = div_factor(old_size, 1);
@@ -2137,11 +2486,23 @@ int btrfs_balance(struct btrfs_root *dev_root)
2137 ret = -ENOMEM; 2486 ret = -ENOMEM;
2138 goto error; 2487 goto error;
2139 } 2488 }
2489
2490 /* zero out stat counters */
2491 spin_lock(&fs_info->balance_lock);
2492 memset(&bctl->stat, 0, sizeof(bctl->stat));
2493 spin_unlock(&fs_info->balance_lock);
2494again:
2140 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2495 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2141 key.offset = (u64)-1; 2496 key.offset = (u64)-1;
2142 key.type = BTRFS_CHUNK_ITEM_KEY; 2497 key.type = BTRFS_CHUNK_ITEM_KEY;
2143 2498
2144 while (1) { 2499 while (1) {
2500 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2501 atomic_read(&fs_info->balance_cancel_req)) {
2502 ret = -ECANCELED;
2503 goto error;
2504 }
2505
2145 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2506 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2146 if (ret < 0) 2507 if (ret < 0)
2147 goto error; 2508 goto error;
@@ -2151,15 +2512,19 @@ int btrfs_balance(struct btrfs_root *dev_root)
2151 * failed 2512 * failed
2152 */ 2513 */
2153 if (ret == 0) 2514 if (ret == 0)
2154 break; 2515 BUG(); /* FIXME break ? */
2155 2516
2156 ret = btrfs_previous_item(chunk_root, path, 0, 2517 ret = btrfs_previous_item(chunk_root, path, 0,
2157 BTRFS_CHUNK_ITEM_KEY); 2518 BTRFS_CHUNK_ITEM_KEY);
2158 if (ret) 2519 if (ret) {
2520 ret = 0;
2159 break; 2521 break;
2522 }
2523
2524 leaf = path->nodes[0];
2525 slot = path->slots[0];
2526 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2160 2527
2161 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2162 path->slots[0]);
2163 if (found_key.objectid != key.objectid) 2528 if (found_key.objectid != key.objectid)
2164 break; 2529 break;
2165 2530
@@ -2167,22 +2532,375 @@ int btrfs_balance(struct btrfs_root *dev_root)
2167 if (found_key.offset == 0) 2532 if (found_key.offset == 0)
2168 break; 2533 break;
2169 2534
2535 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2536
2537 if (!counting) {
2538 spin_lock(&fs_info->balance_lock);
2539 bctl->stat.considered++;
2540 spin_unlock(&fs_info->balance_lock);
2541 }
2542
2543 ret = should_balance_chunk(chunk_root, leaf, chunk,
2544 found_key.offset);
2170 btrfs_release_path(path); 2545 btrfs_release_path(path);
2546 if (!ret)
2547 goto loop;
2548
2549 if (counting) {
2550 spin_lock(&fs_info->balance_lock);
2551 bctl->stat.expected++;
2552 spin_unlock(&fs_info->balance_lock);
2553 goto loop;
2554 }
2555
2171 ret = btrfs_relocate_chunk(chunk_root, 2556 ret = btrfs_relocate_chunk(chunk_root,
2172 chunk_root->root_key.objectid, 2557 chunk_root->root_key.objectid,
2173 found_key.objectid, 2558 found_key.objectid,
2174 found_key.offset); 2559 found_key.offset);
2175 if (ret && ret != -ENOSPC) 2560 if (ret && ret != -ENOSPC)
2176 goto error; 2561 goto error;
2562 if (ret == -ENOSPC) {
2563 enospc_errors++;
2564 } else {
2565 spin_lock(&fs_info->balance_lock);
2566 bctl->stat.completed++;
2567 spin_unlock(&fs_info->balance_lock);
2568 }
2569loop:
2177 key.offset = found_key.offset - 1; 2570 key.offset = found_key.offset - 1;
2178 } 2571 }
2179 ret = 0; 2572
2573 if (counting) {
2574 btrfs_release_path(path);
2575 counting = false;
2576 goto again;
2577 }
2180error: 2578error:
2181 btrfs_free_path(path); 2579 btrfs_free_path(path);
2182 mutex_unlock(&dev_root->fs_info->volume_mutex); 2580 if (enospc_errors) {
2581 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2582 enospc_errors);
2583 if (!ret)
2584 ret = -ENOSPC;
2585 }
2586
2183 return ret; 2587 return ret;
2184} 2588}
2185 2589
2590static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2591{
2592 /* cancel requested || normal exit path */
2593 return atomic_read(&fs_info->balance_cancel_req) ||
2594 (atomic_read(&fs_info->balance_pause_req) == 0 &&
2595 atomic_read(&fs_info->balance_cancel_req) == 0);
2596}
2597
2598static void __cancel_balance(struct btrfs_fs_info *fs_info)
2599{
2600 int ret;
2601
2602 unset_balance_control(fs_info);
2603 ret = del_balance_item(fs_info->tree_root);
2604 BUG_ON(ret);
2605}
2606
2607void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2608 struct btrfs_ioctl_balance_args *bargs);
2609
2610/*
2611 * Should be called with both balance and volume mutexes held
2612 */
2613int btrfs_balance(struct btrfs_balance_control *bctl,
2614 struct btrfs_ioctl_balance_args *bargs)
2615{
2616 struct btrfs_fs_info *fs_info = bctl->fs_info;
2617 u64 allowed;
2618 int ret;
2619
2620 if (btrfs_fs_closing(fs_info) ||
2621 atomic_read(&fs_info->balance_pause_req) ||
2622 atomic_read(&fs_info->balance_cancel_req)) {
2623 ret = -EINVAL;
2624 goto out;
2625 }
2626
2627 /*
2628 * In case of mixed groups both data and meta should be picked,
2629 * and identical options should be given for both of them.
2630 */
2631 allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2632 if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2633 (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
2634 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2635 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2636 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2637 printk(KERN_ERR "btrfs: with mixed groups data and "
2638 "metadata balance options must be the same\n");
2639 ret = -EINVAL;
2640 goto out;
2641 }
2642 }
2643
2644 /*
2645 * Profile changing sanity checks. Skip them if a simple
2646 * balance is requested.
2647 */
2648 if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
2649 BTRFS_BALANCE_ARGS_CONVERT))
2650 goto do_balance;
2651
2652 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2653 if (fs_info->fs_devices->num_devices == 1)
2654 allowed |= BTRFS_BLOCK_GROUP_DUP;
2655 else if (fs_info->fs_devices->num_devices < 4)
2656 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2657 else
2658 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2659 BTRFS_BLOCK_GROUP_RAID10);
2660
2661 if (!profile_is_valid(bctl->data.target, 1) ||
2662 bctl->data.target & ~allowed) {
2663 printk(KERN_ERR "btrfs: unable to start balance with target "
2664 "data profile %llu\n",
2665 (unsigned long long)bctl->data.target);
2666 ret = -EINVAL;
2667 goto out;
2668 }
2669 if (!profile_is_valid(bctl->meta.target, 1) ||
2670 bctl->meta.target & ~allowed) {
2671 printk(KERN_ERR "btrfs: unable to start balance with target "
2672 "metadata profile %llu\n",
2673 (unsigned long long)bctl->meta.target);
2674 ret = -EINVAL;
2675 goto out;
2676 }
2677 if (!profile_is_valid(bctl->sys.target, 1) ||
2678 bctl->sys.target & ~allowed) {
2679 printk(KERN_ERR "btrfs: unable to start balance with target "
2680 "system profile %llu\n",
2681 (unsigned long long)bctl->sys.target);
2682 ret = -EINVAL;
2683 goto out;
2684 }
2685
2686 if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
2687 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2688 ret = -EINVAL;
2689 goto out;
2690 }
2691
 2692 /* allow reducing meta or sys integrity only if force is set */
2693 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2694 BTRFS_BLOCK_GROUP_RAID10;
2695 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2696 (fs_info->avail_system_alloc_bits & allowed) &&
2697 !(bctl->sys.target & allowed)) ||
2698 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2699 (fs_info->avail_metadata_alloc_bits & allowed) &&
2700 !(bctl->meta.target & allowed))) {
2701 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2702 printk(KERN_INFO "btrfs: force reducing metadata "
2703 "integrity\n");
2704 } else {
2705 printk(KERN_ERR "btrfs: balance will reduce metadata "
2706 "integrity, use force if you want this\n");
2707 ret = -EINVAL;
2708 goto out;
2709 }
2710 }
2711
2712do_balance:
2713 ret = insert_balance_item(fs_info->tree_root, bctl);
2714 if (ret && ret != -EEXIST)
2715 goto out;
2716
2717 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2718 BUG_ON(ret == -EEXIST);
2719 set_balance_control(bctl);
2720 } else {
2721 BUG_ON(ret != -EEXIST);
2722 spin_lock(&fs_info->balance_lock);
2723 update_balance_args(bctl);
2724 spin_unlock(&fs_info->balance_lock);
2725 }
2726
2727 atomic_inc(&fs_info->balance_running);
2728 mutex_unlock(&fs_info->balance_mutex);
2729
2730 ret = __btrfs_balance(fs_info);
2731
2732 mutex_lock(&fs_info->balance_mutex);
2733 atomic_dec(&fs_info->balance_running);
2734
2735 if (bargs) {
2736 memset(bargs, 0, sizeof(*bargs));
2737 update_ioctl_balance_args(fs_info, 0, bargs);
2738 }
2739
2740 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2741 balance_need_close(fs_info)) {
2742 __cancel_balance(fs_info);
2743 }
2744
2745 wake_up(&fs_info->balance_wait_q);
2746
2747 return ret;
2748out:
2749 if (bctl->flags & BTRFS_BALANCE_RESUME)
2750 __cancel_balance(fs_info);
2751 else
2752 kfree(bctl);
2753 return ret;
2754}
2755
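A minimal usage sketch (not from this patch) of how a caller such as the balance ioctl path might drive the btrfs_balance() added above. start_simple_balance() is a hypothetical helper; everything it uses appears in the hunks above. btrfs_balance() appears to take over bctl (it either becomes fs_info->balance_ctl or is freed before return), so the caller does not free it.

	/* hypothetical caller: full balance of all chunk types, no filters */
	static int start_simple_balance(struct btrfs_fs_info *fs_info)
	{
		struct btrfs_balance_control *bctl;
		int ret;

		bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
		if (!bctl)
			return -ENOMEM;

		bctl->fs_info = fs_info;
		/* balance data, metadata and system chunks, no profile conversion */
		bctl->flags = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA |
			      BTRFS_BALANCE_SYSTEM;

		/* btrfs_balance() expects both mutexes held, per the comment above */
		mutex_lock(&fs_info->volume_mutex);
		mutex_lock(&fs_info->balance_mutex);

		ret = btrfs_balance(bctl, NULL);	/* bctl is consumed here */

		mutex_unlock(&fs_info->balance_mutex);
		mutex_unlock(&fs_info->volume_mutex);
		return ret;
	}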
2756static int balance_kthread(void *data)
2757{
2758 struct btrfs_balance_control *bctl =
2759 (struct btrfs_balance_control *)data;
2760 struct btrfs_fs_info *fs_info = bctl->fs_info;
2761 int ret = 0;
2762
2763 mutex_lock(&fs_info->volume_mutex);
2764 mutex_lock(&fs_info->balance_mutex);
2765
2766 set_balance_control(bctl);
2767
2768 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2769 printk(KERN_INFO "btrfs: force skipping balance\n");
2770 } else {
2771 printk(KERN_INFO "btrfs: continuing balance\n");
2772 ret = btrfs_balance(bctl, NULL);
2773 }
2774
2775 mutex_unlock(&fs_info->balance_mutex);
2776 mutex_unlock(&fs_info->volume_mutex);
2777 return ret;
2778}
2779
2780int btrfs_recover_balance(struct btrfs_root *tree_root)
2781{
2782 struct task_struct *tsk;
2783 struct btrfs_balance_control *bctl;
2784 struct btrfs_balance_item *item;
2785 struct btrfs_disk_balance_args disk_bargs;
2786 struct btrfs_path *path;
2787 struct extent_buffer *leaf;
2788 struct btrfs_key key;
2789 int ret;
2790
2791 path = btrfs_alloc_path();
2792 if (!path)
2793 return -ENOMEM;
2794
2795 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2796 if (!bctl) {
2797 ret = -ENOMEM;
2798 goto out;
2799 }
2800
2801 key.objectid = BTRFS_BALANCE_OBJECTID;
2802 key.type = BTRFS_BALANCE_ITEM_KEY;
2803 key.offset = 0;
2804
2805 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2806 if (ret < 0)
2807 goto out_bctl;
2808 if (ret > 0) { /* ret = -ENOENT; */
2809 ret = 0;
2810 goto out_bctl;
2811 }
2812
2813 leaf = path->nodes[0];
2814 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2815
2816 bctl->fs_info = tree_root->fs_info;
2817 bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
2818
2819 btrfs_balance_data(leaf, item, &disk_bargs);
2820 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2821 btrfs_balance_meta(leaf, item, &disk_bargs);
2822 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2823 btrfs_balance_sys(leaf, item, &disk_bargs);
2824 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2825
2826 tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
2827 if (IS_ERR(tsk))
2828 ret = PTR_ERR(tsk);
2829 else
2830 goto out;
2831
2832out_bctl:
2833 kfree(bctl);
2834out:
2835 btrfs_free_path(path);
2836 return ret;
2837}
2838
2839int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2840{
2841 int ret = 0;
2842
2843 mutex_lock(&fs_info->balance_mutex);
2844 if (!fs_info->balance_ctl) {
2845 mutex_unlock(&fs_info->balance_mutex);
2846 return -ENOTCONN;
2847 }
2848
2849 if (atomic_read(&fs_info->balance_running)) {
2850 atomic_inc(&fs_info->balance_pause_req);
2851 mutex_unlock(&fs_info->balance_mutex);
2852
2853 wait_event(fs_info->balance_wait_q,
2854 atomic_read(&fs_info->balance_running) == 0);
2855
2856 mutex_lock(&fs_info->balance_mutex);
2857 /* we are good with balance_ctl ripped off from under us */
2858 BUG_ON(atomic_read(&fs_info->balance_running));
2859 atomic_dec(&fs_info->balance_pause_req);
2860 } else {
2861 ret = -ENOTCONN;
2862 }
2863
2864 mutex_unlock(&fs_info->balance_mutex);
2865 return ret;
2866}
2867
2868int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2869{
2870 mutex_lock(&fs_info->balance_mutex);
2871 if (!fs_info->balance_ctl) {
2872 mutex_unlock(&fs_info->balance_mutex);
2873 return -ENOTCONN;
2874 }
2875
2876 atomic_inc(&fs_info->balance_cancel_req);
2877 /*
 2878 * if we are running, just wait and return; the balance item is
 2879 * deleted in btrfs_balance in this case
2880 */
2881 if (atomic_read(&fs_info->balance_running)) {
2882 mutex_unlock(&fs_info->balance_mutex);
2883 wait_event(fs_info->balance_wait_q,
2884 atomic_read(&fs_info->balance_running) == 0);
2885 mutex_lock(&fs_info->balance_mutex);
2886 } else {
2887 /* __cancel_balance needs volume_mutex */
2888 mutex_unlock(&fs_info->balance_mutex);
2889 mutex_lock(&fs_info->volume_mutex);
2890 mutex_lock(&fs_info->balance_mutex);
2891
2892 if (fs_info->balance_ctl)
2893 __cancel_balance(fs_info);
2894
2895 mutex_unlock(&fs_info->volume_mutex);
2896 }
2897
2898 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
2899 atomic_dec(&fs_info->balance_cancel_req);
2900 mutex_unlock(&fs_info->balance_mutex);
2901 return 0;
2902}
2903
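A second hypothetical caller, to show how the pause/cancel entry points above fit together: pausing leaves fs_info->balance_ctl and the on-disk balance item in place so the operation can be resumed later (for example by btrfs_recover_balance() after a remount), while cancelling tears both down. Both return -ENOTCONN when there is nothing to act on.

	/* hypothetical helper: stop a running balance, temporarily or for good */
	static int stop_balance(struct btrfs_fs_info *fs_info, int cancel)
	{
		int ret;

		ret = cancel ? btrfs_cancel_balance(fs_info)
			     : btrfs_pause_balance(fs_info);
		if (ret == -ENOTCONN)
			ret = 0;	/* nothing was running, or it was already paused */
		return ret;
	}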
2186/* 2904/*
2187 * shrinking a device means finding all of the device extents past 2905 * shrinking a device means finding all of the device extents past
2188 * the new size, and then following the back refs to the chunks. 2906 * the new size, and then following the back refs to the chunks.
@@ -2323,8 +3041,7 @@ done:
2323 return ret; 3041 return ret;
2324} 3042}
2325 3043
2326static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 3044static int btrfs_add_system_chunk(struct btrfs_root *root,
2327 struct btrfs_root *root,
2328 struct btrfs_key *key, 3045 struct btrfs_key *key,
2329 struct btrfs_chunk *chunk, int item_size) 3046 struct btrfs_chunk *chunk, int item_size)
2330{ 3047{
@@ -2441,10 +3158,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2441 max_stripe_size = 1024 * 1024 * 1024; 3158 max_stripe_size = 1024 * 1024 * 1024;
2442 max_chunk_size = 10 * max_stripe_size; 3159 max_chunk_size = 10 * max_stripe_size;
2443 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 3160 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2444 max_stripe_size = 256 * 1024 * 1024; 3161 /* for larger filesystems, use larger metadata chunks */
3162 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3163 max_stripe_size = 1024 * 1024 * 1024;
3164 else
3165 max_stripe_size = 256 * 1024 * 1024;
2445 max_chunk_size = max_stripe_size; 3166 max_chunk_size = max_stripe_size;
2446 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 3167 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2447 max_stripe_size = 8 * 1024 * 1024; 3168 max_stripe_size = 32 * 1024 * 1024;
2448 max_chunk_size = 2 * max_stripe_size; 3169 max_chunk_size = 2 * max_stripe_size;
2449 } else { 3170 } else {
2450 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 3171 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
@@ -2496,7 +3217,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2496 if (total_avail == 0) 3217 if (total_avail == 0)
2497 continue; 3218 continue;
2498 3219
2499 ret = find_free_dev_extent(trans, device, 3220 ret = find_free_dev_extent(device,
2500 max_stripe_size * dev_stripes, 3221 max_stripe_size * dev_stripes,
2501 &dev_offset, &max_avail); 3222 &dev_offset, &max_avail);
2502 if (ret && ret != -ENOSPC) 3223 if (ret && ret != -ENOSPC)
@@ -2687,7 +3408,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2687 BUG_ON(ret); 3408 BUG_ON(ret);
2688 3409
2689 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3410 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2690 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, 3411 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
2691 item_size); 3412 item_size);
2692 BUG_ON(ret); 3413 BUG_ON(ret);
2693 } 3414 }
@@ -2752,8 +3473,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2752 return ret; 3473 return ret;
2753 3474
2754 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 3475 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2755 (fs_info->metadata_alloc_profile & 3476 fs_info->avail_metadata_alloc_bits;
2756 fs_info->avail_metadata_alloc_bits);
2757 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3477 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2758 3478
2759 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 3479 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
@@ -2763,8 +3483,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2763 sys_chunk_offset = chunk_offset + chunk_size; 3483 sys_chunk_offset = chunk_offset + chunk_size;
2764 3484
2765 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 3485 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2766 (fs_info->system_alloc_profile & 3486 fs_info->avail_system_alloc_bits;
2767 fs_info->avail_system_alloc_bits);
2768 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3487 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2769 3488
2770 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3489 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@ -2901,26 +3620,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2901 u64 stripe_nr; 3620 u64 stripe_nr;
2902 u64 stripe_nr_orig; 3621 u64 stripe_nr_orig;
2903 u64 stripe_nr_end; 3622 u64 stripe_nr_end;
2904 int stripes_allocated = 8;
2905 int stripes_required = 1;
2906 int stripe_index; 3623 int stripe_index;
2907 int i; 3624 int i;
3625 int ret = 0;
2908 int num_stripes; 3626 int num_stripes;
2909 int max_errors = 0; 3627 int max_errors = 0;
2910 struct btrfs_bio *bbio = NULL; 3628 struct btrfs_bio *bbio = NULL;
2911 3629
2912 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2913 stripes_allocated = 1;
2914again:
2915 if (bbio_ret) {
2916 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2917 GFP_NOFS);
2918 if (!bbio)
2919 return -ENOMEM;
2920
2921 atomic_set(&bbio->error, 0);
2922 }
2923
2924 read_lock(&em_tree->lock); 3630 read_lock(&em_tree->lock);
2925 em = lookup_extent_mapping(em_tree, logical, *length); 3631 em = lookup_extent_mapping(em_tree, logical, *length);
2926 read_unlock(&em_tree->lock); 3632 read_unlock(&em_tree->lock);
@@ -2939,32 +3645,6 @@ again:
2939 if (mirror_num > map->num_stripes) 3645 if (mirror_num > map->num_stripes)
2940 mirror_num = 0; 3646 mirror_num = 0;
2941 3647
2942 /* if our btrfs_bio struct is too small, back off and try again */
2943 if (rw & REQ_WRITE) {
2944 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2945 BTRFS_BLOCK_GROUP_DUP)) {
2946 stripes_required = map->num_stripes;
2947 max_errors = 1;
2948 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2949 stripes_required = map->sub_stripes;
2950 max_errors = 1;
2951 }
2952 }
2953 if (rw & REQ_DISCARD) {
2954 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2955 BTRFS_BLOCK_GROUP_RAID1 |
2956 BTRFS_BLOCK_GROUP_DUP |
2957 BTRFS_BLOCK_GROUP_RAID10)) {
2958 stripes_required = map->num_stripes;
2959 }
2960 }
2961 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2962 stripes_allocated < stripes_required) {
2963 stripes_allocated = map->num_stripes;
2964 free_extent_map(em);
2965 kfree(bbio);
2966 goto again;
2967 }
2968 stripe_nr = offset; 3648 stripe_nr = offset;
2969 /* 3649 /*
2970 * stripe_nr counts the total number of stripes we have to stride 3650 * stripe_nr counts the total number of stripes we have to stride
@@ -2980,10 +3660,7 @@ again:
2980 3660
2981 if (rw & REQ_DISCARD) 3661 if (rw & REQ_DISCARD)
2982 *length = min_t(u64, em->len - offset, *length); 3662 *length = min_t(u64, em->len - offset, *length);
2983 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3663 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
2984 BTRFS_BLOCK_GROUP_RAID1 |
2985 BTRFS_BLOCK_GROUP_RAID10 |
2986 BTRFS_BLOCK_GROUP_DUP)) {
2987 /* we limit the length of each bio to what fits in a stripe */ 3664 /* we limit the length of each bio to what fits in a stripe */
2988 *length = min_t(u64, em->len - offset, 3665 *length = min_t(u64, em->len - offset,
2989 map->stripe_len - stripe_offset); 3666 map->stripe_len - stripe_offset);
@@ -3059,81 +3736,55 @@ again:
3059 } 3736 }
3060 BUG_ON(stripe_index >= map->num_stripes); 3737 BUG_ON(stripe_index >= map->num_stripes);
3061 3738
3739 bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
3740 if (!bbio) {
3741 ret = -ENOMEM;
3742 goto out;
3743 }
3744 atomic_set(&bbio->error, 0);
3745
3062 if (rw & REQ_DISCARD) { 3746 if (rw & REQ_DISCARD) {
3747 int factor = 0;
3748 int sub_stripes = 0;
3749 u64 stripes_per_dev = 0;
3750 u32 remaining_stripes = 0;
3751
3752 if (map->type &
3753 (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3754 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3755 sub_stripes = 1;
3756 else
3757 sub_stripes = map->sub_stripes;
3758
3759 factor = map->num_stripes / sub_stripes;
3760 stripes_per_dev = div_u64_rem(stripe_nr_end -
3761 stripe_nr_orig,
3762 factor,
3763 &remaining_stripes);
3764 }
3765
3063 for (i = 0; i < num_stripes; i++) { 3766 for (i = 0; i < num_stripes; i++) {
3064 bbio->stripes[i].physical = 3767 bbio->stripes[i].physical =
3065 map->stripes[stripe_index].physical + 3768 map->stripes[stripe_index].physical +
3066 stripe_offset + stripe_nr * map->stripe_len; 3769 stripe_offset + stripe_nr * map->stripe_len;
3067 bbio->stripes[i].dev = map->stripes[stripe_index].dev; 3770 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3068 3771
3069 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3772 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3070 u64 stripes; 3773 BTRFS_BLOCK_GROUP_RAID10)) {
3071 u32 last_stripe = 0; 3774 bbio->stripes[i].length = stripes_per_dev *
3072 int j; 3775 map->stripe_len;
3073 3776 if (i / sub_stripes < remaining_stripes)
3074 div_u64_rem(stripe_nr_end - 1, 3777 bbio->stripes[i].length +=
3075 map->num_stripes, 3778 map->stripe_len;
3076 &last_stripe); 3779 if (i < sub_stripes)
3077
3078 for (j = 0; j < map->num_stripes; j++) {
3079 u32 test;
3080
3081 div_u64_rem(stripe_nr_end - 1 - j,
3082 map->num_stripes, &test);
3083 if (test == stripe_index)
3084 break;
3085 }
3086 stripes = stripe_nr_end - 1 - j;
3087 do_div(stripes, map->num_stripes);
3088 bbio->stripes[i].length = map->stripe_len *
3089 (stripes - stripe_nr + 1);
3090
3091 if (i == 0) {
3092 bbio->stripes[i].length -=
3093 stripe_offset;
3094 stripe_offset = 0;
3095 }
3096 if (stripe_index == last_stripe)
3097 bbio->stripes[i].length -=
3098 stripe_end_offset;
3099 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3100 u64 stripes;
3101 int j;
3102 int factor = map->num_stripes /
3103 map->sub_stripes;
3104 u32 last_stripe = 0;
3105
3106 div_u64_rem(stripe_nr_end - 1,
3107 factor, &last_stripe);
3108 last_stripe *= map->sub_stripes;
3109
3110 for (j = 0; j < factor; j++) {
3111 u32 test;
3112
3113 div_u64_rem(stripe_nr_end - 1 - j,
3114 factor, &test);
3115
3116 if (test ==
3117 stripe_index / map->sub_stripes)
3118 break;
3119 }
3120 stripes = stripe_nr_end - 1 - j;
3121 do_div(stripes, factor);
3122 bbio->stripes[i].length = map->stripe_len *
3123 (stripes - stripe_nr + 1);
3124
3125 if (i < map->sub_stripes) {
3126 bbio->stripes[i].length -= 3780 bbio->stripes[i].length -=
3127 stripe_offset; 3781 stripe_offset;
3128 if (i == map->sub_stripes - 1) 3782 if ((i / sub_stripes + 1) %
3129 stripe_offset = 0; 3783 sub_stripes == remaining_stripes)
3130 }
3131 if (stripe_index >= last_stripe &&
3132 stripe_index <= (last_stripe +
3133 map->sub_stripes - 1)) {
3134 bbio->stripes[i].length -= 3784 bbio->stripes[i].length -=
3135 stripe_end_offset; 3785 stripe_end_offset;
3136 } 3786 if (i == sub_stripes - 1)
3787 stripe_offset = 0;
3137 } else 3788 } else
3138 bbio->stripes[i].length = *length; 3789 bbio->stripes[i].length = *length;
3139 3790
@@ -3155,15 +3806,22 @@ again:
3155 stripe_index++; 3806 stripe_index++;
3156 } 3807 }
3157 } 3808 }
3158 if (bbio_ret) { 3809
3159 *bbio_ret = bbio; 3810 if (rw & REQ_WRITE) {
3160 bbio->num_stripes = num_stripes; 3811 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3161 bbio->max_errors = max_errors; 3812 BTRFS_BLOCK_GROUP_RAID10 |
3162 bbio->mirror_num = mirror_num; 3813 BTRFS_BLOCK_GROUP_DUP)) {
3814 max_errors = 1;
3815 }
3163 } 3816 }
3817
3818 *bbio_ret = bbio;
3819 bbio->num_stripes = num_stripes;
3820 bbio->max_errors = max_errors;
3821 bbio->mirror_num = mirror_num;
3164out: 3822out:
3165 free_extent_map(em); 3823 free_extent_map(em);
3166 return 0; 3824 return ret;
3167} 3825}
3168 3826
3169int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3827int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
@@ -3304,7 +3962,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
3304 /* don't bother with additional async steps for reads, right now */ 3962 /* don't bother with additional async steps for reads, right now */
3305 if (!(rw & REQ_WRITE)) { 3963 if (!(rw & REQ_WRITE)) {
3306 bio_get(bio); 3964 bio_get(bio);
3307 submit_bio(rw, bio); 3965 btrfsic_submit_bio(rw, bio);
3308 bio_put(bio); 3966 bio_put(bio);
3309 return 0; 3967 return 0;
3310 } 3968 }
@@ -3399,7 +4057,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3399 if (async_submit) 4057 if (async_submit)
3400 schedule_bio(root, dev, rw, bio); 4058 schedule_bio(root, dev, rw, bio);
3401 else 4059 else
3402 submit_bio(rw, bio); 4060 btrfsic_submit_bio(rw, bio);
3403 } else { 4061 } else {
3404 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; 4062 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
3405 bio->bi_sector = logical >> 9; 4063 bio->bi_sector = logical >> 9;
@@ -3568,7 +4226,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3568 struct btrfs_fs_devices *fs_devices; 4226 struct btrfs_fs_devices *fs_devices;
3569 int ret; 4227 int ret;
3570 4228
3571 mutex_lock(&uuid_mutex); 4229 BUG_ON(!mutex_is_locked(&uuid_mutex));
3572 4230
3573 fs_devices = root->fs_info->fs_devices->seed; 4231 fs_devices = root->fs_info->fs_devices->seed;
3574 while (fs_devices) { 4232 while (fs_devices) {
@@ -3606,7 +4264,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3606 fs_devices->seed = root->fs_info->fs_devices->seed; 4264 fs_devices->seed = root->fs_info->fs_devices->seed;
3607 root->fs_info->fs_devices->seed = fs_devices; 4265 root->fs_info->fs_devices->seed = fs_devices;
3608out: 4266out:
3609 mutex_unlock(&uuid_mutex);
3610 return ret; 4267 return ret;
3611} 4268}
3612 4269
@@ -3749,6 +4406,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
3749 if (!path) 4406 if (!path)
3750 return -ENOMEM; 4407 return -ENOMEM;
3751 4408
4409 mutex_lock(&uuid_mutex);
4410 lock_chunks(root);
4411
3752 /* first we search for all of the device items, and then we 4412 /* first we search for all of the device items, and then we
3753 * read in all of the chunk items. This way we can create chunk 4413 * read in all of the chunk items. This way we can create chunk
3754 * mappings that reference all of the devices that are found 4414 * mappings that reference all of the devices that are found
@@ -3799,6 +4459,9 @@ again:
3799 } 4459 }
3800 ret = 0; 4460 ret = 0;
3801error: 4461error:
4462 unlock_chunks(root);
4463 mutex_unlock(&uuid_mutex);
4464
3802 btrfs_free_path(path); 4465 btrfs_free_path(path);
3803 return ret; 4466 return ret;
3804} 4467}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 78f2d4d4f37f..19ac95048b88 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -186,6 +186,51 @@ struct map_lookup {
186#define map_lookup_size(n) (sizeof(struct map_lookup) + \ 186#define map_lookup_size(n) (sizeof(struct map_lookup) + \
187 (sizeof(struct btrfs_bio_stripe) * (n))) 187 (sizeof(struct btrfs_bio_stripe) * (n)))
188 188
189/*
190 * Restriper's general type filter
191 */
192#define BTRFS_BALANCE_DATA (1ULL << 0)
193#define BTRFS_BALANCE_SYSTEM (1ULL << 1)
194#define BTRFS_BALANCE_METADATA (1ULL << 2)
195
196#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \
197 BTRFS_BALANCE_SYSTEM | \
198 BTRFS_BALANCE_METADATA)
199
200#define BTRFS_BALANCE_FORCE (1ULL << 3)
201#define BTRFS_BALANCE_RESUME (1ULL << 4)
202
203/*
204 * Balance filters
205 */
206#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
207#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1)
208#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
209#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
210#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
211
212/*
213 * Profile changing flags. When SOFT is set we won't relocate chunk if
214 * it already has the target profile (even though it may be
215 * half-filled).
216 */
217#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
218#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
219
220struct btrfs_balance_args;
221struct btrfs_balance_progress;
222struct btrfs_balance_control {
223 struct btrfs_fs_info *fs_info;
224
225 struct btrfs_balance_args data;
226 struct btrfs_balance_args meta;
227 struct btrfs_balance_args sys;
228
229 u64 flags;
230
231 struct btrfs_balance_progress stat;
232};
233
189int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 234int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
190 u64 end, u64 *length); 235 u64 end, u64 *length);
191 236
@@ -228,9 +273,12 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
228 u8 *uuid, u8 *fsid); 273 u8 *uuid, u8 *fsid);
229int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 274int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
230int btrfs_init_new_device(struct btrfs_root *root, char *path); 275int btrfs_init_new_device(struct btrfs_root *root, char *path);
231int btrfs_balance(struct btrfs_root *dev_root); 276int btrfs_balance(struct btrfs_balance_control *bctl,
277 struct btrfs_ioctl_balance_args *bargs);
278int btrfs_recover_balance(struct btrfs_root *tree_root);
279int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
280int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
232int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 281int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
233int find_free_dev_extent(struct btrfs_trans_handle *trans, 282int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
234 struct btrfs_device *device, u64 num_bytes,
235 u64 *start, u64 *max_avail); 283 u64 *start, u64 *max_avail);
236#endif 284#endif
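To see how the new constants compose, here is a hypothetical btrfs_balance_control set up for a 'soft' conversion of metadata and system chunks to RAID1. Every identifier comes from the declarations above except fs_info, which is assumed to be in scope; in real use the structure is handed to btrfs_balance(), which keeps or frees it.

	/* hypothetical builder for a metadata/system RAID1 conversion */
	static struct btrfs_balance_control *
	build_raid1_convert_bctl(struct btrfs_fs_info *fs_info)
	{
		struct btrfs_balance_control *bctl;

		bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
		if (!bctl)
			return NULL;

		bctl->fs_info = fs_info;
		/* touch only metadata and system chunks */
		bctl->flags = BTRFS_BALANCE_METADATA | BTRFS_BALANCE_SYSTEM;

		/* convert to RAID1; SOFT skips chunks already in the target profile */
		bctl->meta.flags = BTRFS_BALANCE_ARGS_CONVERT | BTRFS_BALANCE_ARGS_SOFT;
		bctl->meta.target = BTRFS_BLOCK_GROUP_RAID1;
		bctl->sys.flags = bctl->meta.flags;
		bctl->sys.target = bctl->meta.target;

		return bctl;	/* ready to pass to btrfs_balance() */
	}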
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3848b04e310e..e7a5659087e6 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
200 ret = btrfs_update_inode(trans, root, inode); 200 ret = btrfs_update_inode(trans, root, inode);
201 BUG_ON(ret); 201 BUG_ON(ret);
202out: 202out:
203 btrfs_end_transaction_throttle(trans, root); 203 btrfs_end_transaction(trans, root);
204 return ret; 204 return ret;
205} 205}
206 206
diff --git a/fs/namei.c b/fs/namei.c
index c283a1ec008e..208c6aa4a989 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page)
140 140
141static char *getname_flags(const char __user *filename, int flags, int *empty) 141static char *getname_flags(const char __user *filename, int flags, int *empty)
142{ 142{
143 char *tmp, *result; 143 char *result = __getname();
144 144 int retval;
145 result = ERR_PTR(-ENOMEM); 145
146 tmp = __getname(); 146 if (!result)
147 if (tmp) { 147 return ERR_PTR(-ENOMEM);
148 int retval = do_getname(filename, tmp); 148
149 149 retval = do_getname(filename, result);
150 result = tmp; 150 if (retval < 0) {
151 if (retval < 0) { 151 if (retval == -ENOENT && empty)
152 if (retval == -ENOENT && empty) 152 *empty = 1;
153 *empty = 1; 153 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
154 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { 154 __putname(result);
155 __putname(tmp); 155 return ERR_PTR(retval);
156 result = ERR_PTR(retval);
157 }
158 } 156 }
159 } 157 }
160 audit_getname(result); 158 audit_getname(result);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5485a5388ecb..9cde9edf9c4d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -198,65 +198,7 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
198 return result; 198 return result;
199} 199}
200 200
201static struct mm_struct *__check_mem_permission(struct task_struct *task) 201static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
202{
203 struct mm_struct *mm;
204
205 mm = get_task_mm(task);
206 if (!mm)
207 return ERR_PTR(-EINVAL);
208
209 /*
210 * A task can always look at itself, in case it chooses
211 * to use system calls instead of load instructions.
212 */
213 if (task == current)
214 return mm;
215
216 /*
217 * If current is actively ptrace'ing, and would also be
218 * permitted to freshly attach with ptrace now, permit it.
219 */
220 if (task_is_stopped_or_traced(task)) {
221 int match;
222 rcu_read_lock();
223 match = (ptrace_parent(task) == current);
224 rcu_read_unlock();
225 if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
226 return mm;
227 }
228
229 /*
230 * No one else is allowed.
231 */
232 mmput(mm);
233 return ERR_PTR(-EPERM);
234}
235
236/*
237 * If current may access user memory in @task return a reference to the
238 * corresponding mm, otherwise ERR_PTR.
239 */
240static struct mm_struct *check_mem_permission(struct task_struct *task)
241{
242 struct mm_struct *mm;
243 int err;
244
245 /*
246 * Avoid racing if task exec's as we might get a new mm but validate
247 * against old credentials.
248 */
249 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
250 if (err)
251 return ERR_PTR(err);
252
253 mm = __check_mem_permission(task);
254 mutex_unlock(&task->signal->cred_guard_mutex);
255
256 return mm;
257}
258
259struct mm_struct *mm_for_maps(struct task_struct *task)
260{ 202{
261 struct mm_struct *mm; 203 struct mm_struct *mm;
262 int err; 204 int err;
@@ -267,7 +209,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
267 209
268 mm = get_task_mm(task); 210 mm = get_task_mm(task);
269 if (mm && mm != current->mm && 211 if (mm && mm != current->mm &&
270 !ptrace_may_access(task, PTRACE_MODE_READ)) { 212 !ptrace_may_access(task, mode)) {
271 mmput(mm); 213 mmput(mm);
272 mm = ERR_PTR(-EACCES); 214 mm = ERR_PTR(-EACCES);
273 } 215 }
@@ -276,6 +218,11 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
276 return mm; 218 return mm;
277} 219}
278 220
221struct mm_struct *mm_for_maps(struct task_struct *task)
222{
223 return mm_access(task, PTRACE_MODE_READ);
224}
225
279static int proc_pid_cmdline(struct task_struct *task, char * buffer) 226static int proc_pid_cmdline(struct task_struct *task, char * buffer)
280{ 227{
281 int res = 0; 228 int res = 0;
@@ -752,38 +699,39 @@ static const struct file_operations proc_single_file_operations = {
752 699
753static int mem_open(struct inode* inode, struct file* file) 700static int mem_open(struct inode* inode, struct file* file)
754{ 701{
755 file->private_data = (void*)((long)current->self_exec_id); 702 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
703 struct mm_struct *mm;
704
705 if (!task)
706 return -ESRCH;
707
708 mm = mm_access(task, PTRACE_MODE_ATTACH);
709 put_task_struct(task);
710
711 if (IS_ERR(mm))
712 return PTR_ERR(mm);
713
756 /* OK to pass negative loff_t, we can catch out-of-range */ 714 /* OK to pass negative loff_t, we can catch out-of-range */
757 file->f_mode |= FMODE_UNSIGNED_OFFSET; 715 file->f_mode |= FMODE_UNSIGNED_OFFSET;
716 file->private_data = mm;
717
758 return 0; 718 return 0;
759} 719}
760 720
761static ssize_t mem_read(struct file * file, char __user * buf, 721static ssize_t mem_read(struct file * file, char __user * buf,
762 size_t count, loff_t *ppos) 722 size_t count, loff_t *ppos)
763{ 723{
764 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 724 int ret;
765 char *page; 725 char *page;
766 unsigned long src = *ppos; 726 unsigned long src = *ppos;
767 int ret = -ESRCH; 727 struct mm_struct *mm = file->private_data;
768 struct mm_struct *mm;
769 728
770 if (!task) 729 if (!mm)
771 goto out_no_task; 730 return 0;
772 731
773 ret = -ENOMEM;
774 page = (char *)__get_free_page(GFP_TEMPORARY); 732 page = (char *)__get_free_page(GFP_TEMPORARY);
775 if (!page) 733 if (!page)
776 goto out; 734 return -ENOMEM;
777
778 mm = check_mem_permission(task);
779 ret = PTR_ERR(mm);
780 if (IS_ERR(mm))
781 goto out_free;
782
783 ret = -EIO;
784
785 if (file->private_data != (void*)((long)current->self_exec_id))
786 goto out_put;
787 735
788 ret = 0; 736 ret = 0;
789 737
@@ -810,13 +758,7 @@ static ssize_t mem_read(struct file * file, char __user * buf,
810 } 758 }
811 *ppos = src; 759 *ppos = src;
812 760
813out_put:
814 mmput(mm);
815out_free:
816 free_page((unsigned long) page); 761 free_page((unsigned long) page);
817out:
818 put_task_struct(task);
819out_no_task:
820 return ret; 762 return ret;
821} 763}
822 764
@@ -825,27 +767,15 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
825{ 767{
826 int copied; 768 int copied;
827 char *page; 769 char *page;
828 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
829 unsigned long dst = *ppos; 770 unsigned long dst = *ppos;
830 struct mm_struct *mm; 771 struct mm_struct *mm = file->private_data;
831 772
832 copied = -ESRCH; 773 if (!mm)
833 if (!task) 774 return 0;
834 goto out_no_task;
835 775
836 copied = -ENOMEM;
837 page = (char *)__get_free_page(GFP_TEMPORARY); 776 page = (char *)__get_free_page(GFP_TEMPORARY);
838 if (!page) 777 if (!page)
839 goto out_task; 778 return -ENOMEM;
840
841 mm = check_mem_permission(task);
842 copied = PTR_ERR(mm);
843 if (IS_ERR(mm))
844 goto out_free;
845
846 copied = -EIO;
847 if (file->private_data != (void *)((long)current->self_exec_id))
848 goto out_mm;
849 779
850 copied = 0; 780 copied = 0;
851 while (count > 0) { 781 while (count > 0) {
@@ -869,13 +799,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
869 } 799 }
870 *ppos = dst; 800 *ppos = dst;
871 801
872out_mm:
873 mmput(mm);
874out_free:
875 free_page((unsigned long) page); 802 free_page((unsigned long) page);
876out_task:
877 put_task_struct(task);
878out_no_task:
879 return copied; 803 return copied;
880} 804}
881 805
@@ -895,11 +819,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
895 return file->f_pos; 819 return file->f_pos;
896} 820}
897 821
822static int mem_release(struct inode *inode, struct file *file)
823{
824 struct mm_struct *mm = file->private_data;
825
826 mmput(mm);
827 return 0;
828}
829
898static const struct file_operations proc_mem_operations = { 830static const struct file_operations proc_mem_operations = {
899 .llseek = mem_lseek, 831 .llseek = mem_lseek,
900 .read = mem_read, 832 .read = mem_read,
901 .write = mem_write, 833 .write = mem_write,
902 .open = mem_open, 834 .open = mem_open,
835 .release = mem_release,
903}; 836};
904 837
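The rework above moves the permission check to open time, via mm_access(task, PTRACE_MODE_ATTACH), and pins the mm in file->private_data until release, instead of re-checking self_exec_id on every access. The userspace interface is unchanged; a minimal reader sketch follows, with the pid and address as placeholder example values.

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char path[64];
		char buf[16];
		uint64_t addr = 0x400000;	/* placeholder address */

		snprintf(path, sizeof(path), "/proc/%d/mem", 1234);	/* placeholder pid */
		int fd = open(path, O_RDONLY);	/* permission decided here, ptrace attach rules */
		if (fd < 0)
			return 1;
		/* reads go through the mm that was pinned at open time */
		ssize_t n = pread(fd, buf, sizeof(buf), addr);
		close(fd);
		return n < 0;
	}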
905static ssize_t environ_read(struct file *file, char __user *buf, 838static ssize_t environ_read(struct file *file, char __user *buf,
@@ -1199,9 +1132,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1199 ssize_t length; 1132 ssize_t length;
1200 uid_t loginuid; 1133 uid_t loginuid;
1201 1134
1202 if (!capable(CAP_AUDIT_CONTROL))
1203 return -EPERM;
1204
1205 rcu_read_lock(); 1135 rcu_read_lock();
1206 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { 1136 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
1207 rcu_read_unlock(); 1137 rcu_read_unlock();
@@ -1230,7 +1160,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1230 goto out_free_page; 1160 goto out_free_page;
1231 1161
1232 } 1162 }
1233 length = audit_set_loginuid(current, loginuid); 1163 length = audit_set_loginuid(loginuid);
1234 if (likely(length == 0)) 1164 if (likely(length == 0))
1235 length = count; 1165 length = count;
1236 1166
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee9b625..74b9baf36ac3 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -111,8 +111,7 @@ xfs_ioend_new_eof(
111 xfs_fsize_t bsize; 111 xfs_fsize_t bsize;
112 112
113 bsize = ioend->io_offset + ioend->io_size; 113 bsize = ioend->io_offset + ioend->io_size;
114 isize = MAX(ip->i_size, ip->i_new_size); 114 isize = MIN(i_size_read(VFS_I(ip)), bsize);
115 isize = MIN(isize, bsize);
116 return isize > ip->i_d.di_size ? isize : 0; 115 return isize > ip->i_d.di_size ? isize : 0;
117} 116}
118 117
@@ -126,11 +125,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
126} 125}
127 126
128/* 127/*
129 * Update on-disk file size now that data has been written to disk. The 128 * Update on-disk file size now that data has been written to disk.
130 * current in-memory file size is i_size. If a write is beyond eof i_new_size
131 * will be the intended file size until i_size is updated. If this write does
132 * not extend all the way to the valid file size then restrict this update to
133 * the end of the write.
134 * 129 *
135 * This function does not block as blocking on the inode lock in IO completion 130 * This function does not block as blocking on the inode lock in IO completion
136 * can lead to IO completion order dependency deadlocks. If it can't get the 131 * can lead to IO completion order dependency deadlocks. If it can't get the
@@ -1279,6 +1274,15 @@ xfs_end_io_direct_write(
1279 struct xfs_ioend *ioend = iocb->private; 1274 struct xfs_ioend *ioend = iocb->private;
1280 1275
1281 /* 1276 /*
1277 * While the generic direct I/O code updates the inode size, it does
1278 * so only after the end_io handler is called, which means our
1279 * end_io handler thinks the on-disk size is outside the in-core
1280 * size. To prevent this just update it a little bit earlier here.
1281 */
1282 if (offset + size > i_size_read(ioend->io_inode))
1283 i_size_write(ioend->io_inode, offset + size);
1284
1285 /*
1282 * blockdev_direct_IO can return an error even after the I/O 1286 * blockdev_direct_IO can return an error even after the I/O
1283 * completion handler was called. Thus we need to protect 1287 * completion handler was called. Thus we need to protect
1284 * against double-freeing. 1288 * against double-freeing.
@@ -1340,12 +1344,11 @@ xfs_vm_write_failed(
1340 1344
1341 if (to > inode->i_size) { 1345 if (to > inode->i_size) {
1342 /* 1346 /*
1343 * punch out the delalloc blocks we have already allocated. We 1347 * Punch out the delalloc blocks we have already allocated.
1344 * don't call xfs_setattr() to do this as we may be in the 1348 *
1345 * middle of a multi-iovec write and so the vfs inode->i_size 1349 * Don't bother with xfs_setattr given that nothing can have
1346 * will not match the xfs ip->i_size and so it will zero too 1350 * made it to disk yet as the page is still locked at this
1347 * much. Hence we just truncate the page cache to zero what is 1351 * point.
1348 * necessary and punch the delalloc blocks directly.
1349 */ 1352 */
1350 struct xfs_inode *ip = XFS_I(inode); 1353 struct xfs_inode *ip = XFS_I(inode);
1351 xfs_fileoff_t start_fsb; 1354 xfs_fileoff_t start_fsb;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 1e5d97f86ea8..08b9ac644c31 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
827 if (error) 827 if (error)
828 goto out; 828 goto out;
829 829
830 /*
831 * Commit the last in the sequence of transactions.
832 */
833 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
834 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); 830 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
835 xfs_iunlock(dp, XFS_ILOCK_EXCL); 831 xfs_iunlock(dp, XFS_ILOCK_EXCL);
836 832
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index c1b55e596551..d25eafd4d28d 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -271,10 +271,6 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
271 dp = args->dp; 271 dp = args->dp;
272 mp = dp->i_mount; 272 mp = dp->i_mount;
273 dp->i_d.di_forkoff = forkoff; 273 dp->i_d.di_forkoff = forkoff;
274 dp->i_df.if_ext_max =
275 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
276 dp->i_afp->if_ext_max =
277 XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
278 274
279 ifp = dp->i_afp; 275 ifp = dp->i_afp;
280 ASSERT(ifp->if_flags & XFS_IFINLINE); 276 ASSERT(ifp->if_flags & XFS_IFINLINE);
@@ -326,7 +322,6 @@ xfs_attr_fork_reset(
326 ASSERT(ip->i_d.di_anextents == 0); 322 ASSERT(ip->i_d.di_anextents == 0);
327 ASSERT(ip->i_afp == NULL); 323 ASSERT(ip->i_afp == NULL);
328 324
329 ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
330 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 325 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
331} 326}
332 327
@@ -389,10 +384,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
389 (args->op_flags & XFS_DA_OP_ADDNAME) || 384 (args->op_flags & XFS_DA_OP_ADDNAME) ||
390 !(mp->m_flags & XFS_MOUNT_ATTR2) || 385 !(mp->m_flags & XFS_MOUNT_ATTR2) ||
391 dp->i_d.di_format == XFS_DINODE_FMT_BTREE); 386 dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
392 dp->i_afp->if_ext_max =
393 XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
394 dp->i_df.if_ext_max =
395 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
396 xfs_trans_log_inode(args->trans, dp, 387 xfs_trans_log_inode(args->trans, dp,
397 XFS_ILOG_CORE | XFS_ILOG_ADATA); 388 XFS_ILOG_CORE | XFS_ILOG_ADATA);
398 } 389 }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index d0ab78837057..188ef2fbd628 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge(
249} 249}
250 250
251/* 251/*
252* Update the record referred to by cur to the value given 252 * Check if the inode needs to be converted to btree format.
253 */
254static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
255{
256 return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
257 XFS_IFORK_NEXTENTS(ip, whichfork) >
258 XFS_IFORK_MAXEXT(ip, whichfork);
259}
260
261/*
262 * Check if the inode should be converted to extent format.
263 */
264static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
265{
266 return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
267 XFS_IFORK_NEXTENTS(ip, whichfork) <=
268 XFS_IFORK_MAXEXT(ip, whichfork);
269}
270
271/*
272 * Update the record referred to by cur to the value given
253 * by [off, bno, len, state]. 273 * by [off, bno, len, state].
254 * This either works (return 0) or gets an EFSCORRUPTED error. 274 * This either works (return 0) or gets an EFSCORRUPTED error.
255 */ 275 */
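The two helpers above lean on XFS_IFORK_MAXEXT(), which replaces the cached if_ext_max value. Judging from the assertions this series removes further down (ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)), the macro is simply that quotient computed on demand. The sketch below is an inference from those assertions, not a line taken from this diff:

	/* assumed definition: how many extent records fit in the inline fork area */
	#define XFS_IFORK_MAXEXT(ip, w) \
		(XFS_IFORK_SIZE(ip, w) / (uint)sizeof(xfs_bmbt_rec_t))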
@@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real(
683 goto done; 703 goto done;
684 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 704 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
685 } 705 }
686 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 706
687 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { 707 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
688 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, 708 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
689 bma->firstblock, bma->flist, 709 bma->firstblock, bma->flist,
690 &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); 710 &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
@@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real(
767 goto done; 787 goto done;
768 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 788 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
769 } 789 }
770 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 790
771 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { 791 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
772 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, 792 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
773 bma->firstblock, bma->flist, &bma->cur, 1, 793 bma->firstblock, bma->flist, &bma->cur, 1,
774 &tmp_rval, XFS_DATA_FORK); 794 &tmp_rval, XFS_DATA_FORK);
@@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real(
836 goto done; 856 goto done;
837 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 857 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
838 } 858 }
839 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 859
840 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { 860 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
841 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, 861 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
842 bma->firstblock, bma->flist, &bma->cur, 862 bma->firstblock, bma->flist, &bma->cur,
843 1, &tmp_rval, XFS_DATA_FORK); 863 1, &tmp_rval, XFS_DATA_FORK);
@@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real(
884 } 904 }
885 905
886 /* convert to a btree if necessary */ 906 /* convert to a btree if necessary */
887 if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && 907 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
888 XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
889 int tmp_logflags; /* partial log flag return val */ 908 int tmp_logflags; /* partial log flag return val */
890 909
891 ASSERT(bma->cur == NULL); 910 ASSERT(bma->cur == NULL);
@@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real(
1421 } 1440 }
1422 1441
1423 /* convert to a btree if necessary */ 1442 /* convert to a btree if necessary */
1424 if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && 1443 if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
1425 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
1426 int tmp_logflags; /* partial log flag return val */ 1444 int tmp_logflags; /* partial log flag return val */
1427 1445
1428 ASSERT(cur == NULL); 1446 ASSERT(cur == NULL);
@@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real(
1812 } 1830 }
1813 1831
1814 /* convert to a btree if necessary */ 1832 /* convert to a btree if necessary */
1815 if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && 1833 if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
1816 XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
1817 int tmp_logflags; /* partial log flag return val */ 1834 int tmp_logflags; /* partial log flag return val */
1818 1835
1819 ASSERT(bma->cur == NULL); 1836 ASSERT(bma->cur == NULL);
@@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree(
3037 3054
3038 ifp = XFS_IFORK_PTR(ip, whichfork); 3055 ifp = XFS_IFORK_PTR(ip, whichfork);
3039 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); 3056 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
3040 ASSERT(ifp->if_ext_max == 3057
3041 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
3042 /* 3058 /*
3043 * Make space in the inode incore. 3059 * Make space in the inode incore.
3044 */ 3060 */
@@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset(
3184 ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { 3200 ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
3185 uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; 3201 uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
3186 3202
3187 if (dfl_forkoff > ip->i_d.di_forkoff) { 3203 if (dfl_forkoff > ip->i_d.di_forkoff)
3188 ip->i_d.di_forkoff = dfl_forkoff; 3204 ip->i_d.di_forkoff = dfl_forkoff;
3189 ip->i_df.if_ext_max =
3190 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
3191 ip->i_afp->if_ext_max =
3192 XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
3193 }
3194 } 3205 }
3195} 3206}
3196 3207
@@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork(
3430 int error; /* error return value */ 3441 int error; /* error return value */
3431 3442
3432 ASSERT(XFS_IFORK_Q(ip) == 0); 3443 ASSERT(XFS_IFORK_Q(ip) == 0);
3433 ASSERT(ip->i_df.if_ext_max ==
3434 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3435 3444
3436 mp = ip->i_mount; 3445 mp = ip->i_mount;
3437 ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); 3446 ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork(
3486 error = XFS_ERROR(EINVAL); 3495 error = XFS_ERROR(EINVAL);
3487 goto error1; 3496 goto error1;
3488 } 3497 }
3489 ip->i_df.if_ext_max = 3498
3490 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
3491 ASSERT(ip->i_afp == NULL); 3499 ASSERT(ip->i_afp == NULL);
3492 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 3500 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
3493 ip->i_afp->if_ext_max =
3494 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
3495 ip->i_afp->if_flags = XFS_IFEXTENTS; 3501 ip->i_afp->if_flags = XFS_IFEXTENTS;
3496 logflags = 0; 3502 logflags = 0;
3497 xfs_bmap_init(&flist, &firstblock); 3503 xfs_bmap_init(&flist, &firstblock);
@@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork(
3535 } else 3541 } else
3536 spin_unlock(&mp->m_sb_lock); 3542 spin_unlock(&mp->m_sb_lock);
3537 } 3543 }
3538 if ((error = xfs_bmap_finish(&tp, &flist, &committed))) 3544
3545 error = xfs_bmap_finish(&tp, &flist, &committed);
3546 if (error)
3539 goto error2; 3547 goto error2;
3540 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 3548 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3541 ASSERT(ip->i_df.if_ext_max ==
3542 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3543 return error;
3544error2: 3549error2:
3545 xfs_bmap_cancel(&flist); 3550 xfs_bmap_cancel(&flist);
3546error1: 3551error1:
3547 xfs_iunlock(ip, XFS_ILOCK_EXCL); 3552 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3548error0: 3553error0:
3549 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 3554 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
3550 ASSERT(ip->i_df.if_ext_max ==
3551 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3552 return error; 3555 return error;
3553} 3556}
3554 3557
@@ -3994,11 +3997,8 @@ xfs_bmap_one_block(
3994 xfs_bmbt_irec_t s; /* internal version of extent */ 3997 xfs_bmbt_irec_t s; /* internal version of extent */
3995 3998
3996#ifndef DEBUG 3999#ifndef DEBUG
3997 if (whichfork == XFS_DATA_FORK) { 4000 if (whichfork == XFS_DATA_FORK)
3998 return S_ISREG(ip->i_d.di_mode) ? 4001 return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
3999 (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
4000 (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
4001 }
4002#endif /* !DEBUG */ 4002#endif /* !DEBUG */
4003 if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) 4003 if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
4004 return 0; 4004 return 0;
@@ -4010,7 +4010,7 @@ xfs_bmap_one_block(
4010 xfs_bmbt_get_all(ep, &s); 4010 xfs_bmbt_get_all(ep, &s);
4011 rval = s.br_startoff == 0 && s.br_blockcount == 1; 4011 rval = s.br_startoff == 0 && s.br_blockcount == 1;
4012 if (rval && whichfork == XFS_DATA_FORK) 4012 if (rval && whichfork == XFS_DATA_FORK)
4013 ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); 4013 ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
4014 return rval; 4014 return rval;
4015} 4015}
4016 4016
@@ -4379,8 +4379,6 @@ xfs_bmapi_read(
4379 XFS_STATS_INC(xs_blk_mapr); 4379 XFS_STATS_INC(xs_blk_mapr);
4380 4380
4381 ifp = XFS_IFORK_PTR(ip, whichfork); 4381 ifp = XFS_IFORK_PTR(ip, whichfork);
4382 ASSERT(ifp->if_ext_max ==
4383 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4384 4382
4385 if (!(ifp->if_flags & XFS_IFEXTENTS)) { 4383 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4386 error = xfs_iread_extents(NULL, ip, whichfork); 4384 error = xfs_iread_extents(NULL, ip, whichfork);
@@ -4871,8 +4869,6 @@ xfs_bmapi_write(
4871 return XFS_ERROR(EIO); 4869 return XFS_ERROR(EIO);
4872 4870
4873 ifp = XFS_IFORK_PTR(ip, whichfork); 4871 ifp = XFS_IFORK_PTR(ip, whichfork);
4874 ASSERT(ifp->if_ext_max ==
4875 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4876 4872
4877 XFS_STATS_INC(xs_blk_mapw); 4873 XFS_STATS_INC(xs_blk_mapw);
4878 4874
@@ -4981,8 +4977,7 @@ xfs_bmapi_write(
4981 /* 4977 /*
4982 * Transform from btree to extents, give it cur. 4978 * Transform from btree to extents, give it cur.
4983 */ 4979 */
4984 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && 4980 if (xfs_bmap_wants_extents(ip, whichfork)) {
4985 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
4986 int tmp_logflags = 0; 4981 int tmp_logflags = 0;
4987 4982
4988 ASSERT(bma.cur); 4983 ASSERT(bma.cur);
@@ -4992,10 +4987,10 @@ xfs_bmapi_write(
4992 if (error) 4987 if (error)
4993 goto error0; 4988 goto error0;
4994 } 4989 }
4995 ASSERT(ifp->if_ext_max == 4990
4996 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4997 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || 4991 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
4998 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); 4992 XFS_IFORK_NEXTENTS(ip, whichfork) >
4993 XFS_IFORK_MAXEXT(ip, whichfork));
4999 error = 0; 4994 error = 0;
5000error0: 4995error0:
5001 /* 4996 /*
@@ -5095,8 +5090,7 @@ xfs_bunmapi(
5095 5090
5096 ASSERT(len > 0); 5091 ASSERT(len > 0);
5097 ASSERT(nexts >= 0); 5092 ASSERT(nexts >= 0);
5098 ASSERT(ifp->if_ext_max == 5093
5099 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5100 if (!(ifp->if_flags & XFS_IFEXTENTS) && 5094 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5101 (error = xfs_iread_extents(tp, ip, whichfork))) 5095 (error = xfs_iread_extents(tp, ip, whichfork)))
5102 return error; 5096 return error;
@@ -5322,7 +5316,8 @@ xfs_bunmapi(
5322 */ 5316 */
5323 if (!wasdel && xfs_trans_get_block_res(tp) == 0 && 5317 if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
5324 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && 5318 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
5325 XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && 5319 XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
5320 XFS_IFORK_MAXEXT(ip, whichfork) &&
5326 del.br_startoff > got.br_startoff && 5321 del.br_startoff > got.br_startoff &&
5327 del.br_startoff + del.br_blockcount < 5322 del.br_startoff + del.br_blockcount <
5328 got.br_startoff + got.br_blockcount) { 5323 got.br_startoff + got.br_blockcount) {
@@ -5353,13 +5348,11 @@ nodelete:
5353 } 5348 }
5354 } 5349 }
5355 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; 5350 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
5356 ASSERT(ifp->if_ext_max == 5351
5357 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5358 /* 5352 /*
5359 * Convert to a btree if necessary. 5353 * Convert to a btree if necessary.
5360 */ 5354 */
5361 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && 5355 if (xfs_bmap_needs_btree(ip, whichfork)) {
5362 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
5363 ASSERT(cur == NULL); 5356 ASSERT(cur == NULL);
5364 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, 5357 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
5365 &cur, 0, &tmp_logflags, whichfork); 5358 &cur, 0, &tmp_logflags, whichfork);
@@ -5370,8 +5363,7 @@ nodelete:
5370 /* 5363 /*
5371 * transform from btree to extents, give it cur 5364 * transform from btree to extents, give it cur
5372 */ 5365 */
5373 else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && 5366 else if (xfs_bmap_wants_extents(ip, whichfork)) {
5374 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
5375 ASSERT(cur != NULL); 5367 ASSERT(cur != NULL);
5376 error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, 5368 error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
5377 whichfork); 5369 whichfork);
@@ -5382,8 +5374,6 @@ nodelete:
5382 /* 5374 /*
5383 * transform from extents to local? 5375 * transform from extents to local?
5384 */ 5376 */
5385 ASSERT(ifp->if_ext_max ==
5386 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5387 error = 0; 5377 error = 0;
5388error0: 5378error0:
5389 /* 5379 /*
@@ -5434,7 +5424,7 @@ xfs_getbmapx_fix_eof_hole(
5434 if (startblock == HOLESTARTBLOCK) { 5424 if (startblock == HOLESTARTBLOCK) {
5435 mp = ip->i_mount; 5425 mp = ip->i_mount;
5436 out->bmv_block = -1; 5426 out->bmv_block = -1;
5437 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); 5427 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
5438 fixlen -= out->bmv_offset; 5428 fixlen -= out->bmv_offset;
5439 if (prealloced && out->bmv_offset + out->bmv_length == end) { 5429 if (prealloced && out->bmv_offset + out->bmv_length == end) {
5440 /* Came to hole at EOF. Trim it. */ 5430 /* Came to hole at EOF. Trim it. */
@@ -5522,7 +5512,7 @@ xfs_getbmap(
5522 fixlen = XFS_MAXIOFFSET(mp); 5512 fixlen = XFS_MAXIOFFSET(mp);
5523 } else { 5513 } else {
5524 prealloced = 0; 5514 prealloced = 0;
5525 fixlen = ip->i_size; 5515 fixlen = XFS_ISIZE(ip);
5526 } 5516 }
5527 } 5517 }
5528 5518
@@ -5551,7 +5541,7 @@ xfs_getbmap(
5551 5541
5552 xfs_ilock(ip, XFS_IOLOCK_SHARED); 5542 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5553 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { 5543 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
5554 if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { 5544 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
5555 error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); 5545 error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
5556 if (error) 5546 if (error)
5557 goto out_unlock_iolock; 5547 goto out_unlock_iolock;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 654dc6f05bac..dd974a55c77d 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -163,12 +163,14 @@ xfs_swap_extents_check_format(
163 163
164 /* Check temp in extent form to max in target */ 164 /* Check temp in extent form to max in target */
165 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 165 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
166 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) 166 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
167 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
167 return EINVAL; 168 return EINVAL;
168 169
169 /* Check target in extent form to max in temp */ 170 /* Check target in extent form to max in temp */
170 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 171 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
171 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) 172 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
173 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
172 return EINVAL; 174 return EINVAL;
173 175
174 /* 176 /*
@@ -180,18 +182,25 @@ xfs_swap_extents_check_format(
180 * (a common defrag case) which will occur when the temp inode is in 182 * (a common defrag case) which will occur when the temp inode is in
181 * extent format... 183 * extent format...
182 */ 184 */
183 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && 185 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
184 ((XFS_IFORK_BOFF(ip) && 186 if (XFS_IFORK_BOFF(ip) &&
185 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || 187 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
186 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) 188 return EINVAL;
187 return EINVAL; 189 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
190 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
191 return EINVAL;
192 }
188 193
189 /* Reciprocal target->temp btree format checks */ 194 /* Reciprocal target->temp btree format checks */
190 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && 195 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
191 ((XFS_IFORK_BOFF(tip) && 196 if (XFS_IFORK_BOFF(tip) &&
192 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || 197 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
193 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) 198 return EINVAL;
194 return EINVAL; 199
200 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
201 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
202 return EINVAL;
203 }
195 204
196 return 0; 205 return 0;
197} 206}
@@ -349,16 +358,6 @@ xfs_swap_extents(
349 *tifp = *tempifp; /* struct copy */ 358 *tifp = *tempifp; /* struct copy */
350 359
351 /* 360 /*
352 * Fix the in-memory data fork values that are dependent on the fork
353 * offset in the inode. We can't assume they remain the same as attr2
354 * has dynamic fork offsets.
355 */
356 ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
357 (uint)sizeof(xfs_bmbt_rec_t);
358 tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
359 (uint)sizeof(xfs_bmbt_rec_t);
360
361 /*
362 * Fix the on-disk inode values 361 * Fix the on-disk inode values
363 */ 362 */
364 tmp = (__uint64_t)ip->i_d.di_nblocks; 363 tmp = (__uint64_t)ip->i_d.di_nblocks;
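
xfs_swap_extents_check_format() now derives the per-fork extent limit from the XFS_IFORK_MAXEXT() macro instead of the cached if_ext_max field that this patch removes from the fork structure. The arithmetic is simply the space available to the fork divided by the on-disk extent record size; a small standalone sketch with made-up sizes:

/*
 * Sketch of the arithmetic behind XFS_IFORK_MAXEXT(): the extent-format
 * limit is derived on demand from the fork size, so no cached per-fork
 * field is needed.  The sizes below are example values, not real layouts.
 */
#include <stdio.h>

#define BMBT_REC_SIZE	16	/* bytes per extent record (illustrative) */

static unsigned int fork_maxext(unsigned int fork_size_bytes)
{
	return fork_size_bytes / BMBT_REC_SIZE;
}

int main(void)
{
	/* e.g. a 256-byte literal area split 3/4 data, 1/4 attr */
	unsigned int data_fork = 192, attr_fork = 64;

	printf("max data extents in extent format: %u\n", fork_maxext(data_fork));
	printf("max attr extents in extent format: %u\n", fork_maxext(attr_fork));
	return 0;
}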
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f675f3d9d7b3..7e5bc872f2b4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -327,7 +327,7 @@ xfs_file_aio_read(
327 mp->m_rtdev_targp : mp->m_ddev_targp; 327 mp->m_rtdev_targp : mp->m_ddev_targp;
328 if ((iocb->ki_pos & target->bt_smask) || 328 if ((iocb->ki_pos & target->bt_smask) ||
329 (size & target->bt_smask)) { 329 (size & target->bt_smask)) {
330 if (iocb->ki_pos == ip->i_size) 330 if (iocb->ki_pos == i_size_read(inode))
331 return 0; 331 return 0;
332 return -XFS_ERROR(EINVAL); 332 return -XFS_ERROR(EINVAL);
333 } 333 }
@@ -412,51 +412,6 @@ xfs_file_splice_read(
412 return ret; 412 return ret;
413} 413}
414 414
415STATIC void
416xfs_aio_write_isize_update(
417 struct inode *inode,
418 loff_t *ppos,
419 ssize_t bytes_written)
420{
421 struct xfs_inode *ip = XFS_I(inode);
422 xfs_fsize_t isize = i_size_read(inode);
423
424 if (bytes_written > 0)
425 XFS_STATS_ADD(xs_write_bytes, bytes_written);
426
427 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
428 *ppos > isize))
429 *ppos = isize;
430
431 if (*ppos > ip->i_size) {
432 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
433 if (*ppos > ip->i_size)
434 ip->i_size = *ppos;
435 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
436 }
437}
438
439/*
440 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
441 * part of the I/O may have been written to disk before the error occurred. In
442 * this case the on-disk file size may have been adjusted beyond the in-memory
443 * file size and now needs to be truncated back.
444 */
445STATIC void
446xfs_aio_write_newsize_update(
447 struct xfs_inode *ip,
448 xfs_fsize_t new_size)
449{
450 if (new_size == ip->i_new_size) {
451 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
452 if (new_size == ip->i_new_size)
453 ip->i_new_size = 0;
454 if (ip->i_d.di_size > ip->i_size)
455 ip->i_d.di_size = ip->i_size;
456 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
457 }
458}
459
460/* 415/*
461 * xfs_file_splice_write() does not use xfs_rw_ilock() because 416 * xfs_file_splice_write() does not use xfs_rw_ilock() because
462 * generic_file_splice_write() takes the i_mutex itself. This, in theory, 417 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
@@ -475,7 +430,6 @@ xfs_file_splice_write(
475{ 430{
476 struct inode *inode = outfilp->f_mapping->host; 431 struct inode *inode = outfilp->f_mapping->host;
477 struct xfs_inode *ip = XFS_I(inode); 432 struct xfs_inode *ip = XFS_I(inode);
478 xfs_fsize_t new_size;
479 int ioflags = 0; 433 int ioflags = 0;
480 ssize_t ret; 434 ssize_t ret;
481 435
@@ -489,19 +443,12 @@ xfs_file_splice_write(
489 443
490 xfs_ilock(ip, XFS_IOLOCK_EXCL); 444 xfs_ilock(ip, XFS_IOLOCK_EXCL);
491 445
492 new_size = *ppos + count;
493
494 xfs_ilock(ip, XFS_ILOCK_EXCL);
495 if (new_size > ip->i_size)
496 ip->i_new_size = new_size;
497 xfs_iunlock(ip, XFS_ILOCK_EXCL);
498
499 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 446 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
500 447
501 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 448 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
449 if (ret > 0)
450 XFS_STATS_ADD(xs_write_bytes, ret);
502 451
503 xfs_aio_write_isize_update(inode, ppos, ret);
504 xfs_aio_write_newsize_update(ip, new_size);
505 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 452 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
506 return ret; 453 return ret;
507} 454}
@@ -689,28 +636,26 @@ out_lock:
689/* 636/*
690 * Common pre-write limit and setup checks. 637 * Common pre-write limit and setup checks.
691 * 638 *
692 * Returns with iolock held according to @iolock. 639 * Called with the iolocked held either shared and exclusive according to
640 * @iolock, and returns with it held. Might upgrade the iolock to exclusive
641 * if called for a direct write beyond i_size.
693 */ 642 */
694STATIC ssize_t 643STATIC ssize_t
695xfs_file_aio_write_checks( 644xfs_file_aio_write_checks(
696 struct file *file, 645 struct file *file,
697 loff_t *pos, 646 loff_t *pos,
698 size_t *count, 647 size_t *count,
699 xfs_fsize_t *new_sizep,
700 int *iolock) 648 int *iolock)
701{ 649{
702 struct inode *inode = file->f_mapping->host; 650 struct inode *inode = file->f_mapping->host;
703 struct xfs_inode *ip = XFS_I(inode); 651 struct xfs_inode *ip = XFS_I(inode);
704 xfs_fsize_t new_size;
705 int error = 0; 652 int error = 0;
706 653
707 xfs_rw_ilock(ip, XFS_ILOCK_EXCL); 654 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
708 *new_sizep = 0;
709restart: 655restart:
710 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); 656 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
711 if (error) { 657 if (error) {
712 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); 658 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
713 *iolock = 0;
714 return error; 659 return error;
715 } 660 }
716 661
@@ -720,36 +665,21 @@ restart:
720 /* 665 /*
721 * If the offset is beyond the size of the file, we need to zero any 666 * If the offset is beyond the size of the file, we need to zero any
722 * blocks that fall between the existing EOF and the start of this 667 * blocks that fall between the existing EOF and the start of this
723 * write. There is no need to issue zeroing if another in-flght IO ends 668 * write. If zeroing is needed and we are currently holding the
724 * at or before this one If zeronig is needed and we are currently 669 * iolock shared, we need to update it to exclusive which involves
725 * holding the iolock shared, we need to update it to exclusive which 670 * dropping all locks and relocking to maintain correct locking order.
726 * involves dropping all locks and relocking to maintain correct locking 671 * If we do this, restart the function to ensure all checks and values
727 * order. If we do this, restart the function to ensure all checks and 672 * are still valid.
728 * values are still valid.
729 */ 673 */
730 if ((ip->i_new_size && *pos > ip->i_new_size) || 674 if (*pos > i_size_read(inode)) {
731 (!ip->i_new_size && *pos > ip->i_size)) {
732 if (*iolock == XFS_IOLOCK_SHARED) { 675 if (*iolock == XFS_IOLOCK_SHARED) {
733 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); 676 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
734 *iolock = XFS_IOLOCK_EXCL; 677 *iolock = XFS_IOLOCK_EXCL;
735 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); 678 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
736 goto restart; 679 goto restart;
737 } 680 }
738 error = -xfs_zero_eof(ip, *pos, ip->i_size); 681 error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
739 } 682 }
740
741 /*
742 * If this IO extends beyond EOF, we may need to update ip->i_new_size.
743 * We have already zeroed space beyond EOF (if necessary). Only update
744 * ip->i_new_size if this IO ends beyond any other in-flight writes.
745 */
746 new_size = *pos + *count;
747 if (new_size > ip->i_size) {
748 if (new_size > ip->i_new_size)
749 ip->i_new_size = new_size;
750 *new_sizep = new_size;
751 }
752
753 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); 683 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
754 if (error) 684 if (error)
755 return error; 685 return error;
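
The reworked xfs_file_aio_write_checks() no longer consults i_new_size and instead compares the write offset against i_size_read(); when EOF zeroing is needed while the iolock is only held shared, it drops both locks, retakes them exclusive, and restarts the checks. A compact pthread-based sketch of that drop-relock-restart shape (the rwlock, helper names and sizes stand in for the XFS iolock and inode state):

/*
 * Userspace sketch of the "upgrade by unlock, relock, restart" pattern: a
 * shared lock cannot be upgraded in place, so drop it, take it exclusive,
 * and redo the checks because state may have changed while unlocked.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

static pthread_rwlock_t iolock = PTHREAD_RWLOCK_INITIALIZER;
static long long file_size = 4096;

static void write_checks(long long pos, bool *exclusive)
{
restart:
	if (pos > file_size && !*exclusive) {
		/* Need exclusive access for EOF zeroing: drop and relock. */
		pthread_rwlock_unlock(&iolock);
		pthread_rwlock_wrlock(&iolock);
		*exclusive = true;
		goto restart;	/* values may be stale, re-run the checks */
	}
	if (pos > file_size)
		printf("would zero the range [%lld, %lld)\n", file_size, pos);
}

int main(void)
{
	bool exclusive = false;

	pthread_rwlock_rdlock(&iolock);
	write_checks(8192, &exclusive);
	pthread_rwlock_unlock(&iolock);
	return 0;
}

Restarting matters because another writer may have extended the file while no lock was held, which is exactly what the in-code comment in the hunk warns about.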
@@ -794,9 +724,7 @@ xfs_file_dio_aio_write(
794 const struct iovec *iovp, 724 const struct iovec *iovp,
795 unsigned long nr_segs, 725 unsigned long nr_segs,
796 loff_t pos, 726 loff_t pos,
797 size_t ocount, 727 size_t ocount)
798 xfs_fsize_t *new_size,
799 int *iolock)
800{ 728{
801 struct file *file = iocb->ki_filp; 729 struct file *file = iocb->ki_filp;
802 struct address_space *mapping = file->f_mapping; 730 struct address_space *mapping = file->f_mapping;
@@ -806,10 +734,10 @@ xfs_file_dio_aio_write(
806 ssize_t ret = 0; 734 ssize_t ret = 0;
807 size_t count = ocount; 735 size_t count = ocount;
808 int unaligned_io = 0; 736 int unaligned_io = 0;
737 int iolock;
809 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 738 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
810 mp->m_rtdev_targp : mp->m_ddev_targp; 739 mp->m_rtdev_targp : mp->m_ddev_targp;
811 740
812 *iolock = 0;
813 if ((pos & target->bt_smask) || (count & target->bt_smask)) 741 if ((pos & target->bt_smask) || (count & target->bt_smask))
814 return -XFS_ERROR(EINVAL); 742 return -XFS_ERROR(EINVAL);
815 743
@@ -824,31 +752,31 @@ xfs_file_dio_aio_write(
824 * EOF zeroing cases and fill out the new inode size as appropriate. 752 * EOF zeroing cases and fill out the new inode size as appropriate.
825 */ 753 */
826 if (unaligned_io || mapping->nrpages) 754 if (unaligned_io || mapping->nrpages)
827 *iolock = XFS_IOLOCK_EXCL; 755 iolock = XFS_IOLOCK_EXCL;
828 else 756 else
829 *iolock = XFS_IOLOCK_SHARED; 757 iolock = XFS_IOLOCK_SHARED;
830 xfs_rw_ilock(ip, *iolock); 758 xfs_rw_ilock(ip, iolock);
831 759
832 /* 760 /*
833 * Recheck if there are cached pages that need invalidate after we got 761 * Recheck if there are cached pages that need invalidate after we got
834 * the iolock to protect against other threads adding new pages while 762 * the iolock to protect against other threads adding new pages while
835 * we were waiting for the iolock. 763 * we were waiting for the iolock.
836 */ 764 */
837 if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { 765 if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
838 xfs_rw_iunlock(ip, *iolock); 766 xfs_rw_iunlock(ip, iolock);
839 *iolock = XFS_IOLOCK_EXCL; 767 iolock = XFS_IOLOCK_EXCL;
840 xfs_rw_ilock(ip, *iolock); 768 xfs_rw_ilock(ip, iolock);
841 } 769 }
842 770
843 ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); 771 ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
844 if (ret) 772 if (ret)
845 return ret; 773 goto out;
846 774
847 if (mapping->nrpages) { 775 if (mapping->nrpages) {
848 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, 776 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
849 FI_REMAPF_LOCKED); 777 FI_REMAPF_LOCKED);
850 if (ret) 778 if (ret)
851 return ret; 779 goto out;
852 } 780 }
853 781
854 /* 782 /*
@@ -857,15 +785,18 @@ xfs_file_dio_aio_write(
857 */ 785 */
858 if (unaligned_io) 786 if (unaligned_io)
859 inode_dio_wait(inode); 787 inode_dio_wait(inode);
860 else if (*iolock == XFS_IOLOCK_EXCL) { 788 else if (iolock == XFS_IOLOCK_EXCL) {
861 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 789 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
862 *iolock = XFS_IOLOCK_SHARED; 790 iolock = XFS_IOLOCK_SHARED;
863 } 791 }
864 792
865 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); 793 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
866 ret = generic_file_direct_write(iocb, iovp, 794 ret = generic_file_direct_write(iocb, iovp,
867 &nr_segs, pos, &iocb->ki_pos, count, ocount); 795 &nr_segs, pos, &iocb->ki_pos, count, ocount);
868 796
797out:
798 xfs_rw_iunlock(ip, iolock);
799
869 /* No fallback to buffered IO on errors for XFS. */ 800 /* No fallback to buffered IO on errors for XFS. */
870 ASSERT(ret < 0 || ret == count); 801 ASSERT(ret < 0 || ret == count);
871 return ret; 802 return ret;
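
With the iolock turned into a local variable, every exit from xfs_file_dio_aio_write() now funnels through the out: label so the lock is released exactly once instead of leaking on the early-return paths the old code had. The same cleanup idiom in a self-contained toy, with malloc/free standing in for lock/unlock:

/*
 * Minimal illustration of the goto-based single-exit cleanup idiom used by
 * the rewritten direct I/O write path.  The resource here is just a buffer.
 */
#include <stdio.h>
#include <stdlib.h>

static int do_write(int fail_early)
{
	int ret = 0;
	char *buf = malloc(4096);	/* stands in for taking the iolock */

	if (!buf)
		return -1;

	if (fail_early) {
		ret = -1;
		goto out;		/* error: still release what we hold */
	}

	/* ... the actual write would happen here ... */
	ret = 4096;
out:
	free(buf);			/* single unlock/release point */
	return ret;
}

int main(void)
{
	printf("ok path:   %d\n", do_write(0));
	printf("fail path: %d\n", do_write(1));
	return 0;
}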
@@ -877,9 +808,7 @@ xfs_file_buffered_aio_write(
877 const struct iovec *iovp, 808 const struct iovec *iovp,
878 unsigned long nr_segs, 809 unsigned long nr_segs,
879 loff_t pos, 810 loff_t pos,
880 size_t ocount, 811 size_t ocount)
881 xfs_fsize_t *new_size,
882 int *iolock)
883{ 812{
884 struct file *file = iocb->ki_filp; 813 struct file *file = iocb->ki_filp;
885 struct address_space *mapping = file->f_mapping; 814 struct address_space *mapping = file->f_mapping;
@@ -887,14 +816,14 @@ xfs_file_buffered_aio_write(
887 struct xfs_inode *ip = XFS_I(inode); 816 struct xfs_inode *ip = XFS_I(inode);
888 ssize_t ret; 817 ssize_t ret;
889 int enospc = 0; 818 int enospc = 0;
819 int iolock = XFS_IOLOCK_EXCL;
890 size_t count = ocount; 820 size_t count = ocount;
891 821
892 *iolock = XFS_IOLOCK_EXCL; 822 xfs_rw_ilock(ip, iolock);
893 xfs_rw_ilock(ip, *iolock);
894 823
895 ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); 824 ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
896 if (ret) 825 if (ret)
897 return ret; 826 goto out;
898 827
899 /* We can write back this queue in page reclaim */ 828 /* We can write back this queue in page reclaim */
900 current->backing_dev_info = mapping->backing_dev_info; 829 current->backing_dev_info = mapping->backing_dev_info;
@@ -908,13 +837,15 @@ write_retry:
908 * page locks and retry *once* 837 * page locks and retry *once*
909 */ 838 */
910 if (ret == -ENOSPC && !enospc) { 839 if (ret == -ENOSPC && !enospc) {
911 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
912 if (ret)
913 return ret;
914 enospc = 1; 840 enospc = 1;
915 goto write_retry; 841 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
842 if (!ret)
843 goto write_retry;
916 } 844 }
845
917 current->backing_dev_info = NULL; 846 current->backing_dev_info = NULL;
847out:
848 xfs_rw_iunlock(ip, iolock);
918 return ret; 849 return ret;
919} 850}
920 851
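
The buffered-write ENOSPC handling is reordered so the retry flag is set before the flush and the retry only happens if the flush succeeded, which guarantees at most one retry and routes flush failures through the common unlock path. Minimal standalone model of that control flow (try_write() and flush_dirty_pages() are invented stubs):

/*
 * Toy model of the retry-once-on-ENOSPC logic: the first attempt fails,
 * dirty pages are flushed, and exactly one retry is allowed.
 */
#include <stdio.h>
#include <errno.h>

static int attempts;

static int try_write(void)
{
	/* First attempt "fails" with ENOSPC, second succeeds. */
	return ++attempts == 1 ? -ENOSPC : 4096;
}

static int flush_dirty_pages(void)
{
	return 0;	/* pretend the flush freed up space */
}

int main(void)
{
	int enospc = 0, ret;

write_retry:
	ret = try_write();
	if (ret == -ENOSPC && !enospc) {
		enospc = 1;			/* never retry more than once */
		if (flush_dirty_pages() == 0)
			goto write_retry;
	}
	printf("final result: %d after %d attempts\n", ret, attempts);
	return 0;
}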
@@ -930,9 +861,7 @@ xfs_file_aio_write(
930 struct inode *inode = mapping->host; 861 struct inode *inode = mapping->host;
931 struct xfs_inode *ip = XFS_I(inode); 862 struct xfs_inode *ip = XFS_I(inode);
932 ssize_t ret; 863 ssize_t ret;
933 int iolock;
934 size_t ocount = 0; 864 size_t ocount = 0;
935 xfs_fsize_t new_size = 0;
936 865
937 XFS_STATS_INC(xs_write_calls); 866 XFS_STATS_INC(xs_write_calls);
938 867
@@ -951,33 +880,22 @@ xfs_file_aio_write(
951 return -EIO; 880 return -EIO;
952 881
953 if (unlikely(file->f_flags & O_DIRECT)) 882 if (unlikely(file->f_flags & O_DIRECT))
954 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, 883 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
955 ocount, &new_size, &iolock);
956 else 884 else
957 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, 885 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
958 ocount, &new_size, &iolock); 886 ocount);
959
960 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
961 887
962 if (ret <= 0) 888 if (ret > 0) {
963 goto out_unlock; 889 ssize_t err;
964 890
965 /* Handle various SYNC-type writes */ 891 XFS_STATS_ADD(xs_write_bytes, ret);
966 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
967 loff_t end = pos + ret - 1;
968 int error;
969 892
970 xfs_rw_iunlock(ip, iolock); 893 /* Handle various SYNC-type writes */
971 error = xfs_file_fsync(file, pos, end, 894 err = generic_write_sync(file, pos, ret);
972 (file->f_flags & __O_SYNC) ? 0 : 1); 895 if (err < 0)
973 xfs_rw_ilock(ip, iolock); 896 ret = err;
974 if (error)
975 ret = error;
976 } 897 }
977 898
978out_unlock:
979 xfs_aio_write_newsize_update(ip, new_size);
980 xfs_rw_iunlock(ip, iolock);
981 return ret; 899 return ret;
982} 900}
983 901
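
xfs_file_aio_write() now leaves the O_DSYNC/IS_SYNC handling to generic_write_sync() after a successful write instead of open-coding an fsync while juggling the iolock. A loose userspace analogue of that shape, using plain write() and fdatasync() rather than the kernel helpers, and writing to a throwaway demo file:

/*
 * Rough analogue of "write, then sync only if the caller asked for it, and
 * let a sync failure override the byte count".  Not the kernel code path.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

static ssize_t write_maybe_sync(int fd, const void *buf, size_t len, int o_dsync)
{
	ssize_t ret = write(fd, buf, len);

	if (ret > 0 && o_dsync) {
		/* the error of the sync, if any, overrides the byte count */
		if (fdatasync(fd) < 0)
			ret = -1;
	}
	return ret;
}

int main(void)
{
	int fd = open("demo.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	const char msg[] = "hello\n";

	if (fd < 0)
		return 1;
	printf("wrote %zd bytes\n", write_maybe_sync(fd, msg, strlen(msg), 1));
	close(fd);
	return 0;
}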
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index ed88ed16811c..652b875a9d4c 100644
--- a/fs/xfs/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
@@ -90,7 +90,7 @@ xfs_wait_on_pages(
90 90
91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { 91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
92 return -filemap_fdatawait_range(mapping, first, 92 return -filemap_fdatawait_range(mapping, first,
93 last == -1 ? ip->i_size - 1 : last); 93 last == -1 ? XFS_ISIZE(ip) - 1 : last);
94 } 94 }
95 return 0; 95 return 0;
96} 96}
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 3960a066d7ff..8c3e46394d48 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -77,7 +77,7 @@ xfs_inode_alloc(
77 77
78 ASSERT(atomic_read(&ip->i_pincount) == 0); 78 ASSERT(atomic_read(&ip->i_pincount) == 0);
79 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 79 ASSERT(!spin_is_locked(&ip->i_flags_lock));
80 ASSERT(completion_done(&ip->i_flush)); 80 ASSERT(!xfs_isiflocked(ip));
81 ASSERT(ip->i_ino == 0); 81 ASSERT(ip->i_ino == 0);
82 82
83 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 83 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
@@ -94,8 +94,6 @@ xfs_inode_alloc(
94 ip->i_update_core = 0; 94 ip->i_update_core = 0;
95 ip->i_delayed_blks = 0; 95 ip->i_delayed_blks = 0;
96 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); 96 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
97 ip->i_size = 0;
98 ip->i_new_size = 0;
99 97
100 return ip; 98 return ip;
101} 99}
@@ -150,7 +148,7 @@ xfs_inode_free(
150 /* asserts to verify all state is correct here */ 148 /* asserts to verify all state is correct here */
151 ASSERT(atomic_read(&ip->i_pincount) == 0); 149 ASSERT(atomic_read(&ip->i_pincount) == 0);
152 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 150 ASSERT(!spin_is_locked(&ip->i_flags_lock));
153 ASSERT(completion_done(&ip->i_flush)); 151 ASSERT(!xfs_isiflocked(ip));
154 152
155 /* 153 /*
156 * Because we use RCU freeing we need to ensure the inode always 154 * Because we use RCU freeing we need to ensure the inode always
@@ -450,8 +448,6 @@ again:
450 448
451 *ipp = ip; 449 *ipp = ip;
452 450
453 ASSERT(ip->i_df.if_ext_max ==
454 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
455 /* 451 /*
456 * If we have a real type for an on-disk inode, we can set ops(&unlock) 452 * If we have a real type for an on-disk inode, we can set ops(&unlock)
457 * now. If it's a new inode being created, xfs_ialloc will handle it. 453 * now. If it's a new inode being created, xfs_ialloc will handle it.
@@ -715,3 +711,19 @@ xfs_isilocked(
715 return 0; 711 return 0;
716} 712}
717#endif 713#endif
714
715void
716__xfs_iflock(
717 struct xfs_inode *ip)
718{
719 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
720 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
721
722 do {
723 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
724 if (xfs_isiflocked(ip))
725 io_schedule();
726 } while (!xfs_iflock_nowait(ip));
727
728 finish_wait(wq, &wait.wait);
729}
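
__xfs_iflock(), added above, is the new slow path for taking the flush "lock", which after this patch is just the XFS_IFLOCK bit in i_flags plus a bit waitqueue rather than a counting completion. A simplified userspace model of the same shape, with a mutex and condition variable standing in for prepare_to_wait_exclusive()/io_schedule() and wake_up_bit():

/*
 * Model of a bit-style flush lock: the fast path is a test-and-set of a
 * busy flag, the slow path sleeps until it manages to set the flag itself.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  waitq = PTHREAD_COND_INITIALIZER;
static bool flush_busy;

static bool iflock_nowait(void)
{
	bool got;

	pthread_mutex_lock(&lock);
	got = !flush_busy;
	if (got)
		flush_busy = true;	/* test-and-set of the IFLOCK bit */
	pthread_mutex_unlock(&lock);
	return got;
}

static void iflock(void)
{
	pthread_mutex_lock(&lock);
	while (flush_busy)		/* like prepare_to_wait + io_schedule */
		pthread_cond_wait(&waitq, &lock);
	flush_busy = true;
	pthread_mutex_unlock(&lock);
}

static void ifunlock(void)
{
	pthread_mutex_lock(&lock);
	flush_busy = false;
	pthread_cond_signal(&waitq);	/* wake_up_bit() analogue */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	if (!iflock_nowait())
		iflock();
	printf("flush lock held\n");
	ifunlock();
	return 0;
}

The kernel's exclusive wait wakes one waiter at a time, which the single pthread_cond_signal() loosely mirrors here.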
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9dda7cc32848..b21022499c2e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -299,11 +299,8 @@ xfs_iformat(
299{ 299{
300 xfs_attr_shortform_t *atp; 300 xfs_attr_shortform_t *atp;
301 int size; 301 int size;
302 int error; 302 int error = 0;
303 xfs_fsize_t di_size; 303 xfs_fsize_t di_size;
304 ip->i_df.if_ext_max =
305 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
306 error = 0;
307 304
308 if (unlikely(be32_to_cpu(dip->di_nextents) + 305 if (unlikely(be32_to_cpu(dip->di_nextents) +
309 be16_to_cpu(dip->di_anextents) > 306 be16_to_cpu(dip->di_anextents) >
@@ -350,7 +347,6 @@ xfs_iformat(
350 return XFS_ERROR(EFSCORRUPTED); 347 return XFS_ERROR(EFSCORRUPTED);
351 } 348 }
352 ip->i_d.di_size = 0; 349 ip->i_d.di_size = 0;
353 ip->i_size = 0;
354 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); 350 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
355 break; 351 break;
356 352
@@ -409,10 +405,10 @@ xfs_iformat(
409 } 405 }
410 if (!XFS_DFORK_Q(dip)) 406 if (!XFS_DFORK_Q(dip))
411 return 0; 407 return 0;
408
412 ASSERT(ip->i_afp == NULL); 409 ASSERT(ip->i_afp == NULL);
413 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); 410 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
414 ip->i_afp->if_ext_max = 411
415 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
416 switch (dip->di_aformat) { 412 switch (dip->di_aformat) {
417 case XFS_DINODE_FMT_LOCAL: 413 case XFS_DINODE_FMT_LOCAL:
418 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); 414 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
@@ -604,10 +600,11 @@ xfs_iformat_btree(
604 * or the number of extents is greater than the number of 600 * or the number of extents is greater than the number of
605 * blocks. 601 * blocks.
606 */ 602 */
607 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max 603 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
608 || XFS_BMDR_SPACE_CALC(nrecs) > 604 XFS_IFORK_MAXEXT(ip, whichfork) ||
609 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 605 XFS_BMDR_SPACE_CALC(nrecs) >
610 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 606 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
607 XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
611 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", 608 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
612 (unsigned long long) ip->i_ino); 609 (unsigned long long) ip->i_ino);
613 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 610 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -835,12 +832,6 @@ xfs_iread(
835 * with the uninitialized part of it. 832 * with the uninitialized part of it.
836 */ 833 */
837 ip->i_d.di_mode = 0; 834 ip->i_d.di_mode = 0;
838 /*
839 * Initialize the per-fork minima and maxima for a new
840 * inode here. xfs_iformat will do it for old inodes.
841 */
842 ip->i_df.if_ext_max =
843 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
844 } 835 }
845 836
846 /* 837 /*
@@ -861,7 +852,6 @@ xfs_iread(
861 } 852 }
862 853
863 ip->i_delayed_blks = 0; 854 ip->i_delayed_blks = 0;
864 ip->i_size = ip->i_d.di_size;
865 855
866 /* 856 /*
867 * Mark the buffer containing the inode as something to keep 857 * Mark the buffer containing the inode as something to keep
@@ -1051,7 +1041,6 @@ xfs_ialloc(
1051 } 1041 }
1052 1042
1053 ip->i_d.di_size = 0; 1043 ip->i_d.di_size = 0;
1054 ip->i_size = 0;
1055 ip->i_d.di_nextents = 0; 1044 ip->i_d.di_nextents = 0;
1056 ASSERT(ip->i_d.di_nblocks == 0); 1045 ASSERT(ip->i_d.di_nblocks == 0);
1057 1046
@@ -1166,52 +1155,6 @@ xfs_ialloc(
1166} 1155}
1167 1156
1168/* 1157/*
1169 * Check to make sure that there are no blocks allocated to the
1170 * file beyond the size of the file. We don't check this for
1171 * files with fixed size extents or real time extents, but we
1172 * at least do it for regular files.
1173 */
1174#ifdef DEBUG
1175STATIC void
1176xfs_isize_check(
1177 struct xfs_inode *ip,
1178 xfs_fsize_t isize)
1179{
1180 struct xfs_mount *mp = ip->i_mount;
1181 xfs_fileoff_t map_first;
1182 int nimaps;
1183 xfs_bmbt_irec_t imaps[2];
1184 int error;
1185
1186 if (!S_ISREG(ip->i_d.di_mode))
1187 return;
1188
1189 if (XFS_IS_REALTIME_INODE(ip))
1190 return;
1191
1192 if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
1193 return;
1194
1195 nimaps = 2;
1196 map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
1197 /*
1198 * The filesystem could be shutting down, so bmapi may return
1199 * an error.
1200 */
1201 error = xfs_bmapi_read(ip, map_first,
1202 (XFS_B_TO_FSB(mp,
1203 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
1204 imaps, &nimaps, XFS_BMAPI_ENTIRE);
1205 if (error)
1206 return;
1207 ASSERT(nimaps == 1);
1208 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
1209}
1210#else /* DEBUG */
1211#define xfs_isize_check(ip, isize)
1212#endif /* DEBUG */
1213
1214/*
1215 * Free up the underlying blocks past new_size. The new size must be smaller 1158 * Free up the underlying blocks past new_size. The new size must be smaller
1216 * than the current size. This routine can be used both for the attribute and 1159 * than the current size. This routine can be used both for the attribute and
1217 * data fork, and does not modify the inode size, which is left to the caller. 1160 * data fork, and does not modify the inode size, which is left to the caller.
@@ -1252,12 +1195,14 @@ xfs_itruncate_extents(
1252 int done = 0; 1195 int done = 0;
1253 1196
1254 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 1197 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
1255 ASSERT(new_size <= ip->i_size); 1198 ASSERT(new_size <= XFS_ISIZE(ip));
1256 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 1199 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1257 ASSERT(ip->i_itemp != NULL); 1200 ASSERT(ip->i_itemp != NULL);
1258 ASSERT(ip->i_itemp->ili_lock_flags == 0); 1201 ASSERT(ip->i_itemp->ili_lock_flags == 0);
1259 ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); 1202 ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1260 1203
1204 trace_xfs_itruncate_extents_start(ip, new_size);
1205
1261 /* 1206 /*
1262 * Since it is possible for space to become allocated beyond 1207 * Since it is possible for space to become allocated beyond
1263 * the end of the file (in a crash where the space is allocated 1208 * the end of the file (in a crash where the space is allocated
@@ -1325,6 +1270,14 @@ xfs_itruncate_extents(
1325 goto out; 1270 goto out;
1326 } 1271 }
1327 1272
1273 /*
1274 * Always re-log the inode so that our permanent transaction can keep
1275 * on rolling it forward in the log.
1276 */
1277 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1278
1279 trace_xfs_itruncate_extents_end(ip, new_size);
1280
1328out: 1281out:
1329 *tpp = tp; 1282 *tpp = tp;
1330 return error; 1283 return error;
@@ -1338,74 +1291,6 @@ out_bmap_cancel:
1338 goto out; 1291 goto out;
1339} 1292}
1340 1293
1341int
1342xfs_itruncate_data(
1343 struct xfs_trans **tpp,
1344 struct xfs_inode *ip,
1345 xfs_fsize_t new_size)
1346{
1347 int error;
1348
1349 trace_xfs_itruncate_data_start(ip, new_size);
1350
1351 /*
1352 * The first thing we do is set the size to new_size permanently on
1353 * disk. This way we don't have to worry about anyone ever being able
1354 * to look at the data being freed even in the face of a crash.
1355 * What we're getting around here is the case where we free a block, it
1356 * is allocated to another file, it is written to, and then we crash.
1357 * If the new data gets written to the file but the log buffers
1358 * containing the free and reallocation don't, then we'd end up with
1359 * garbage in the blocks being freed. As long as we make the new_size
1360 * permanent before actually freeing any blocks it doesn't matter if
1361 * they get written to.
1362 */
1363 if (ip->i_d.di_nextents > 0) {
1364 /*
1365 * If we are not changing the file size then do not update
1366 * the on-disk file size - we may be called from
1367 * xfs_inactive_free_eofblocks(). If we update the on-disk
1368 * file size and then the system crashes before the contents
1369 * of the file are flushed to disk then the files may be
1370 * full of holes (ie NULL files bug).
1371 */
1372 if (ip->i_size != new_size) {
1373 ip->i_d.di_size = new_size;
1374 ip->i_size = new_size;
1375 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1376 }
1377 }
1378
1379 error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
1380 if (error)
1381 return error;
1382
1383 /*
1384 * If we are not changing the file size then do not update the on-disk
1385 * file size - we may be called from xfs_inactive_free_eofblocks().
1386 * If we update the on-disk file size and then the system crashes
1387 * before the contents of the file are flushed to disk then the files
1388 * may be full of holes (ie NULL files bug).
1389 */
1390 xfs_isize_check(ip, new_size);
1391 if (ip->i_size != new_size) {
1392 ip->i_d.di_size = new_size;
1393 ip->i_size = new_size;
1394 }
1395
1396 ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
1397 ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
1398
1399 /*
1400 * Always re-log the inode so that our permanent transaction can keep
1401 * on rolling it forward in the log.
1402 */
1403 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1404
1405 trace_xfs_itruncate_data_end(ip, new_size);
1406 return 0;
1407}
1408
1409/* 1294/*
1410 * This is called when the inode's link count goes to 0. 1295 * This is called when the inode's link count goes to 0.
1411 * We place the on-disk inode on a list in the AGI. It 1296 * We place the on-disk inode on a list in the AGI. It
@@ -1824,8 +1709,7 @@ xfs_ifree(
1824 ASSERT(ip->i_d.di_nlink == 0); 1709 ASSERT(ip->i_d.di_nlink == 0);
1825 ASSERT(ip->i_d.di_nextents == 0); 1710 ASSERT(ip->i_d.di_nextents == 0);
1826 ASSERT(ip->i_d.di_anextents == 0); 1711 ASSERT(ip->i_d.di_anextents == 0);
1827 ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || 1712 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
1828 (!S_ISREG(ip->i_d.di_mode)));
1829 ASSERT(ip->i_d.di_nblocks == 0); 1713 ASSERT(ip->i_d.di_nblocks == 0);
1830 1714
1831 /* 1715 /*
@@ -1844,8 +1728,6 @@ xfs_ifree(
1844 ip->i_d.di_flags = 0; 1728 ip->i_d.di_flags = 0;
1845 ip->i_d.di_dmevmask = 0; 1729 ip->i_d.di_dmevmask = 0;
1846 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ 1730 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
1847 ip->i_df.if_ext_max =
1848 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
1849 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; 1731 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1850 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 1732 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1851 /* 1733 /*
@@ -2151,7 +2033,7 @@ xfs_idestroy_fork(
2151 * once someone is waiting for it to be unpinned. 2033 * once someone is waiting for it to be unpinned.
2152 */ 2034 */
2153static void 2035static void
2154xfs_iunpin_nowait( 2036xfs_iunpin(
2155 struct xfs_inode *ip) 2037 struct xfs_inode *ip)
2156{ 2038{
2157 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2039 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
@@ -2163,14 +2045,29 @@ xfs_iunpin_nowait(
2163 2045
2164} 2046}
2165 2047
2048static void
2049__xfs_iunpin_wait(
2050 struct xfs_inode *ip)
2051{
2052 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2053 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2054
2055 xfs_iunpin(ip);
2056
2057 do {
2058 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2059 if (xfs_ipincount(ip))
2060 io_schedule();
2061 } while (xfs_ipincount(ip));
2062 finish_wait(wq, &wait.wait);
2063}
2064
2166void 2065void
2167xfs_iunpin_wait( 2066xfs_iunpin_wait(
2168 struct xfs_inode *ip) 2067 struct xfs_inode *ip)
2169{ 2068{
2170 if (xfs_ipincount(ip)) { 2069 if (xfs_ipincount(ip))
2171 xfs_iunpin_nowait(ip); 2070 __xfs_iunpin_wait(ip);
2172 wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
2173 }
2174} 2071}
2175 2072
2176/* 2073/*
@@ -2510,9 +2407,9 @@ xfs_iflush(
2510 XFS_STATS_INC(xs_iflush_count); 2407 XFS_STATS_INC(xs_iflush_count);
2511 2408
2512 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2409 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2513 ASSERT(!completion_done(&ip->i_flush)); 2410 ASSERT(xfs_isiflocked(ip));
2514 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2411 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2515 ip->i_d.di_nextents > ip->i_df.if_ext_max); 2412 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2516 2413
2517 iip = ip->i_itemp; 2414 iip = ip->i_itemp;
2518 mp = ip->i_mount; 2415 mp = ip->i_mount;
@@ -2529,7 +2426,7 @@ xfs_iflush(
2529 * out for us if they occur after the log force completes. 2426 * out for us if they occur after the log force completes.
2530 */ 2427 */
2531 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { 2428 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2532 xfs_iunpin_nowait(ip); 2429 xfs_iunpin(ip);
2533 xfs_ifunlock(ip); 2430 xfs_ifunlock(ip);
2534 return EAGAIN; 2431 return EAGAIN;
2535 } 2432 }
@@ -2626,9 +2523,9 @@ xfs_iflush_int(
2626#endif 2523#endif
2627 2524
2628 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2525 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2629 ASSERT(!completion_done(&ip->i_flush)); 2526 ASSERT(xfs_isiflocked(ip));
2630 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2527 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2631 ip->i_d.di_nextents > ip->i_df.if_ext_max); 2528 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2632 2529
2633 iip = ip->i_itemp; 2530 iip = ip->i_itemp;
2634 mp = ip->i_mount; 2531 mp = ip->i_mount;
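
xfs_iunpin_wait() keeps its fast path for unpinned inodes and only enters the new bit-waitqueue slow path while the pin count is non-zero. A rough two-thread model using a C11 atomic counter; the busy-wait below stands in for sleeping on __XFS_IPINNED_BIT, and all function names are invented for the sketch:

/*
 * Toy model of waiting for an inode pin count to reach zero: a "log
 * completion" thread drops the pins while the main thread waits.
 */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>
#include <sched.h>

static atomic_int pincount;

static void *log_completion(void *arg)
{
	(void)arg;
	/* pretend two log I/O completions each drop a pin reference */
	atomic_fetch_sub(&pincount, 1);
	atomic_fetch_sub(&pincount, 1);
	return NULL;
}

static void iunpin_wait(void)
{
	if (atomic_load(&pincount) == 0)
		return;				/* fast path: not pinned at all */
	while (atomic_load(&pincount) != 0)	/* kernel sleeps instead of spinning */
		sched_yield();
}

int main(void)
{
	pthread_t t;

	atomic_store(&pincount, 2);
	pthread_create(&t, NULL, log_completion, NULL);
	iunpin_wait();
	pthread_join(t, NULL);
	printf("pin count reached zero, inode can be flushed\n");
	return 0;
}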
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f0e6b151ba37..2f27b7454085 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -66,7 +66,6 @@ typedef struct xfs_ifork {
66 struct xfs_btree_block *if_broot; /* file's incore btree root */ 66 struct xfs_btree_block *if_broot; /* file's incore btree root */
67 short if_broot_bytes; /* bytes allocated for root */ 67 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */ 68 unsigned char if_flags; /* per-fork flags */
69 unsigned char if_ext_max; /* max # of extent records */
70 union { 69 union {
71 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ 70 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
72 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ 71 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
@@ -206,12 +205,12 @@ typedef struct xfs_icdinode {
206 ((w) == XFS_DATA_FORK ? \ 205 ((w) == XFS_DATA_FORK ? \
207 ((ip)->i_d.di_nextents = (n)) : \ 206 ((ip)->i_d.di_nextents = (n)) : \
208 ((ip)->i_d.di_anextents = (n))) 207 ((ip)->i_d.di_anextents = (n)))
209 208#define XFS_IFORK_MAXEXT(ip, w) \
209 (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
210 210
211 211
212#ifdef __KERNEL__ 212#ifdef __KERNEL__
213 213
214struct bhv_desc;
215struct xfs_buf; 214struct xfs_buf;
216struct xfs_bmap_free; 215struct xfs_bmap_free;
217struct xfs_bmbt_irec; 216struct xfs_bmbt_irec;
@@ -220,12 +219,6 @@ struct xfs_mount;
220struct xfs_trans; 219struct xfs_trans;
221struct xfs_dquot; 220struct xfs_dquot;
222 221
223typedef struct dm_attrs_s {
224 __uint32_t da_dmevmask; /* DMIG event mask */
225 __uint16_t da_dmstate; /* DMIG state info */
226 __uint16_t da_pad; /* DMIG extra padding */
227} dm_attrs_t;
228
229typedef struct xfs_inode { 222typedef struct xfs_inode {
230 /* Inode linking and identification information. */ 223 /* Inode linking and identification information. */
231 struct xfs_mount *i_mount; /* fs mount struct ptr */ 224 struct xfs_mount *i_mount; /* fs mount struct ptr */
@@ -244,27 +237,19 @@ typedef struct xfs_inode {
244 struct xfs_inode_log_item *i_itemp; /* logging information */ 237 struct xfs_inode_log_item *i_itemp; /* logging information */
245 mrlock_t i_lock; /* inode lock */ 238 mrlock_t i_lock; /* inode lock */
246 mrlock_t i_iolock; /* inode IO lock */ 239 mrlock_t i_iolock; /* inode IO lock */
247 struct completion i_flush; /* inode flush completion q */
248 atomic_t i_pincount; /* inode pin count */ 240 atomic_t i_pincount; /* inode pin count */
249 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
250 spinlock_t i_flags_lock; /* inode i_flags lock */ 241 spinlock_t i_flags_lock; /* inode i_flags lock */
251 /* Miscellaneous state. */ 242 /* Miscellaneous state. */
252 unsigned short i_flags; /* see defined flags below */ 243 unsigned long i_flags; /* see defined flags below */
253 unsigned char i_update_core; /* timestamps/size is dirty */ 244 unsigned char i_update_core; /* timestamps/size is dirty */
254 unsigned int i_delayed_blks; /* count of delay alloc blks */ 245 unsigned int i_delayed_blks; /* count of delay alloc blks */
255 246
256 xfs_icdinode_t i_d; /* most of ondisk inode */ 247 xfs_icdinode_t i_d; /* most of ondisk inode */
257 248
258 xfs_fsize_t i_size; /* in-memory size */
259 xfs_fsize_t i_new_size; /* size when write completes */
260
261 /* VFS inode */ 249 /* VFS inode */
262 struct inode i_vnode; /* embedded VFS inode */ 250 struct inode i_vnode; /* embedded VFS inode */
263} xfs_inode_t; 251} xfs_inode_t;
264 252
265#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \
266 (ip)->i_size : (ip)->i_d.di_size;
267
268/* Convert from vfs inode to xfs inode */ 253/* Convert from vfs inode to xfs inode */
269static inline struct xfs_inode *XFS_I(struct inode *inode) 254static inline struct xfs_inode *XFS_I(struct inode *inode)
270{ 255{
@@ -278,6 +263,18 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
278} 263}
279 264
280/* 265/*
266 * For regular files we only update the on-disk filesize when actually
267 * writing data back to disk. Until then only the copy in the VFS inode
268 * is uptodate.
269 */
270static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
271{
272 if (S_ISREG(ip->i_d.di_mode))
273 return i_size_read(VFS_I(ip));
274 return ip->i_d.di_size;
275}
276
277/*
281 * i_flags helper functions 278 * i_flags helper functions
282 */ 279 */
283static inline void 280static inline void
@@ -331,6 +328,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
331 return ret; 328 return ret;
332} 329}
333 330
331static inline int
332xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
333{
334 int ret;
335
336 spin_lock(&ip->i_flags_lock);
337 ret = ip->i_flags & flags;
338 if (!ret)
339 ip->i_flags |= flags;
340 spin_unlock(&ip->i_flags_lock);
341 return ret;
342}
343
334/* 344/*
335 * Project quota id helpers (previously projid was 16bit only 345 * Project quota id helpers (previously projid was 16bit only
336 * and using two 16bit values to hold new 32bit projid was chosen 346 * and using two 16bit values to hold new 32bit projid was chosen
@@ -351,35 +361,19 @@ xfs_set_projid(struct xfs_inode *ip,
351} 361}
352 362
353/* 363/*
354 * Manage the i_flush queue embedded in the inode. This completion
355 * queue synchronizes processes attempting to flush the in-core
356 * inode back to disk.
357 */
358static inline void xfs_iflock(xfs_inode_t *ip)
359{
360 wait_for_completion(&ip->i_flush);
361}
362
363static inline int xfs_iflock_nowait(xfs_inode_t *ip)
364{
365 return try_wait_for_completion(&ip->i_flush);
366}
367
368static inline void xfs_ifunlock(xfs_inode_t *ip)
369{
370 complete(&ip->i_flush);
371}
372
373/*
374 * In-core inode flags. 364 * In-core inode flags.
375 */ 365 */
376#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ 366#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
377#define XFS_ISTALE 0x0002 /* inode has been staled */ 367#define XFS_ISTALE (1 << 1) /* inode has been staled */
378#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 368#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
379#define XFS_INEW 0x0008 /* inode has just been allocated */ 369#define XFS_INEW (1 << 3) /* inode has just been allocated */
380#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 370#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */
381#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 371#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
382#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ 372#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
373#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
374#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT)
375#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
376#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
383 377
384/* 378/*
385 * Per-lifetime flags need to be reset when re-using a reclaimable inode during 379 * Per-lifetime flags need to be reset when re-using a reclaimable inode during
@@ -392,6 +386,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
392 XFS_IFILESTREAM); 386 XFS_IFILESTREAM);
393 387
394/* 388/*
389 * Synchronize processes attempting to flush the in-core inode back to disk.
390 */
391
392extern void __xfs_iflock(struct xfs_inode *ip);
393
394static inline int xfs_iflock_nowait(struct xfs_inode *ip)
395{
396 return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
397}
398
399static inline void xfs_iflock(struct xfs_inode *ip)
400{
401 if (!xfs_iflock_nowait(ip))
402 __xfs_iflock(ip);
403}
404
405static inline void xfs_ifunlock(struct xfs_inode *ip)
406{
407 xfs_iflags_clear(ip, XFS_IFLOCK);
408 wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
409}
410
411static inline int xfs_isiflocked(struct xfs_inode *ip)
412{
413 return xfs_iflags_test(ip, XFS_IFLOCK);
414}
415
416/*
395 * Flags for inode locking. 417 * Flags for inode locking.
396 * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) 418 * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
397 * 1<<16 - 1<<32-1 -- lockdep annotation (integers) 419 * 1<<16 - 1<<32-1 -- lockdep annotation (integers)
@@ -491,8 +513,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
491 struct xfs_bmap_free *); 513 struct xfs_bmap_free *);
492int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, 514int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
493 int, xfs_fsize_t); 515 int, xfs_fsize_t);
494int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *,
495 xfs_fsize_t);
496int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 516int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
497 517
498void xfs_iext_realloc(xfs_inode_t *, int, int); 518void xfs_iext_realloc(xfs_inode_t *, int, int);
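
The header now defines the i_flags bits as (1 << n) so the raw bit numbers (__XFS_IFLOCK_BIT, __XFS_IPINNED_BIT) can be handed straight to the bit-waitqueue API, and it adds xfs_iflags_test_and_set() to back the flush-lock fast path. Standalone demo of shift-defined masks and the test-and-set semantics; note that the real helper runs under i_flags_lock, which is omitted here, and the names are demo placeholders:

/*
 * Flag bits defined by shift so the same number serves as both a mask and
 * a wait-bit index, plus a test-and-set helper over the flag word.
 */
#include <stdio.h>

#define __DEMO_IFLOCK_BIT	7
#define DEMO_IFLOCK		(1 << __DEMO_IFLOCK_BIT)

static unsigned long demo_flags;

/* returns nonzero if the bits were already set, setting them otherwise */
static int flags_test_and_set(unsigned long mask)
{
	int was_set = (demo_flags & mask) != 0;

	if (!was_set)
		demo_flags |= mask;
	return was_set;
}

int main(void)
{
	printf("first take of IFLOCK:  %d\n", flags_test_and_set(DEMO_IFLOCK));	/* 0 */
	printf("second take of IFLOCK: %d\n", flags_test_and_set(DEMO_IFLOCK));	/* 1 */
	printf("bit %d backs mask 0x%x\n", __DEMO_IFLOCK_BIT, DEMO_IFLOCK);
	return 0;
}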
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cfd6c7f8cc3c..91d71dcd4852 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -79,8 +79,6 @@ xfs_inode_item_size(
79 break; 79 break;
80 80
81 case XFS_DINODE_FMT_BTREE: 81 case XFS_DINODE_FMT_BTREE:
82 ASSERT(ip->i_df.if_ext_max ==
83 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
84 iip->ili_format.ilf_fields &= 82 iip->ili_format.ilf_fields &=
85 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | 83 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
86 XFS_ILOG_DEV | XFS_ILOG_UUID); 84 XFS_ILOG_DEV | XFS_ILOG_UUID);
@@ -557,7 +555,7 @@ xfs_inode_item_unpin(
557 trace_xfs_inode_unpin(ip, _RET_IP_); 555 trace_xfs_inode_unpin(ip, _RET_IP_);
558 ASSERT(atomic_read(&ip->i_pincount) > 0); 556 ASSERT(atomic_read(&ip->i_pincount) > 0);
559 if (atomic_dec_and_test(&ip->i_pincount)) 557 if (atomic_dec_and_test(&ip->i_pincount))
560 wake_up(&ip->i_ipin_wait); 558 wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
561} 559}
562 560
563/* 561/*
@@ -719,7 +717,7 @@ xfs_inode_item_pushbuf(
719 * If a flush is not in progress anymore, chances are that the 717 * If a flush is not in progress anymore, chances are that the
720 * inode was taken off the AIL. So, just get out. 718 * inode was taken off the AIL. So, just get out.
721 */ 719 */
722 if (completion_done(&ip->i_flush) || 720 if (!xfs_isiflocked(ip) ||
723 !(lip->li_flags & XFS_LI_IN_AIL)) { 721 !(lip->li_flags & XFS_LI_IN_AIL)) {
724 xfs_iunlock(ip, XFS_ILOCK_SHARED); 722 xfs_iunlock(ip, XFS_ILOCK_SHARED);
725 return true; 723 return true;
@@ -752,7 +750,7 @@ xfs_inode_item_push(
752 struct xfs_inode *ip = iip->ili_inode; 750 struct xfs_inode *ip = iip->ili_inode;
753 751
754 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 752 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
755 ASSERT(!completion_done(&ip->i_flush)); 753 ASSERT(xfs_isiflocked(ip));
756 754
757 /* 755 /*
758 * Since we were able to lock the inode's flush lock and 756 * Since we were able to lock the inode's flush lock and
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 9afa282aa937..246c7d57c6f9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb(
57 xfs_fileoff_t *last_fsb) 57 xfs_fileoff_t *last_fsb)
58{ 58{
59 xfs_fileoff_t new_last_fsb = 0; 59 xfs_fileoff_t new_last_fsb = 0;
60 xfs_extlen_t align; 60 xfs_extlen_t align = 0;
61 int eof, error; 61 int eof, error;
62 62
63 if (XFS_IS_REALTIME_INODE(ip)) 63 if (!XFS_IS_REALTIME_INODE(ip)) {
64 ; 64 /*
65 /* 65 * Round up the allocation request to a stripe unit
66 * If mounted with the "-o swalloc" option, roundup the allocation 66 * (m_dalign) boundary if the file size is >= stripe unit
67 * request to a stripe width boundary if the file size is >= 67 * size, and we are allocating past the allocation eof.
68 * stripe width and we are allocating past the allocation eof. 68 *
69 */ 69 * If mounted with the "-o swalloc" option the alignment is
70 else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && 70 * increased from the strip unit size to the stripe width.
71 (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) 71 */
72 new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); 72 if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
73 /* 73 align = mp->m_swidth;
74 * Roundup the allocation request to a stripe unit (m_dalign) boundary 74 else if (mp->m_dalign)
75 * if the file size is >= stripe unit size, and we are allocating past 75 align = mp->m_dalign;
76 * the allocation eof. 76
77 */ 77 if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align))
78 else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) 78 new_last_fsb = roundup_64(*last_fsb, align);
79 new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); 79 }
80 80
81 /* 81 /*
82 * Always round up the allocation request to an extent boundary 82 * Always round up the allocation request to an extent boundary
@@ -154,7 +154,7 @@ xfs_iomap_write_direct(
154 154
155 offset_fsb = XFS_B_TO_FSBT(mp, offset); 155 offset_fsb = XFS_B_TO_FSBT(mp, offset);
156 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 156 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
157 if ((offset + count) > ip->i_size) { 157 if ((offset + count) > XFS_ISIZE(ip)) {
158 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); 158 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
159 if (error) 159 if (error)
160 goto error_out; 160 goto error_out;
@@ -211,7 +211,7 @@ xfs_iomap_write_direct(
211 xfs_trans_ijoin(tp, ip, 0); 211 xfs_trans_ijoin(tp, ip, 0);
212 212
213 bmapi_flag = 0; 213 bmapi_flag = 0;
214 if (offset < ip->i_size || extsz) 214 if (offset < XFS_ISIZE(ip) || extsz)
215 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
216 216
217 /* 217 /*
@@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate(
286 int found_delalloc = 0; 286 int found_delalloc = 0;
287 287
288 *prealloc = 0; 288 *prealloc = 0;
289 if ((offset + count) <= ip->i_size) 289 if (offset + count <= XFS_ISIZE(ip))
290 return 0; 290 return 0;
291 291
292 /* 292 /*
@@ -340,7 +340,7 @@ xfs_iomap_prealloc_size(
340 * if we pass in alloc_blocks = 0. Hence the "+ 1" to 340 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
341 * ensure we always pass in a non-zero value. 341 * ensure we always pass in a non-zero value.
342 */ 342 */
343 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; 343 alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
344 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, 344 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
345 rounddown_pow_of_two(alloc_blocks)); 345 rounddown_pow_of_two(alloc_blocks));
346 346
@@ -564,7 +564,7 @@ xfs_iomap_write_allocate(
564 * back.... 564 * back....
565 */ 565 */
566 nimaps = 1; 566 nimaps = 1;
567 end_fsb = XFS_B_TO_FSB(mp, ip->i_size); 567 end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
568 error = xfs_bmap_last_offset(NULL, ip, &last_block, 568 error = xfs_bmap_last_offset(NULL, ip, &last_block,
569 XFS_DATA_FORK); 569 XFS_DATA_FORK);
570 if (error) 570 if (error)
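
The rewritten xfs_iomap_eof_align_last_fsb() collapses the old if/else chain into one alignment choice, stripe width when mounted with -o swalloc, otherwise the stripe unit, and then rounds the requested last block up when the file is at least that large. Self-contained sketch of that selection and round-up, working purely in blocks rather than the byte conversion the kernel performs, with invented helper names:

/*
 * Pick an alignment (stripe width with swalloc, else stripe unit) and round
 * the last requested block up to it if the file is already that large.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static uint64_t roundup_u64(uint64_t x, uint64_t y)
{
	return ((x + y - 1) / y) * y;
}

static uint64_t align_last_fsb(uint64_t last_fsb, uint64_t isize_blocks,
			       uint64_t dalign, uint64_t swidth, bool swalloc)
{
	uint64_t align = 0;

	if (swidth && swalloc)
		align = swidth;		/* -o swalloc: use the stripe width */
	else if (dalign)
		align = dalign;		/* otherwise the stripe unit */

	if (align && isize_blocks >= align)
		return roundup_u64(last_fsb, align);
	return last_fsb;
}

int main(void)
{
	/* stripe unit 16 blocks, width 64; file already 100 blocks long */
	printf("default: %llu\n",
	       (unsigned long long)align_last_fsb(70, 100, 16, 64, false));	/* 80 */
	printf("swalloc: %llu\n",
	       (unsigned long long)align_last_fsb(70, 100, 16, 64, true));	/* 128 */
	return 0;
}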
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f9babd179223..ab302539e5b9 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -750,6 +750,7 @@ xfs_setattr_size(
750 struct xfs_mount *mp = ip->i_mount; 750 struct xfs_mount *mp = ip->i_mount;
751 struct inode *inode = VFS_I(ip); 751 struct inode *inode = VFS_I(ip);
752 int mask = iattr->ia_valid; 752 int mask = iattr->ia_valid;
753 xfs_off_t oldsize, newsize;
753 struct xfs_trans *tp; 754 struct xfs_trans *tp;
754 int error; 755 int error;
755 uint lock_flags; 756 uint lock_flags;
@@ -777,11 +778,13 @@ xfs_setattr_size(
777 lock_flags |= XFS_IOLOCK_EXCL; 778 lock_flags |= XFS_IOLOCK_EXCL;
778 xfs_ilock(ip, lock_flags); 779 xfs_ilock(ip, lock_flags);
779 780
781 oldsize = inode->i_size;
782 newsize = iattr->ia_size;
783
780 /* 784 /*
781 * Short circuit the truncate case for zero length files. 785 * Short circuit the truncate case for zero length files.
782 */ 786 */
783 if (iattr->ia_size == 0 && 787 if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
784 ip->i_size == 0 && ip->i_d.di_nextents == 0) {
785 if (!(mask & (ATTR_CTIME|ATTR_MTIME))) 788 if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
786 goto out_unlock; 789 goto out_unlock;
787 790
@@ -807,14 +810,14 @@ xfs_setattr_size(
807 * the inode to the transaction, because the inode cannot be unlocked 810 * the inode to the transaction, because the inode cannot be unlocked
808 * once it is a part of the transaction. 811 * once it is a part of the transaction.
809 */ 812 */
810 if (iattr->ia_size > ip->i_size) { 813 if (newsize > oldsize) {
811 /* 814 /*
812 * Do the first part of growing a file: zero any data in the 815 * Do the first part of growing a file: zero any data in the
813 * last block that is beyond the old EOF. We need to do this 816 * last block that is beyond the old EOF. We need to do this
814 * before the inode is joined to the transaction to modify 817 * before the inode is joined to the transaction to modify
815 * i_size. 818 * i_size.
816 */ 819 */
817 error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); 820 error = xfs_zero_eof(ip, newsize, oldsize);
818 if (error) 821 if (error)
819 goto out_unlock; 822 goto out_unlock;
820 } 823 }
@@ -833,8 +836,8 @@ xfs_setattr_size(
833 * here and prevents waiting for other data not within the range we 836 * here and prevents waiting for other data not within the range we
834 * care about here. 837 * care about here.
835 */ 838 */
836 if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { 839 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
837 error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, 840 error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
838 FI_NONE); 841 FI_NONE);
839 if (error) 842 if (error)
840 goto out_unlock; 843 goto out_unlock;
@@ -845,8 +848,7 @@ xfs_setattr_size(
845 */ 848 */
846 inode_dio_wait(inode); 849 inode_dio_wait(inode);
847 850
848 error = -block_truncate_page(inode->i_mapping, iattr->ia_size, 851 error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
849 xfs_get_blocks);
850 if (error) 852 if (error)
851 goto out_unlock; 853 goto out_unlock;
852 854
@@ -857,7 +859,7 @@ xfs_setattr_size(
857 if (error) 859 if (error)
858 goto out_trans_cancel; 860 goto out_trans_cancel;
859 861
860 truncate_setsize(inode, iattr->ia_size); 862 truncate_setsize(inode, newsize);
861 863
862 commit_flags = XFS_TRANS_RELEASE_LOG_RES; 864 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
863 lock_flags |= XFS_ILOCK_EXCL; 865 lock_flags |= XFS_ILOCK_EXCL;
@@ -876,19 +878,29 @@ xfs_setattr_size(
876 * these flags set. For all other operations the VFS set these flags 878 * these flags set. For all other operations the VFS set these flags
877 * explicitly if it wants a timestamp update. 879 * explicitly if it wants a timestamp update.
878 */ 880 */
879 if (iattr->ia_size != ip->i_size && 881 if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
880 (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
881 iattr->ia_ctime = iattr->ia_mtime = 882 iattr->ia_ctime = iattr->ia_mtime =
882 current_fs_time(inode->i_sb); 883 current_fs_time(inode->i_sb);
883 mask |= ATTR_CTIME | ATTR_MTIME; 884 mask |= ATTR_CTIME | ATTR_MTIME;
884 } 885 }
885 886
886 if (iattr->ia_size > ip->i_size) { 887 /*
887 ip->i_d.di_size = iattr->ia_size; 888 * The first thing we do is set the size to new_size permanently on
888 ip->i_size = iattr->ia_size; 889 * disk. This way we don't have to worry about anyone ever being able
889 } else if (iattr->ia_size <= ip->i_size || 890 * to look at the data being freed even in the face of a crash.
890 (iattr->ia_size == 0 && ip->i_d.di_nextents)) { 891 * What we're getting around here is the case where we free a block, it
891 error = xfs_itruncate_data(&tp, ip, iattr->ia_size); 892 * is allocated to another file, it is written to, and then we crash.
893 * If the new data gets written to the file but the log buffers
894 * containing the free and reallocation don't, then we'd end up with
895 * garbage in the blocks being freed. As long as we make the new size
896 * permanent before actually freeing any blocks it doesn't matter if
897 * they get written to.
898 */
899 ip->i_d.di_size = newsize;
900 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
901
902 if (newsize <= oldsize) {
903 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
892 if (error) 904 if (error)
893 goto out_trans_abort; 905 goto out_trans_abort;
894 906
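
With xfs_itruncate_data() removed, xfs_setattr_size() open-codes the same ordering the helper used to enforce: make the new on-disk size permanent in the log first, then free the extents, so a crash between the two steps can never expose freed blocks as file data. Order-of-operations sketch only, with stub functions standing in for the transaction and extent code:

/*
 * Ordering sketch for a shrinking truncate: (1) record the new size,
 * (2) only then free blocks beyond it.  All functions here are stubs.
 */
#include <stdio.h>

static long long di_size = 1 << 20;	/* on-disk size: 1 MiB */

static void log_inode_core(void)
{
	printf("log: di_size = %lld\n", di_size);
}

static int truncate_extents(long long new_size)
{
	printf("free blocks beyond %lld\n", new_size);
	return 0;
}

static int setattr_size(long long newsize, long long oldsize)
{
	/* 1) pin the new size in the log first */
	di_size = newsize;
	log_inode_core();

	/* 2) only then free the underlying extents, and only on shrink */
	if (newsize <= oldsize)
		return truncate_extents(newsize);
	return 0;
}

int main(void)
{
	return setattr_size(4096, 1 << 20);
}

The quota-file truncation in xfs_qm_scall_trunc_qfile(), changed just below, follows the same two-step pattern.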
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 5cc3dde1bc90..eafbcff81f3a 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -31,6 +31,7 @@
31#include "xfs_mount.h" 31#include "xfs_mount.h"
32#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
33#include "xfs_inode.h" 33#include "xfs_inode.h"
34#include "xfs_inode_item.h"
34#include "xfs_itable.h" 35#include "xfs_itable.h"
35#include "xfs_bmap.h" 36#include "xfs_bmap.h"
36#include "xfs_rtalloc.h" 37#include "xfs_rtalloc.h"
@@ -263,13 +264,18 @@ xfs_qm_scall_trunc_qfile(
263 xfs_ilock(ip, XFS_ILOCK_EXCL); 264 xfs_ilock(ip, XFS_ILOCK_EXCL);
264 xfs_trans_ijoin(tp, ip, 0); 265 xfs_trans_ijoin(tp, ip, 0);
265 266
266 error = xfs_itruncate_data(&tp, ip, 0); 267 ip->i_d.di_size = 0;
268 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
269
270 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
267 if (error) { 271 if (error) {
268 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 272 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
269 XFS_TRANS_ABORT); 273 XFS_TRANS_ABORT);
270 goto out_unlock; 274 goto out_unlock;
271 } 275 }
272 276
277 ASSERT(ip->i_d.di_nextents == 0);
278
273 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 279 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
274 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
275 281
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 281961c1d81a..ee5b695c99a7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -828,14 +828,6 @@ xfs_fs_inode_init_once(
828 /* xfs inode */ 828 /* xfs inode */
829 atomic_set(&ip->i_pincount, 0); 829 atomic_set(&ip->i_pincount, 0);
830 spin_lock_init(&ip->i_flags_lock); 830 spin_lock_init(&ip->i_flags_lock);
831 init_waitqueue_head(&ip->i_ipin_wait);
832 /*
833 * Because we want to use a counting completion, complete
834 * the flush completion once to allow a single access to
835 * the flush completion without blocking.
836 */
837 init_completion(&ip->i_flush);
838 complete(&ip->i_flush);
839 831
840 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, 832 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
841 "xfsino", ip->i_ino); 833 "xfsino", ip->i_ino);
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 72c01a1c16e7..40b75eecd2b4 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -707,14 +707,13 @@ xfs_reclaim_inode_grab(
707 return 1; 707 return 1;
708 708
709 /* 709 /*
710 * do some unlocked checks first to avoid unnecessary lock traffic. 710 * If we are asked for non-blocking operation, do unlocked checks to
711 * The first is a flush lock check, the second is a already in reclaim 711 * see if the inode already is being flushed or in reclaim to avoid
712 * check. Only do these checks if we are not going to block on locks. 712 * lock traffic.
713 */ 713 */
714 if ((flags & SYNC_TRYLOCK) && 714 if ((flags & SYNC_TRYLOCK) &&
715 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { 715 __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
716 return 1; 716 return 1;
717 }
718 717
719 /* 718 /*
720 * The radix tree lock here protects a thread in xfs_iget from racing 719 * The radix tree lock here protects a thread in xfs_iget from racing
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a9d5b1e06efe..6b6df5802e95 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -891,7 +891,6 @@ DECLARE_EVENT_CLASS(xfs_file_class,
891 __field(dev_t, dev) 891 __field(dev_t, dev)
892 __field(xfs_ino_t, ino) 892 __field(xfs_ino_t, ino)
893 __field(xfs_fsize_t, size) 893 __field(xfs_fsize_t, size)
894 __field(xfs_fsize_t, new_size)
895 __field(loff_t, offset) 894 __field(loff_t, offset)
896 __field(size_t, count) 895 __field(size_t, count)
897 __field(int, flags) 896 __field(int, flags)
@@ -900,17 +899,15 @@ DECLARE_EVENT_CLASS(xfs_file_class,
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
 		__entry->ino = ip->i_ino;
 		__entry->size = ip->i_d.di_size;
-		__entry->new_size = ip->i_new_size;
 		__entry->offset = offset;
 		__entry->count = count;
 		__entry->flags = flags;
 	),
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
 		  "offset 0x%llx count 0x%zx ioflags %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->size,
-		  __entry->new_size,
 		  __entry->offset,
 		  __entry->count,
 		  __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
@@ -978,7 +975,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
 		__field(loff_t, size)
-		__field(loff_t, new_size)
 		__field(loff_t, offset)
 		__field(size_t, count)
 		__field(int, type)
@@ -990,7 +986,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
 		__entry->ino = ip->i_ino;
 		__entry->size = ip->i_d.di_size;
-		__entry->new_size = ip->i_new_size;
 		__entry->offset = offset;
 		__entry->count = count;
 		__entry->type = type;
@@ -998,13 +993,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
 		__entry->startblock = irec ? irec->br_startblock : 0;
 		__entry->blockcount = irec ? irec->br_blockcount : 0;
 	),
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-		  "offset 0x%llx count %zd type %s "
-		  "startoff 0x%llx startblock %lld blockcount 0x%llx",
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
+		  "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->size,
-		  __entry->new_size,
 		  __entry->offset,
 		  __entry->count,
 		  __print_symbolic(__entry->type, XFS_IO_TYPES),
@@ -1031,26 +1024,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
 		__field(xfs_ino_t, ino)
 		__field(loff_t, isize)
 		__field(loff_t, disize)
-		__field(loff_t, new_size)
 		__field(loff_t, offset)
 		__field(size_t, count)
 	),
 	TP_fast_assign(
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
 		__entry->ino = ip->i_ino;
-		__entry->isize = ip->i_size;
+		__entry->isize = VFS_I(ip)->i_size;
 		__entry->disize = ip->i_d.di_size;
-		__entry->new_size = ip->i_new_size;
 		__entry->offset = offset;
 		__entry->count = count;
 	),
-	TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx "
+	TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
 		  "offset 0x%llx count %zd",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->isize,
 		  __entry->disize,
-		  __entry->new_size,
 		  __entry->offset,
 		  __entry->count)
 );
@@ -1090,8 +1080,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class,
 DEFINE_EVENT(xfs_itrunc_class, name, \
 	TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
 	TP_ARGS(ip, new_size))
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start);
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
 
 TRACE_EVENT(xfs_pagecache_inval,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
@@ -1568,7 +1558,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
 		__field(xfs_ino_t, ino)
 		__field(int, format)
 		__field(int, nex)
-		__field(int, max_nex)
 		__field(int, broot_size)
 		__field(int, fork_off)
 	),
@@ -1578,18 +1567,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
 		__entry->ino = ip->i_ino;
 		__entry->format = ip->i_d.di_format;
 		__entry->nex = ip->i_d.di_nextents;
-		__entry->max_nex = ip->i_df.if_ext_max;
 		__entry->broot_size = ip->i_df.if_broot_bytes;
 		__entry->fork_off = XFS_IFORK_BOFF(ip);
 	),
 	TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
-		  "Max in-fork extents %d, broot size %d, fork offset %d",
+		  "broot size %d, fork offset %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
 		  __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
 		  __entry->nex,
-		  __entry->max_nex,
 		  __entry->broot_size,
 		  __entry->fork_off)
 )
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f2fea868d4db..0cf52da9d246 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -175,7 +175,7 @@ xfs_free_eofblocks(
 	 * Figure out if there are any blocks beyond the end
 	 * of the file. If not, then there is nothing to do.
 	 */
-	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
 	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
 	if (last_fsb <= end_fsb)
 		return 0;
@@ -226,7 +226,14 @@ xfs_free_eofblocks(
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 		xfs_trans_ijoin(tp, ip, 0);
 
-		error = xfs_itruncate_data(&tp, ip, ip->i_size);
+		/*
+		 * Do not update the on-disk file size. If we update the
+		 * on-disk file size and then the system crashes before the
+		 * contents of the file are flushed to disk then the files
+		 * may be full of holes (ie NULL files bug).
+		 */
+		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
+					      XFS_ISIZE(ip));
 		if (error) {
 			/*
 			 * If we get an error at this point we simply don't
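
The new comment spells out the constraint: xfs_free_eofblocks() only trims speculative allocation beyond the in-memory EOF, so the truncate call must not move the on-disk size, otherwise a crash before the file data reaches disk could leave files full of holes. A toy user-space sketch of that invariant follows; the types and helpers are invented stand-ins, not the real XFS ones.

#include <assert.h>

/* Toy model only: trimming blocks beyond EOF leaves the recorded size alone. */
struct toy_inode {
	long long disk_size;	/* size recorded on disk (think di_size) */
	long long mem_size;	/* in-core size the application sees */
	int	  nblocks;	/* allocated 4KiB blocks, incl. beyond EOF */
};

/* drop allocations past 'from' without touching disk_size */
static void toy_truncate_extents(struct toy_inode *ip, long long from)
{
	int keep = (int)((from + 4095) / 4096);

	if (ip->nblocks > keep)
		ip->nblocks = keep;
}

int main(void)
{
	/* one block of data plus speculative preallocation beyond EOF */
	struct toy_inode ip = { .disk_size = 4096, .mem_size = 4096, .nblocks = 8 };

	toy_truncate_extents(&ip, ip.mem_size);

	assert(ip.nblocks == 1);	/* post-EOF blocks are gone */
	assert(ip.disk_size == 4096);	/* the recorded size never moved */
	return 0;
}
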
@@ -540,8 +547,8 @@ xfs_release(
 		return 0;
 
 	if ((S_ISREG(ip->i_d.di_mode) &&
-	     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-	       ip->i_delayed_blks > 0)) &&
+	     (VFS_I(ip)->i_size > 0 ||
+	      (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
 	     (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
 	    (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
 
@@ -618,7 +625,7 @@ xfs_inactive(
 	 * only one with a reference to the inode.
 	 */
 	truncate = ((ip->i_d.di_nlink == 0) &&
-	    ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
+	    ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 ||
 	     (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
 	    S_ISREG(ip->i_d.di_mode));
 
@@ -632,12 +639,12 @@ xfs_inactive(
 
 	if (ip->i_d.di_nlink != 0) {
 		if ((S_ISREG(ip->i_d.di_mode) &&
-		     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-		       ip->i_delayed_blks > 0)) &&
+		     (VFS_I(ip)->i_size > 0 ||
+		      (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
 		     (ip->i_df.if_flags & XFS_IFEXTENTS) &&
 		     (!(ip->i_d.di_flags &
 				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
-		      (ip->i_delayed_blks != 0)))) {
+		      ip->i_delayed_blks != 0))) {
 			error = xfs_free_eofblocks(mp, ip, 0);
 			if (error)
 				return VN_INACTIVE_CACHE;
@@ -670,13 +677,18 @@ xfs_inactive(
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 		xfs_trans_ijoin(tp, ip, 0);
 
-		error = xfs_itruncate_data(&tp, ip, 0);
+		ip->i_d.di_size = 0;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
 		if (error) {
 			xfs_trans_cancel(tp,
 				XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
 			xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 			return VN_INACTIVE_CACHE;
 		}
+
+		ASSERT(ip->i_d.di_nextents == 0);
 	} else if (S_ISLNK(ip->i_d.di_mode)) {
 
 		/*
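
The xfs_inactive() path above takes the opposite approach to xfs_free_eofblocks(): the inode is being torn down, so di_size is zeroed and logged in the same transaction before xfs_itruncate_extents() drops every extent, and the added ASSERT documents that nothing may remain afterwards. A toy sketch of that ordering, again with invented stand-ins for the transaction and inode rather than real XFS code:

#include <assert.h>
#include <stdio.h>

/*
 * Toy illustration: the size update is recorded in the transaction
 * before the extents are freed, so a replay never produces a nonzero
 * size that points at blocks which were already given back.
 */
struct toy_trans {
	long long logged_size;	/* value that would be replayed */
	int	  logged;
};

struct toy_inode {
	long long di_size;
	int	  di_nextents;
};

static void toy_log_size(struct toy_trans *tp, const struct toy_inode *ip)
{
	tp->logged_size = ip->di_size;
	tp->logged = 1;
}

static void toy_free_all_extents(struct toy_inode *ip)
{
	ip->di_nextents = 0;
}

int main(void)
{
	struct toy_inode ip = { .di_size = 8192, .di_nextents = 2 };
	struct toy_trans tp = { 0 };

	/* mirrors the hunk above: zero the size, log it, then truncate */
	ip.di_size = 0;
	toy_log_size(&tp, &ip);
	toy_free_all_extents(&ip);

	assert(ip.di_nextents == 0);		/* the new ASSERT */
	assert(tp.logged && tp.logged_size == 0);
	printf("replayed size would be %lld\n", tp.logged_size);
	return 0;
}
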
@@ -1961,11 +1973,11 @@ xfs_zero_remaining_bytes(
 		 * since nothing can read beyond eof. The space will
 		 * be zeroed when the file is extended anyway.
 		 */
-		if (startoff >= ip->i_size)
+		if (startoff >= XFS_ISIZE(ip))
 			return 0;
 
-		if (endoff > ip->i_size)
-			endoff = ip->i_size;
+		if (endoff > XFS_ISIZE(ip))
+			endoff = XFS_ISIZE(ip);
 
 		bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -2260,7 +2272,7 @@ xfs_change_file_space(
 		bf->l_start += offset;
 		break;
 	case 2: /*SEEK_END*/
-		bf->l_start += ip->i_size;
+		bf->l_start += XFS_ISIZE(ip);
 		break;
 	default:
 		return XFS_ERROR(EINVAL);
@@ -2277,7 +2289,7 @@ xfs_change_file_space(
 	bf->l_whence = 0;
 
 	startoffset = bf->l_start;
-	fsize = ip->i_size;
+	fsize = XFS_ISIZE(ip);
 
 	/*
 	 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
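
Across these xfs_vnodeops.c hunks every open-coded read of ip->i_size becomes XFS_ISIZE(ip) (or a direct VFS_I(ip)->i_size read), consistent with the new_size bookkeeping removed from the trace events above. As a rough sketch of what such a helper plausibly does, here is a user-space approximation with toy structures; the real definition lives in the XFS headers and may differ in detail.

#include <assert.h>
#include <sys/stat.h>

/*
 * Sketch of an XFS_ISIZE()-style helper using toy stand-ins for the
 * kernel structures: for regular files the authoritative in-core size
 * is the VFS inode's i_size; for everything else the on-disk di_size
 * is used.  Illustration only, not the kernel definition.
 */
struct toy_vfs_inode  { long long i_size; };
struct toy_disk_inode { unsigned int di_mode; long long di_size; };
struct toy_xfs_inode {
	struct toy_vfs_inode  vnode;
	struct toy_disk_inode i_d;
};

static long long toy_isize(const struct toy_xfs_inode *ip)
{
	if (S_ISREG(ip->i_d.di_mode))
		return ip->vnode.i_size;
	return ip->i_d.di_size;
}

int main(void)
{
	struct toy_xfs_inode reg = {
		.vnode = { .i_size = 4096 },
		.i_d   = { .di_mode = S_IFREG, .di_size = 0 },
	};
	struct toy_xfs_inode dir = {
		.vnode = { .i_size = 0 },
		.i_d   = { .di_mode = S_IFDIR, .di_size = 96 },
	};

	assert(toy_isize(&reg) == 4096);	/* regular file: VFS size */
	assert(toy_isize(&dir) == 96);		/* directory: on-disk size */
	return 0;
}
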