-rw-r--r--  fs/btrfs/acl.c                 4
-rw-r--r--  fs/btrfs/backref.c           495
-rw-r--r--  fs/btrfs/backref.h             3
-rw-r--r--  fs/btrfs/btrfs_inode.h        50
-rw-r--r--  fs/btrfs/check-integrity.c   584
-rw-r--r--  fs/btrfs/ctree.c             861
-rw-r--r--  fs/btrfs/ctree.h              77
-rw-r--r--  fs/btrfs/delayed-inode.c       8
-rw-r--r--  fs/btrfs/delayed-ref.c        10
-rw-r--r--  fs/btrfs/delayed-ref.h        24
-rw-r--r--  fs/btrfs/disk-io.c            57
-rw-r--r--  fs/btrfs/disk-io.h             1
-rw-r--r--  fs/btrfs/extent-tree.c        23
-rw-r--r--  fs/btrfs/extent_io.c         168
-rw-r--r--  fs/btrfs/extent_io.h           8
-rw-r--r--  fs/btrfs/file.c               76
-rw-r--r--  fs/btrfs/free-space-cache.c   45
-rw-r--r--  fs/btrfs/inode.c             264
-rw-r--r--  fs/btrfs/ioctl.c              50
-rw-r--r--  fs/btrfs/ioctl.h              33
-rw-r--r--  fs/btrfs/ordered-data.c      165
-rw-r--r--  fs/btrfs/ordered-data.h       13
-rw-r--r--  fs/btrfs/print-tree.c          3
-rw-r--r--  fs/btrfs/reada.c               5
-rw-r--r--  fs/btrfs/scrub.c              65
-rw-r--r--  fs/btrfs/super.c             117
-rw-r--r--  fs/btrfs/transaction.c        59
-rw-r--r--  fs/btrfs/tree-log.c           35
-rw-r--r--  fs/btrfs/ulist.c              38
-rw-r--r--  fs/btrfs/ulist.h              15
-rw-r--r--  fs/btrfs/volumes.c           306
-rw-r--r--  fs/btrfs/volumes.h            52
-rw-r--r--  fs/btrfs/xattr.c               1
33 files changed, 2849 insertions, 866 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 89b156d85d63..761e2cd8fed1 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
 		if (ret > 0) {
 			/* we need an acl */
 			ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
+		} else {
+			cache_no_acl(inode);
 		}
+	} else {
+		cache_no_acl(inode);
 	}
 failed:
 	posix_acl_release(acl);
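
Both new else branches call cache_no_acl(), so an inode that ends up without an ACL is remembered as such instead of being left in the "not yet cached" state, which would force a disk xattr lookup on the next permission check. A rough sketch of what that VFS helper does, simplified for illustration and not the verbatim implementation:

	/* sketch: mark both cached ACL slots as "known empty" */
	static inline void cache_no_acl_sketch(struct inode *inode)
	{
		inode->i_acl = NULL;		/* NULL = no ACL; ACL_NOT_CACHED = unknown */
		inode->i_default_acl = NULL;
	}
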
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index bcec06750232..3f75895c919b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -24,22 +24,135 @@
 #include "delayed-ref.h"
 #include "locking.h"
 
+struct extent_inode_elem {
+	u64 inum;
+	u64 offset;
+	struct extent_inode_elem *next;
+};
+
+static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
+				struct btrfs_file_extent_item *fi,
+				u64 extent_item_pos,
+				struct extent_inode_elem **eie)
+{
+	u64 data_offset;
+	u64 data_len;
+	struct extent_inode_elem *e;
+
+	data_offset = btrfs_file_extent_offset(eb, fi);
+	data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+	if (extent_item_pos < data_offset ||
+	    extent_item_pos >= data_offset + data_len)
+		return 1;
+
+	e = kmalloc(sizeof(*e), GFP_NOFS);
+	if (!e)
+		return -ENOMEM;
+
+	e->next = *eie;
+	e->inum = key->objectid;
+	e->offset = key->offset + (extent_item_pos - data_offset);
+	*eie = e;
+
+	return 0;
+}
+
+static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
+				u64 extent_item_pos,
+				struct extent_inode_elem **eie)
+{
+	u64 disk_byte;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	int slot;
+	int nritems;
+	int extent_type;
+	int ret;
+
+	/*
+	 * from the shared data ref, we only have the leaf but we need
+	 * the key. thus, we must look into all items and see that we
+	 * find one (some) with a reference to our extent item.
+	 */
+	nritems = btrfs_header_nritems(eb);
+	for (slot = 0; slot < nritems; ++slot) {
+		btrfs_item_key_to_cpu(eb, &key, slot);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(eb, fi);
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
+		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+		if (disk_byte != wanted_disk_byte)
+			continue;
+
+		ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
 /*
  * this structure records all encountered refs on the way up to the root
  */
 struct __prelim_ref {
 	struct list_head list;
 	u64 root_id;
-	struct btrfs_key key;
+	struct btrfs_key key_for_search;
 	int level;
 	int count;
+	struct extent_inode_elem *inode_list;
 	u64 parent;
 	u64 wanted_disk_byte;
 };
 
+/*
+ * the rules for all callers of this function are:
+ * - obtaining the parent is the goal
+ * - if you add a key, you must know that it is a correct key
+ * - if you cannot add the parent or a correct key, then we will look into the
+ *   block later to set a correct key
+ *
+ * delayed refs
+ * ============
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    -   |     -
+ *      key to resolve |    -   |     y    |    y   |     y
+ *  tree block logical |    -   |     -    |    -   |     -
+ *  root for resolving |    y   |     y    |    y   |     y
+ *
+ * - column 1:       we've the parent -> done
+ * - column 2, 3, 4: we use the key to find the parent
+ *
+ * on disk refs (inline or keyed)
+ * ==============================
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    y   |     -
+ *      key to resolve |    -   |     -    |    -   |     y
+ *  tree block logical |    y   |     y    |    y   |     y
+ *  root for resolving |    -   |     y    |    y   |     y
+ *
+ * - column 1, 3: we've the parent -> done
+ * - column 2:    we take the first key from the block to find the parent
+ *                (see __add_missing_keys)
+ * - column 4:    we use the key to find the parent
+ *
+ * additional information that's available but not required to find the parent
+ * block might help in merging entries to gain some speed.
+ */
+
 static int __add_prelim_ref(struct list_head *head, u64 root_id,
-			    struct btrfs_key *key, int level, u64 parent,
-			    u64 wanted_disk_byte, int count)
+			    struct btrfs_key *key, int level,
+			    u64 parent, u64 wanted_disk_byte, int count)
 {
 	struct __prelim_ref *ref;
 
@@ -50,10 +163,11 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 
 	ref->root_id = root_id;
 	if (key)
-		ref->key = *key;
+		ref->key_for_search = *key;
 	else
-		memset(&ref->key, 0, sizeof(ref->key));
+		memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
 
+	ref->inode_list = NULL;
 	ref->level = level;
 	ref->count = count;
 	ref->parent = parent;
@@ -64,18 +178,26 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 }
 
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
-			   struct ulist *parents,
-			   struct extent_buffer *eb, int level,
-			   u64 wanted_objectid, u64 wanted_disk_byte)
+			   struct ulist *parents, int level,
+			   struct btrfs_key *key, u64 wanted_disk_byte,
+			   const u64 *extent_item_pos)
 {
 	int ret;
-	int slot;
+	int slot = path->slots[level];
+	struct extent_buffer *eb = path->nodes[level];
 	struct btrfs_file_extent_item *fi;
-	struct btrfs_key key;
+	struct extent_inode_elem *eie = NULL;
 	u64 disk_byte;
+	u64 wanted_objectid = key->objectid;
 
 add_parent:
-	ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+	if (level == 0 && extent_item_pos) {
+		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+		ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie);
+		if (ret < 0)
+			return ret;
+	}
+	ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS);
 	if (ret < 0)
 		return ret;
 
@@ -89,6 +211,7 @@ add_parent:
 	 * repeat this until we don't find any additional EXTENT_DATA items.
 	 */
 	while (1) {
+		eie = NULL;
 		ret = btrfs_next_leaf(root, path);
 		if (ret < 0)
 			return ret;
@@ -97,9 +220,9 @@ add_parent:
 
 		eb = path->nodes[0];
 		for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
-			btrfs_item_key_to_cpu(eb, &key, slot);
-			if (key.objectid != wanted_objectid ||
-			    key.type != BTRFS_EXTENT_DATA_KEY)
+			btrfs_item_key_to_cpu(eb, key, slot);
+			if (key->objectid != wanted_objectid ||
+			    key->type != BTRFS_EXTENT_DATA_KEY)
 				return 0;
 			fi = btrfs_item_ptr(eb, slot,
 					    struct btrfs_file_extent_item);
@@ -118,8 +241,10 @@ add_parent:
  */
 static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 				  int search_commit_root,
+				  u64 time_seq,
 				  struct __prelim_ref *ref,
-				  struct ulist *parents)
+				  struct ulist *parents,
+				  const u64 *extent_item_pos)
 {
 	struct btrfs_path *path;
 	struct btrfs_root *root;
@@ -152,12 +277,13 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 		goto out;
 
 	path->lowest_level = level;
-	ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+	ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
 	pr_debug("search slot in root %llu (level %d, ref count %d) returned "
 		 "%d for key (%llu %u %llu)\n",
 		 (unsigned long long)ref->root_id, level, ref->count, ret,
-		 (unsigned long long)ref->key.objectid, ref->key.type,
-		 (unsigned long long)ref->key.offset);
+		 (unsigned long long)ref->key_for_search.objectid,
+		 ref->key_for_search.type,
+		 (unsigned long long)ref->key_for_search.offset);
 	if (ret < 0)
 		goto out;
 
@@ -179,9 +305,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
 	}
 
-	/* the last two parameters will only be used for level == 0 */
-	ret = add_all_parents(root, path, parents, eb, level, key.objectid,
-			      ref->wanted_disk_byte);
+	ret = add_all_parents(root, path, parents, level, &key,
+			      ref->wanted_disk_byte, extent_item_pos);
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -191,8 +316,9 @@ out:
  * resolve all indirect backrefs from the list
  */
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
-				   int search_commit_root,
-				   struct list_head *head)
+				   int search_commit_root, u64 time_seq,
+				   struct list_head *head,
+				   const u64 *extent_item_pos)
 {
 	int err;
 	int ret = 0;
@@ -201,6 +327,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 	struct __prelim_ref *new_ref;
 	struct ulist *parents;
 	struct ulist_node *node;
+	struct ulist_iterator uiter;
 
 	parents = ulist_alloc(GFP_NOFS);
 	if (!parents)
@@ -217,7 +344,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		if (ref->count == 0)
 			continue;
 		err = __resolve_indirect_ref(fs_info, search_commit_root,
-					     ref, parents);
+					     time_seq, ref, parents,
+					     extent_item_pos);
 		if (err) {
 			if (ret == 0)
 				ret = err;
@@ -225,11 +353,14 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		}
 
 		/* we put the first parent into the ref at hand */
-		node = ulist_next(parents, NULL);
+		ULIST_ITER_INIT(&uiter);
+		node = ulist_next(parents, &uiter);
 		ref->parent = node ? node->val : 0;
+		ref->inode_list =
+			node ? (struct extent_inode_elem *)node->aux : 0;
 
 		/* additional parents require new refs being added here */
-		while ((node = ulist_next(parents, node))) {
+		while ((node = ulist_next(parents, &uiter))) {
 			new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
 			if (!new_ref) {
 				ret = -ENOMEM;
@@ -237,6 +368,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			}
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
+			new_ref->inode_list =
+				(struct extent_inode_elem *)node->aux;
 			list_add(&new_ref->list, &ref->list);
 		}
 		ulist_reinit(parents);
@@ -246,10 +379,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
+static inline int ref_for_same_block(struct __prelim_ref *ref1,
+				     struct __prelim_ref *ref2)
+{
+	if (ref1->level != ref2->level)
+		return 0;
+	if (ref1->root_id != ref2->root_id)
+		return 0;
+	if (ref1->key_for_search.type != ref2->key_for_search.type)
+		return 0;
+	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
+		return 0;
+	if (ref1->key_for_search.offset != ref2->key_for_search.offset)
+		return 0;
+	if (ref1->parent != ref2->parent)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * read tree blocks and add keys where required.
+ */
+static int __add_missing_keys(struct btrfs_fs_info *fs_info,
+			      struct list_head *head)
+{
+	struct list_head *pos;
+	struct extent_buffer *eb;
+
+	list_for_each(pos, head) {
+		struct __prelim_ref *ref;
+		ref = list_entry(pos, struct __prelim_ref, list);
+
+		if (ref->parent)
+			continue;
+		if (ref->key_for_search.type)
+			continue;
+		BUG_ON(!ref->wanted_disk_byte);
+		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
+				     fs_info->tree_root->leafsize, 0);
+		BUG_ON(!eb);
+		btrfs_tree_read_lock(eb);
+		if (btrfs_header_level(eb) == 0)
+			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
+		else
+			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
+		btrfs_tree_read_unlock(eb);
+		free_extent_buffer(eb);
+	}
+	return 0;
+}
+
 /*
  * merge two lists of backrefs and adjust counts accordingly
  *
  * mode = 1: merge identical keys, if key is set
+ *           FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
+ *           additionally, we could even add a key range for the blocks we
+ *           looked into to merge even more (-> replace unresolved refs by those
+ *           having a parent).
 * mode = 2: merge identical parents
 */
 static int __merge_refs(struct list_head *head, int mode)
@@ -263,20 +451,21 @@ static int __merge_refs(struct list_head *head, int mode)
 
 		ref1 = list_entry(pos1, struct __prelim_ref, list);
 
-		if (mode == 1 && ref1->key.type == 0)
-			continue;
 		for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
 		     pos2 = n2, n2 = pos2->next) {
 			struct __prelim_ref *ref2;
+			struct __prelim_ref *xchg;
 
 			ref2 = list_entry(pos2, struct __prelim_ref, list);
 
 			if (mode == 1) {
-				if (memcmp(&ref1->key, &ref2->key,
-					   sizeof(ref1->key)) ||
-				    ref1->level != ref2->level ||
-				    ref1->root_id != ref2->root_id)
+				if (!ref_for_same_block(ref1, ref2))
 					continue;
+				if (!ref1->parent && ref2->parent) {
+					xchg = ref1;
+					ref1 = ref2;
+					ref2 = xchg;
+				}
 				ref1->count += ref2->count;
 			} else {
 				if (ref1->parent != ref2->parent)
@@ -296,16 +485,17 @@ static int __merge_refs(struct list_head *head, int mode)
 * smaller or equal that seq to the list
 */
 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-			      struct btrfs_key *info_key,
 			      struct list_head *prefs)
 {
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	struct rb_node *n = &head->node.rb_node;
+	struct btrfs_key key;
+	struct btrfs_key op_key = {0};
 	int sgn;
 	int ret = 0;
 
 	if (extent_op && extent_op->update_key)
-		btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
 
 	while ((n = rb_prev(n))) {
 		struct btrfs_delayed_ref_node *node;
@@ -337,7 +527,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = __add_prelim_ref(prefs, ref->root, info_key,
+			ret = __add_prelim_ref(prefs, ref->root, &op_key,
 					       ref->level + 1, 0, node->bytenr,
 					       node->ref_mod * sgn);
 			break;
@@ -346,7 +536,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = __add_prelim_ref(prefs, ref->root, info_key,
+			ret = __add_prelim_ref(prefs, ref->root, NULL,
 					       ref->level + 1, ref->parent,
 					       node->bytenr,
 					       node->ref_mod * sgn);
@@ -354,8 +544,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 		}
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_delayed_data_ref *ref;
-			struct btrfs_key key;
-
 			ref = btrfs_delayed_node_to_data_ref(node);
 
 			key.objectid = ref->objectid;
@@ -368,7 +556,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 		}
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			struct btrfs_delayed_data_ref *ref;
-			struct btrfs_key key;
 
 			ref = btrfs_delayed_node_to_data_ref(node);
 
@@ -394,8 +581,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 */
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			     struct btrfs_path *path, u64 bytenr,
-			     struct btrfs_key *info_key, int *info_level,
-			     struct list_head *prefs)
+			     int *info_level, struct list_head *prefs)
 {
 	int ret = 0;
 	int slot;
@@ -411,7 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 	 * enumerate all inline refs
 	 */
 	leaf = path->nodes[0];
-	slot = path->slots[0] - 1;
+	slot = path->slots[0];
 
 	item_size = btrfs_item_size_nr(leaf, slot);
 	BUG_ON(item_size < sizeof(*ei));
@@ -424,12 +610,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		struct btrfs_tree_block_info *info;
-		struct btrfs_disk_key disk_key;
 
 		info = (struct btrfs_tree_block_info *)ptr;
 		*info_level = btrfs_tree_block_level(leaf, info);
-		btrfs_tree_block_key(leaf, info, &disk_key);
-		btrfs_disk_key_to_cpu(info_key, &disk_key);
 		ptr += sizeof(struct btrfs_tree_block_info);
 		BUG_ON(ptr > end);
 	} else {
@@ -447,7 +630,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
 		switch (type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			ret = __add_prelim_ref(prefs, 0, info_key,
+			ret = __add_prelim_ref(prefs, 0, NULL,
 					       *info_level + 1, offset,
 					       bytenr, 1);
 			break;
@@ -462,8 +645,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			ret = __add_prelim_ref(prefs, offset, info_key,
-					       *info_level + 1, 0, bytenr, 1);
+			ret = __add_prelim_ref(prefs, offset, NULL,
+					       *info_level + 1, 0,
+					       bytenr, 1);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
@@ -477,8 +661,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
 			root = btrfs_extent_data_ref_root(leaf, dref);
-			ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
-					       count);
+			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+					       bytenr, count);
 			break;
 		}
 		default:
@@ -496,8 +680,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 */
 static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			    struct btrfs_path *path, u64 bytenr,
-			    struct btrfs_key *info_key, int info_level,
-			    struct list_head *prefs)
+			    int info_level, struct list_head *prefs)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
@@ -527,7 +710,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 
 		switch (key.type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			ret = __add_prelim_ref(prefs, 0, info_key,
+			ret = __add_prelim_ref(prefs, 0, NULL,
 					       info_level + 1, key.offset,
 					       bytenr, 1);
 			break;
@@ -543,8 +726,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			ret = __add_prelim_ref(prefs, key.offset, info_key,
-					       info_level + 1, 0, bytenr, 1);
+			ret = __add_prelim_ref(prefs, key.offset, NULL,
+					       info_level + 1, 0,
+					       bytenr, 1);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
@@ -560,7 +744,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
-					bytenr, count);
+					       bytenr, count);
 			break;
 		}
 		default:
@@ -582,11 +766,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
-			     u64 seq, struct ulist *refs, struct ulist *roots)
+			     u64 delayed_ref_seq, u64 time_seq,
+			     struct ulist *refs, struct ulist *roots,
+			     const u64 *extent_item_pos)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
-	struct btrfs_key info_key = { 0 };
 	struct btrfs_delayed_ref_root *delayed_refs = NULL;
 	struct btrfs_delayed_ref_head *head;
 	int info_level = 0;
@@ -645,7 +830,7 @@ again:
 			btrfs_put_delayed_ref(&head->node);
 			goto again;
 		}
-		ret = __add_delayed_refs(head, seq, &info_key,
+		ret = __add_delayed_refs(head, delayed_ref_seq,
 					 &prefs_delayed);
 		if (ret) {
 			spin_unlock(&delayed_refs->lock);
@@ -659,16 +844,17 @@ again:
 		struct extent_buffer *leaf;
 		int slot;
 
+		path->slots[0]--;
 		leaf = path->nodes[0];
-		slot = path->slots[0] - 1;
+		slot = path->slots[0];
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid == bytenr &&
 		    key.type == BTRFS_EXTENT_ITEM_KEY) {
 			ret = __add_inline_refs(fs_info, path, bytenr,
-						&info_key, &info_level, &prefs);
+						&info_level, &prefs);
 			if (ret)
 				goto out;
-			ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
-					       info_level, &prefs);
+			ret = __add_keyed_refs(fs_info, path, bytenr,
+					       info_level, &prefs);
 			if (ret)
 				goto out;
@@ -676,21 +862,18 @@ again:
 	}
 	btrfs_release_path(path);
 
-	/*
-	 * when adding the delayed refs above, the info_key might not have
-	 * been known yet. Go over the list and replace the missing keys
-	 */
-	list_for_each_entry(ref, &prefs_delayed, list) {
-		if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
-			memcpy(&ref->key, &info_key, sizeof(ref->key));
-	}
 	list_splice_init(&prefs_delayed, &prefs);
 
+	ret = __add_missing_keys(fs_info, &prefs);
+	if (ret)
+		goto out;
+
 	ret = __merge_refs(&prefs, 1);
 	if (ret)
 		goto out;
 
-	ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs);
+	ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
+				      &prefs, extent_item_pos);
 	if (ret)
 		goto out;
 
@@ -709,7 +892,33 @@ again:
 			BUG_ON(ret < 0);
 		}
 		if (ref->count && ref->parent) {
-			ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+			struct extent_inode_elem *eie = NULL;
+			if (extent_item_pos && !ref->inode_list) {
+				u32 bsz;
+				struct extent_buffer *eb;
+				bsz = btrfs_level_size(fs_info->extent_root,
+							info_level);
+				eb = read_tree_block(fs_info->extent_root,
+							ref->parent, bsz, 0);
+				BUG_ON(!eb);
+				ret = find_extent_in_eb(eb, bytenr,
+							*extent_item_pos, &eie);
+				ref->inode_list = eie;
+				free_extent_buffer(eb);
+			}
+			ret = ulist_add_merge(refs, ref->parent,
+					      (unsigned long)ref->inode_list,
+					      (unsigned long *)&eie, GFP_NOFS);
+			if (!ret && extent_item_pos) {
+				/*
+				 * we've recorded that parent, so we must extend
+				 * its inode list here
+				 */
+				BUG_ON(!eie);
+				while (eie->next)
+					eie = eie->next;
+				eie->next = ref->inode_list;
+			}
 			BUG_ON(ret < 0);
 		}
 		kfree(ref);
@@ -734,6 +943,28 @@ out:
 	return ret;
 }
 
+static void free_leaf_list(struct ulist *blocks)
+{
+	struct ulist_node *node = NULL;
+	struct extent_inode_elem *eie;
+	struct extent_inode_elem *eie_next;
+	struct ulist_iterator uiter;
+
+	ULIST_ITER_INIT(&uiter);
+	while ((node = ulist_next(blocks, &uiter))) {
+		if (!node->aux)
+			continue;
+		eie = (struct extent_inode_elem *)node->aux;
+		for (; eie; eie = eie_next) {
+			eie_next = eie->next;
+			kfree(eie);
+		}
+		node->aux = 0;
+	}
+
+	ulist_free(blocks);
+}
+
 /*
 * Finds all leafs with a reference to the specified combination of bytenr and
 * offset. key_list_head will point to a list of corresponding keys (caller must
@@ -744,7 +975,9 @@ out:
 */
 static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
				struct btrfs_fs_info *fs_info, u64 bytenr,
-				u64 num_bytes, u64 seq, struct ulist **leafs)
+				u64 delayed_ref_seq, u64 time_seq,
+				struct ulist **leafs,
+				const u64 *extent_item_pos)
 {
 	struct ulist *tmp;
 	int ret;
@@ -758,11 +991,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	}
 
-	ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+	ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+				time_seq, *leafs, tmp, extent_item_pos);
 	ulist_free(tmp);
 
 	if (ret < 0 && ret != -ENOENT) {
-		ulist_free(*leafs);
+		free_leaf_list(*leafs);
 		return ret;
 	}
 
@@ -784,10 +1018,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 */
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
			 struct btrfs_fs_info *fs_info, u64 bytenr,
-			 u64 num_bytes, u64 seq, struct ulist **roots)
+			 u64 delayed_ref_seq, u64 time_seq,
+			 struct ulist **roots)
 {
 	struct ulist *tmp;
 	struct ulist_node *node = NULL;
+	struct ulist_iterator uiter;
 	int ret;
 
 	tmp = ulist_alloc(GFP_NOFS);
@@ -799,15 +1035,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	}
 
+	ULIST_ITER_INIT(&uiter);
 	while (1) {
-		ret = find_parent_nodes(trans, fs_info, bytenr, seq,
-					tmp, *roots);
+		ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+					time_seq, tmp, *roots, NULL);
 		if (ret < 0 && ret != -ENOENT) {
 			ulist_free(tmp);
 			ulist_free(*roots);
 			return ret;
 		}
-		node = ulist_next(tmp, node);
+		node = ulist_next(tmp, &uiter);
 		if (!node)
 			break;
 		bytenr = node->val;
@@ -1093,67 +1330,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 	return 0;
 }
 
-static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical,
-				u64 orig_extent_item_objectid,
-				u64 extent_item_pos, u64 root,
+static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
+				u64 root, u64 extent_item_objectid,
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
-	u64 disk_byte;
-	struct btrfs_key key;
-	struct btrfs_file_extent_item *fi;
-	struct extent_buffer *eb;
-	int slot;
-	int nritems;
+	struct extent_inode_elem *eie;
 	int ret = 0;
-	int extent_type;
-	u64 data_offset;
-	u64 data_len;
-
-	eb = read_tree_block(fs_info->tree_root, logical,
-				fs_info->tree_root->leafsize, 0);
-	if (!eb)
-		return -EIO;
-
-	/*
-	 * from the shared data ref, we only have the leaf but we need
-	 * the key. thus, we must look into all items and see that we
-	 * find one (some) with a reference to our extent item.
-	 */
-	nritems = btrfs_header_nritems(eb);
-	for (slot = 0; slot < nritems; ++slot) {
-		btrfs_item_key_to_cpu(eb, &key, slot);
-		if (key.type != BTRFS_EXTENT_DATA_KEY)
-			continue;
-		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
-		extent_type = btrfs_file_extent_type(eb, fi);
-		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
-			continue;
-		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
-		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
-		if (disk_byte != orig_extent_item_objectid)
-			continue;
-
-		data_offset = btrfs_file_extent_offset(eb, fi);
-		data_len = btrfs_file_extent_num_bytes(eb, fi);
-
-		if (extent_item_pos < data_offset ||
-		    extent_item_pos >= data_offset + data_len)
-			continue;
 
+	for (eie = inode_list; eie; eie = eie->next) {
 		pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
-			 "root %llu\n", orig_extent_item_objectid,
-			 key.objectid, key.offset, root);
-		ret = iterate(key.objectid,
-			      key.offset + (extent_item_pos - data_offset),
-			      root, ctx);
+			 "root %llu\n", extent_item_objectid,
+			 eie->inum, eie->offset, root);
+		ret = iterate(eie->inum, eie->offset, root, ctx);
 		if (ret) {
-			pr_debug("stopping iteration because ret=%d\n", ret);
+			pr_debug("stopping iteration for %llu due to ret=%d\n",
+				 extent_item_objectid, ret);
 			break;
 		}
 	}
 
-	free_extent_buffer(eb);
-
 	return ret;
 }
 
@@ -1175,7 +1370,10 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	struct ulist *roots = NULL;
 	struct ulist_node *ref_node = NULL;
 	struct ulist_node *root_node = NULL;
-	struct seq_list seq_elem;
+	struct seq_list seq_elem = {};
+	struct seq_list tree_mod_seq_elem = {};
+	struct ulist_iterator ref_uiter;
+	struct ulist_iterator root_uiter;
 	struct btrfs_delayed_ref_root *delayed_refs = NULL;
 
 	pr_debug("resolving all inodes for extent %llu\n",
@@ -1192,34 +1390,41 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 		spin_lock(&delayed_refs->lock);
 		btrfs_get_delayed_seq(delayed_refs, &seq_elem);
 		spin_unlock(&delayed_refs->lock);
+		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 	}
 
 	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
-				   extent_item_pos, seq_elem.seq,
-				   &refs);
-
+				   seq_elem.seq, tree_mod_seq_elem.seq, &refs,
+				   &extent_item_pos);
 	if (ret)
 		goto out;
 
-	while (!ret && (ref_node = ulist_next(refs, ref_node))) {
-		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
-					   seq_elem.seq, &roots);
+	ULIST_ITER_INIT(&ref_uiter);
+	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
+		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
+					   seq_elem.seq,
+					   tree_mod_seq_elem.seq, &roots);
 		if (ret)
 			break;
-		while (!ret && (root_node = ulist_next(roots, root_node))) {
-			pr_debug("root %llu references leaf %llu\n",
-				 root_node->val, ref_node->val);
-			ret = iterate_leaf_refs(fs_info, ref_node->val,
-						extent_item_objectid,
-						extent_item_pos, root_node->val,
-						iterate, ctx);
+		ULIST_ITER_INIT(&root_uiter);
+		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
+			pr_debug("root %llu references leaf %llu, data list "
+				 "%#lx\n", root_node->val, ref_node->val,
+				 ref_node->aux);
+			ret = iterate_leaf_refs(
+				(struct extent_inode_elem *)ref_node->aux,
+				root_node->val, extent_item_objectid,
+				iterate, ctx);
 		}
+		ulist_free(roots);
+		roots = NULL;
 	}
 
-	ulist_free(refs);
+	free_leaf_list(refs);
 	ulist_free(roots);
 out:
 	if (!search_commit_root) {
+		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 		btrfs_put_delayed_seq(delayed_refs, &seq_elem);
 		btrfs_end_transaction(trans, fs_info->extent_root);
 	}
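
With this rework, find_parent_nodes() stores each leaf's (inode, offset) hits as a struct extent_inode_elem chain in the ulist node's aux field, so iterate_leaf_refs() walks a prebuilt list instead of re-reading the leaf from disk. A hedged usage sketch of the public entry point; the callback body is invented for illustration, and the parameter order assumes the iterate_extent_inodes() signature visible in this diff:

	/* invoked once per (inode, offset, root) that references the extent */
	static int show_extent_ref(u64 inum, u64 offset, u64 root, void *ctx)
	{
		pr_info("extent referenced: ino %llu off %llu root %llu\n",
			inum, offset, root);
		return 0;	/* returning non-zero stops the iteration */
	}

	/* resolve every inode that references the extent item at 'bytenr' */
	ret = iterate_extent_inodes(fs_info, bytenr, extent_item_pos,
				    search_commit_root, show_extent_ref, NULL);
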
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 57ea2e959e4d..c18d8ac7b795 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
			 struct btrfs_fs_info *fs_info, u64 bytenr,
-			 u64 num_bytes, u64 seq, struct ulist **roots);
+			 u64 delayed_ref_seq, u64 time_seq,
+			 struct ulist **roots);
 
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9b9b15fd5204..e616f8872e69 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -24,6 +24,20 @@
 #include "ordered-data.h"
 #include "delayed-inode.h"
 
+/*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
+#define BTRFS_INODE_ORDERED_DATA_CLOSE		0
+#define BTRFS_INODE_ORPHAN_META_RESERVED	1
+#define BTRFS_INODE_DUMMY			2
+#define BTRFS_INODE_IN_DEFRAG			3
+#define BTRFS_INODE_DELALLOC_META_RESERVED	4
+#define BTRFS_INODE_HAS_ORPHAN_ITEM		5
+
 /* in memory btrfs inode */
 struct btrfs_inode {
 	/* which subvolume this inode belongs to */
@@ -57,9 +71,6 @@ struct btrfs_inode {
 	/* used to order data wrt metadata */
 	struct btrfs_ordered_inode_tree ordered_tree;
 
-	/* for keeping track of orphaned inodes */
-	struct list_head i_orphan;
-
 	/* list of all the delalloc inodes in the FS. There are times we need
 	 * to write all the delalloc pages to disk, and this list is used
 	 * to walk them all.
@@ -78,14 +89,13 @@ struct btrfs_inode {
 	/* the space_info for where this inode's data allocations are done */
 	struct btrfs_space_info *space_info;
 
+	unsigned long runtime_flags;
+
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
 	 * enough field for this.
 	 */
 	u64 generation;
 
-	/* sequence number for NFS changes */
-	u64 sequence;
-
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
@@ -145,22 +155,9 @@ struct btrfs_inode {
 	unsigned reserved_extents;
 
 	/*
-	 * ordered_data_close is set by truncate when a file that used
-	 * to have good data has been truncated to zero. When it is set
-	 * the btrfs file release call will add this inode to the
-	 * ordered operations list so that we make sure to flush out any
-	 * new data the application may have written before commit.
-	 */
-	unsigned ordered_data_close:1;
-	unsigned orphan_meta_reserved:1;
-	unsigned dummy_inode:1;
-	unsigned in_defrag:1;
-	unsigned delalloc_meta_reserved:1;
-
-	/*
 	 * always compress this one file
 	 */
-	unsigned force_compress:4;
+	unsigned force_compress;
 
 	struct btrfs_delayed_node *delayed_node;
 
@@ -202,4 +199,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
 	return false;
 }
 
+static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = 0;
+
+	mutex_lock(&root->log_mutex);
+	if (BTRFS_I(inode)->logged_trans == generation &&
+	    BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+		ret = 1;
+	mutex_unlock(&root->log_mutex);
+	return ret;
+}
+
 #endif
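
Folding the five single-bit bitfields into one unsigned long runtime_flags lets callers update inode state with the kernel's atomic bit operations instead of serializing writes to adjacent bitfields under a lock. A hedged sketch of the caller-side pattern this series switches to (the surrounding code is illustrative; set_bit() and test_and_clear_bit() are the stock kernel helpers):

	struct btrfs_inode *bi = BTRFS_I(inode);

	/* set: atomic read-modify-write, no inode lock required */
	set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &bi->runtime_flags);

	/* consume: test and clear in a single atomic step */
	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
			       &bi->runtime_flags)) {
		/* queue the flush work here, cf. the file.c changes in this series */
	}
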
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index c053e90f2006..9cebb1fd6a3c 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -103,8 +103,6 @@
103#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 103#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
104#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, 104#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
105 * excluding " [...]" */ 105 * excluding " [...]" */
106#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
107
108#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) 106#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
109 107
110/* 108/*
@@ -210,8 +208,9 @@ struct btrfsic_block_data_ctx {
210 u64 dev_bytenr; /* physical bytenr on device */ 208 u64 dev_bytenr; /* physical bytenr on device */
211 u32 len; 209 u32 len;
212 struct btrfsic_dev_state *dev; 210 struct btrfsic_dev_state *dev;
213 char *data; 211 char **datav;
214 struct buffer_head *bh; /* do not use if set to NULL */ 212 struct page **pagev;
213 void *mem_to_free;
215}; 214};
216 215
217/* This structure is used to implement recursion without occupying 216/* This structure is used to implement recursion without occupying
@@ -243,6 +242,8 @@ struct btrfsic_state {
243 struct btrfs_root *root; 242 struct btrfs_root *root;
244 u64 max_superblock_generation; 243 u64 max_superblock_generation;
245 struct btrfsic_block *latest_superblock; 244 struct btrfsic_block *latest_superblock;
245 u32 metablock_size;
246 u32 datablock_size;
246}; 247};
247 248
248static void btrfsic_block_init(struct btrfsic_block *b); 249static void btrfsic_block_init(struct btrfsic_block *b);
@@ -290,8 +291,10 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
290static int btrfsic_process_metablock(struct btrfsic_state *state, 291static int btrfsic_process_metablock(struct btrfsic_state *state,
291 struct btrfsic_block *block, 292 struct btrfsic_block *block,
292 struct btrfsic_block_data_ctx *block_ctx, 293 struct btrfsic_block_data_ctx *block_ctx,
293 struct btrfs_header *hdr,
294 int limit_nesting, int force_iodone_flag); 294 int limit_nesting, int force_iodone_flag);
295static void btrfsic_read_from_block_data(
296 struct btrfsic_block_data_ctx *block_ctx,
297 void *dst, u32 offset, size_t len);
295static int btrfsic_create_link_to_next_block( 298static int btrfsic_create_link_to_next_block(
296 struct btrfsic_state *state, 299 struct btrfsic_state *state,
297 struct btrfsic_block *block, 300 struct btrfsic_block *block,
@@ -318,12 +321,13 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
318static int btrfsic_read_block(struct btrfsic_state *state, 321static int btrfsic_read_block(struct btrfsic_state *state,
319 struct btrfsic_block_data_ctx *block_ctx); 322 struct btrfsic_block_data_ctx *block_ctx);
320static void btrfsic_dump_database(struct btrfsic_state *state); 323static void btrfsic_dump_database(struct btrfsic_state *state);
324static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
321static int btrfsic_test_for_metadata(struct btrfsic_state *state, 325static int btrfsic_test_for_metadata(struct btrfsic_state *state,
322 const u8 *data, unsigned int size); 326 char **datav, unsigned int num_pages);
323static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, 327static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
324 u64 dev_bytenr, u8 *mapped_data, 328 u64 dev_bytenr, char **mapped_datav,
325 unsigned int len, struct bio *bio, 329 unsigned int num_pages,
326 int *bio_is_patched, 330 struct bio *bio, int *bio_is_patched,
327 struct buffer_head *bh, 331 struct buffer_head *bh,
328 int submit_bio_bh_rw); 332 int submit_bio_bh_rw);
329static int btrfsic_process_written_superblock( 333static int btrfsic_process_written_superblock(
@@ -375,7 +379,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
375static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, 379static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
376 u64 bytenr, 380 u64 bytenr,
377 struct btrfsic_dev_state *dev_state, 381 struct btrfsic_dev_state *dev_state,
378 u64 dev_bytenr, char *data); 382 u64 dev_bytenr);
379 383
380static struct mutex btrfsic_mutex; 384static struct mutex btrfsic_mutex;
381static int btrfsic_is_initialized; 385static int btrfsic_is_initialized;
@@ -651,7 +655,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
651 int pass; 655 int pass;
652 656
653 BUG_ON(NULL == state); 657 BUG_ON(NULL == state);
654 selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); 658 selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
655 if (NULL == selected_super) { 659 if (NULL == selected_super) {
656 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); 660 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
657 return -1; 661 return -1;
@@ -718,7 +722,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
718 722
719 num_copies = 723 num_copies =
720 btrfs_num_copies(&state->root->fs_info->mapping_tree, 724 btrfs_num_copies(&state->root->fs_info->mapping_tree,
721 next_bytenr, PAGE_SIZE); 725 next_bytenr, state->metablock_size);
722 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 726 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
723 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 727 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
724 (unsigned long long)next_bytenr, num_copies); 728 (unsigned long long)next_bytenr, num_copies);
@@ -727,9 +731,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
727 struct btrfsic_block *next_block; 731 struct btrfsic_block *next_block;
728 struct btrfsic_block_data_ctx tmp_next_block_ctx; 732 struct btrfsic_block_data_ctx tmp_next_block_ctx;
729 struct btrfsic_block_link *l; 733 struct btrfsic_block_link *l;
730 struct btrfs_header *hdr;
731 734
732 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, 735 ret = btrfsic_map_block(state, next_bytenr,
736 state->metablock_size,
733 &tmp_next_block_ctx, 737 &tmp_next_block_ctx,
734 mirror_num); 738 mirror_num);
735 if (ret) { 739 if (ret) {
@@ -758,7 +762,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
758 BUG_ON(NULL == l); 762 BUG_ON(NULL == l);
759 763
760 ret = btrfsic_read_block(state, &tmp_next_block_ctx); 764 ret = btrfsic_read_block(state, &tmp_next_block_ctx);
761 if (ret < (int)BTRFSIC_BLOCK_SIZE) { 765 if (ret < (int)PAGE_CACHE_SIZE) {
762 printk(KERN_INFO 766 printk(KERN_INFO
763 "btrfsic: read @logical %llu failed!\n", 767 "btrfsic: read @logical %llu failed!\n",
764 (unsigned long long) 768 (unsigned long long)
@@ -768,11 +772,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
768 return -1; 772 return -1;
769 } 773 }
770 774
771 hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
772 ret = btrfsic_process_metablock(state, 775 ret = btrfsic_process_metablock(state,
773 next_block, 776 next_block,
774 &tmp_next_block_ctx, 777 &tmp_next_block_ctx,
775 hdr,
776 BTRFS_MAX_LEVEL + 3, 1); 778 BTRFS_MAX_LEVEL + 3, 1);
777 btrfsic_release_block_ctx(&tmp_next_block_ctx); 779 btrfsic_release_block_ctx(&tmp_next_block_ctx);
778 } 780 }
@@ -799,7 +801,10 @@ static int btrfsic_process_superblock_dev_mirror(
799 801
800 /* super block bytenr is always the unmapped device bytenr */ 802 /* super block bytenr is always the unmapped device bytenr */
801 dev_bytenr = btrfs_sb_offset(superblock_mirror_num); 803 dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
802 bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); 804 if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
805 return -1;
806 bh = __bread(superblock_bdev, dev_bytenr / 4096,
807 BTRFS_SUPER_INFO_SIZE);
803 if (NULL == bh) 808 if (NULL == bh)
804 return -1; 809 return -1;
805 super_tmp = (struct btrfs_super_block *) 810 super_tmp = (struct btrfs_super_block *)
@@ -808,7 +813,10 @@ static int btrfsic_process_superblock_dev_mirror(
808 if (btrfs_super_bytenr(super_tmp) != dev_bytenr || 813 if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
809 strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, 814 strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
810 sizeof(super_tmp->magic)) || 815 sizeof(super_tmp->magic)) ||
811 memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { 816 memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
817 btrfs_super_nodesize(super_tmp) != state->metablock_size ||
818 btrfs_super_leafsize(super_tmp) != state->metablock_size ||
819 btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
812 brelse(bh); 820 brelse(bh);
813 return 0; 821 return 0;
814 } 822 }
@@ -893,7 +901,7 @@ static int btrfsic_process_superblock_dev_mirror(
893 901
894 num_copies = 902 num_copies =
895 btrfs_num_copies(&state->root->fs_info->mapping_tree, 903 btrfs_num_copies(&state->root->fs_info->mapping_tree,
896 next_bytenr, PAGE_SIZE); 904 next_bytenr, state->metablock_size);
897 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 905 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
898 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 906 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
899 (unsigned long long)next_bytenr, num_copies); 907 (unsigned long long)next_bytenr, num_copies);
@@ -902,7 +910,8 @@ static int btrfsic_process_superblock_dev_mirror(
902 struct btrfsic_block_data_ctx tmp_next_block_ctx; 910 struct btrfsic_block_data_ctx tmp_next_block_ctx;
903 struct btrfsic_block_link *l; 911 struct btrfsic_block_link *l;
904 912
905 if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, 913 if (btrfsic_map_block(state, next_bytenr,
914 state->metablock_size,
906 &tmp_next_block_ctx, 915 &tmp_next_block_ctx,
907 mirror_num)) { 916 mirror_num)) {
908 printk(KERN_INFO "btrfsic: btrfsic_map_block(" 917 printk(KERN_INFO "btrfsic: btrfsic_map_block("
@@ -966,13 +975,15 @@ static int btrfsic_process_metablock(
966 struct btrfsic_state *state, 975 struct btrfsic_state *state,
967 struct btrfsic_block *const first_block, 976 struct btrfsic_block *const first_block,
968 struct btrfsic_block_data_ctx *const first_block_ctx, 977 struct btrfsic_block_data_ctx *const first_block_ctx,
969 struct btrfs_header *const first_hdr,
970 int first_limit_nesting, int force_iodone_flag) 978 int first_limit_nesting, int force_iodone_flag)
971{ 979{
972 struct btrfsic_stack_frame initial_stack_frame = { 0 }; 980 struct btrfsic_stack_frame initial_stack_frame = { 0 };
973 struct btrfsic_stack_frame *sf; 981 struct btrfsic_stack_frame *sf;
974 struct btrfsic_stack_frame *next_stack; 982 struct btrfsic_stack_frame *next_stack;
983 struct btrfs_header *const first_hdr =
984 (struct btrfs_header *)first_block_ctx->datav[0];
975 985
986 BUG_ON(!first_hdr);
976 sf = &initial_stack_frame; 987 sf = &initial_stack_frame;
977 sf->error = 0; 988 sf->error = 0;
978 sf->i = -1; 989 sf->i = -1;
@@ -1012,21 +1023,47 @@ continue_with_current_leaf_stack_frame:
1012 } 1023 }
1013 1024
1014 if (sf->i < sf->nr) { 1025 if (sf->i < sf->nr) {
1015 struct btrfs_item *disk_item = leafhdr->items + sf->i; 1026 struct btrfs_item disk_item;
1016 struct btrfs_disk_key *disk_key = &disk_item->key; 1027 u32 disk_item_offset =
1028 (uintptr_t)(leafhdr->items + sf->i) -
1029 (uintptr_t)leafhdr;
1030 struct btrfs_disk_key *disk_key;
1017 u8 type; 1031 u8 type;
1018 const u32 item_offset = le32_to_cpu(disk_item->offset); 1032 u32 item_offset;
1019 1033
1034 if (disk_item_offset + sizeof(struct btrfs_item) >
1035 sf->block_ctx->len) {
1036leaf_item_out_of_bounce_error:
1037 printk(KERN_INFO
1038 "btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
1039 sf->block_ctx->start,
1040 sf->block_ctx->dev->name);
1041 goto one_stack_frame_backwards;
1042 }
1043 btrfsic_read_from_block_data(sf->block_ctx,
1044 &disk_item,
1045 disk_item_offset,
1046 sizeof(struct btrfs_item));
1047 item_offset = le32_to_cpu(disk_item.offset);
1048 disk_key = &disk_item.key;
1020 type = disk_key->type; 1049 type = disk_key->type;
1021 1050
1022 if (BTRFS_ROOT_ITEM_KEY == type) { 1051 if (BTRFS_ROOT_ITEM_KEY == type) {
1023 const struct btrfs_root_item *const root_item = 1052 struct btrfs_root_item root_item;
1024 (struct btrfs_root_item *) 1053 u32 root_item_offset;
1025 (sf->block_ctx->data + 1054 u64 next_bytenr;
1026 offsetof(struct btrfs_leaf, items) + 1055
1027 item_offset); 1056 root_item_offset = item_offset +
1028 const u64 next_bytenr = 1057 offsetof(struct btrfs_leaf, items);
1029 le64_to_cpu(root_item->bytenr); 1058 if (root_item_offset +
1059 sizeof(struct btrfs_root_item) >
1060 sf->block_ctx->len)
1061 goto leaf_item_out_of_bounds_error;
1062 btrfsic_read_from_block_data(
1063 sf->block_ctx, &root_item,
1064 root_item_offset,
1065 sizeof(struct btrfs_root_item));
1066 next_bytenr = le64_to_cpu(root_item.bytenr);
1030 1067
1031 sf->error = 1068 sf->error =
1032 btrfsic_create_link_to_next_block( 1069 btrfsic_create_link_to_next_block(
@@ -1041,7 +1078,7 @@ continue_with_current_leaf_stack_frame:
1041 &sf->num_copies, 1078 &sf->num_copies,
1042 &sf->mirror_num, 1079 &sf->mirror_num,
1043 disk_key, 1080 disk_key,
1044 le64_to_cpu(root_item-> 1081 le64_to_cpu(root_item.
1045 generation)); 1082 generation));
1046 if (sf->error) 1083 if (sf->error)
1047 goto one_stack_frame_backwards; 1084 goto one_stack_frame_backwards;
@@ -1049,7 +1086,7 @@ continue_with_current_leaf_stack_frame:
1049 if (NULL != sf->next_block) { 1086 if (NULL != sf->next_block) {
1050 struct btrfs_header *const next_hdr = 1087 struct btrfs_header *const next_hdr =
1051 (struct btrfs_header *) 1088 (struct btrfs_header *)
1052 sf->next_block_ctx.data; 1089 sf->next_block_ctx.datav[0];
1053 1090
1054 next_stack = 1091 next_stack =
1055 btrfsic_stack_frame_alloc(); 1092 btrfsic_stack_frame_alloc();
@@ -1111,10 +1148,24 @@ continue_with_current_node_stack_frame:
1111 } 1148 }
1112 1149
1113 if (sf->i < sf->nr) { 1150 if (sf->i < sf->nr) {
1114 struct btrfs_key_ptr *disk_key_ptr = 1151 struct btrfs_key_ptr key_ptr;
1115 nodehdr->ptrs + sf->i; 1152 u32 key_ptr_offset;
1116 const u64 next_bytenr = 1153 u64 next_bytenr;
1117 le64_to_cpu(disk_key_ptr->blockptr); 1154
1155 key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
1156 (uintptr_t)nodehdr;
1157 if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
1158 sf->block_ctx->len) {
1159 printk(KERN_INFO
1160 "btrfsic: node item out of bounce at logical %llu, dev %s\n",
1161 sf->block_ctx->start,
1162 sf->block_ctx->dev->name);
1163 goto one_stack_frame_backwards;
1164 }
1165 btrfsic_read_from_block_data(
1166 sf->block_ctx, &key_ptr, key_ptr_offset,
1167 sizeof(struct btrfs_key_ptr));
1168 next_bytenr = le64_to_cpu(key_ptr.blockptr);
1118 1169
1119 sf->error = btrfsic_create_link_to_next_block( 1170 sf->error = btrfsic_create_link_to_next_block(
1120 state, 1171 state,
@@ -1127,15 +1178,15 @@ continue_with_current_node_stack_frame:
1127 force_iodone_flag, 1178 force_iodone_flag,
1128 &sf->num_copies, 1179 &sf->num_copies,
1129 &sf->mirror_num, 1180 &sf->mirror_num,
1130 &disk_key_ptr->key, 1181 &key_ptr.key,
1131 le64_to_cpu(disk_key_ptr->generation)); 1182 le64_to_cpu(key_ptr.generation));
1132 if (sf->error) 1183 if (sf->error)
1133 goto one_stack_frame_backwards; 1184 goto one_stack_frame_backwards;
1134 1185
1135 if (NULL != sf->next_block) { 1186 if (NULL != sf->next_block) {
1136 struct btrfs_header *const next_hdr = 1187 struct btrfs_header *const next_hdr =
1137 (struct btrfs_header *) 1188 (struct btrfs_header *)
1138 sf->next_block_ctx.data; 1189 sf->next_block_ctx.datav[0];
1139 1190
1140 next_stack = btrfsic_stack_frame_alloc(); 1191 next_stack = btrfsic_stack_frame_alloc();
1141 if (NULL == next_stack) 1192 if (NULL == next_stack)
@@ -1181,6 +1232,35 @@ one_stack_frame_backwards:
1181 return sf->error; 1232 return sf->error;
1182} 1233}
1183 1234
1235static void btrfsic_read_from_block_data(
1236 struct btrfsic_block_data_ctx *block_ctx,
1237 void *dstv, u32 offset, size_t len)
1238{
1239 size_t cur;
1240 size_t offset_in_page;
1241 char *kaddr;
1242 char *dst = (char *)dstv;
1243 size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
1244 unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
1245
1246 WARN_ON(offset + len > block_ctx->len);
1247 offset_in_page = (start_offset + offset) &
1248 ((unsigned long)PAGE_CACHE_SIZE - 1);
1249
1250 while (len > 0) {
1251 cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
1252 BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
1253 PAGE_CACHE_SHIFT);
1254 kaddr = block_ctx->datav[i];
1255 memcpy(dst, kaddr + offset_in_page, cur);
1256
1257 dst += cur;
1258 len -= cur;
1259 offset_in_page = 0;
1260 i++;
1261 }
1262}
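
The btrfsic_read_from_block_data() helper above replaces direct pointer access into one mapped buffer: an item that straddles a page boundary must now be copied piecewise out of the datav page vector. A minimal userspace sketch of the same paged-copy logic, assuming 4096-byte pages and a plain array of page buffers standing in for block_ctx->datav (the kernel version additionally folds block_ctx->start into the offset and bounds-checks against block_ctx->len):

#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096

/* copy len bytes starting at byte offset off out of a vector of
 * page-sized buffers, crossing page boundaries as needed */
static void read_from_pagev(char **pagev, void *dstv, unsigned int off,
			    size_t len)
{
	char *dst = dstv;
	size_t i = off / PAGE_SZ;
	size_t in_page = off % PAGE_SZ;

	while (len > 0) {
		size_t cur = PAGE_SZ - in_page;

		if (cur > len)
			cur = len;
		memcpy(dst, pagev[i] + in_page, cur);
		dst += cur;
		len -= cur;
		in_page = 0;	/* later pages are read from their start */
		i++;
	}
}

int main(void)
{
	static char p0[PAGE_SZ], p1[PAGE_SZ];
	char *pagev[2] = { p0, p1 };
	char out[8];

	memset(p0, 'A', PAGE_SZ);
	memset(p1, 'B', PAGE_SZ);
	/* a copy straddling the page boundary: 4 x 'A' then 4 x 'B' */
	read_from_pagev(pagev, out, PAGE_SZ - 4, sizeof(out));
	printf("%.8s\n", out);	/* prints AAAABBBB */
	return 0;
}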
1263
1184static int btrfsic_create_link_to_next_block( 1264static int btrfsic_create_link_to_next_block(
1185 struct btrfsic_state *state, 1265 struct btrfsic_state *state,
1186 struct btrfsic_block *block, 1266 struct btrfsic_block *block,
@@ -1204,7 +1284,7 @@ static int btrfsic_create_link_to_next_block(
1204 if (0 == *num_copiesp) { 1284 if (0 == *num_copiesp) {
1205 *num_copiesp = 1285 *num_copiesp =
1206 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1286 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1207 next_bytenr, PAGE_SIZE); 1287 next_bytenr, state->metablock_size);
1208 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1288 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1209 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 1289 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1210 (unsigned long long)next_bytenr, *num_copiesp); 1290 (unsigned long long)next_bytenr, *num_copiesp);
@@ -1219,7 +1299,7 @@ static int btrfsic_create_link_to_next_block(
1219 "btrfsic_create_link_to_next_block(mirror_num=%d)\n", 1299 "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
1220 *mirror_nump); 1300 *mirror_nump);
1221 ret = btrfsic_map_block(state, next_bytenr, 1301 ret = btrfsic_map_block(state, next_bytenr,
1222 BTRFSIC_BLOCK_SIZE, 1302 state->metablock_size,
1223 next_block_ctx, *mirror_nump); 1303 next_block_ctx, *mirror_nump);
1224 if (ret) { 1304 if (ret) {
1225 printk(KERN_INFO 1305 printk(KERN_INFO
@@ -1314,7 +1394,7 @@ static int btrfsic_create_link_to_next_block(
1314 1394
1315 if (limit_nesting > 0 && did_alloc_block_link) { 1395 if (limit_nesting > 0 && did_alloc_block_link) {
1316 ret = btrfsic_read_block(state, next_block_ctx); 1396 ret = btrfsic_read_block(state, next_block_ctx);
1317 if (ret < (int)BTRFSIC_BLOCK_SIZE) { 1397 if (ret < (int)next_block_ctx->len) {
1318 printk(KERN_INFO 1398 printk(KERN_INFO
1319 "btrfsic: read block @logical %llu failed!\n", 1399 "btrfsic: read block @logical %llu failed!\n",
1320 (unsigned long long)next_bytenr); 1400 (unsigned long long)next_bytenr);
@@ -1339,43 +1419,74 @@ static int btrfsic_handle_extent_data(
1339 u32 item_offset, int force_iodone_flag) 1419 u32 item_offset, int force_iodone_flag)
1340{ 1420{
1341 int ret; 1421 int ret;
1342 struct btrfs_file_extent_item *file_extent_item = 1422 struct btrfs_file_extent_item file_extent_item;
1343 (struct btrfs_file_extent_item *)(block_ctx->data + 1423 u64 file_extent_item_offset;
1344 offsetof(struct btrfs_leaf, 1424 u64 next_bytenr;
1345 items) + item_offset); 1425 u64 num_bytes;
1346 u64 next_bytenr = 1426 u64 generation;
1347 le64_to_cpu(file_extent_item->disk_bytenr) +
1348 le64_to_cpu(file_extent_item->offset);
1349 u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
1350 u64 generation = le64_to_cpu(file_extent_item->generation);
1351 struct btrfsic_block_link *l; 1427 struct btrfsic_block_link *l;
1352 1428
1429 file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
1430 item_offset;
1431 if (file_extent_item_offset +
1432 offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
1433 block_ctx->len) {
1434 printk(KERN_INFO
1435 "btrfsic: file item out of bounce at logical %llu, dev %s\n",
1436 block_ctx->start, block_ctx->dev->name);
1437 return -1;
1438 }
1439
1440 btrfsic_read_from_block_data(block_ctx, &file_extent_item,
1441 file_extent_item_offset,
1442 offsetof(struct btrfs_file_extent_item, disk_num_bytes));
1443 if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
1444 ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) {
1445 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1446 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n",
1447 file_extent_item.type,
1448 (unsigned long long)
1449 le64_to_cpu(file_extent_item.disk_bytenr));
1450 return 0;
1451 }
1452
1453 if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
1454 block_ctx->len) {
1455 printk(KERN_INFO
1456 "btrfsic: file item out of bounce at logical %llu, dev %s\n",
1457 block_ctx->start, block_ctx->dev->name);
1458 return -1;
1459 }
1460 btrfsic_read_from_block_data(block_ctx, &file_extent_item,
1461 file_extent_item_offset,
1462 sizeof(struct btrfs_file_extent_item));
1463 next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) +
1464 le64_to_cpu(file_extent_item.offset);
1465 generation = le64_to_cpu(file_extent_item.generation);
1466 num_bytes = le64_to_cpu(file_extent_item.num_bytes);
1468
1353 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) 1469 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1354 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," 1470 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
1355 " offset = %llu, num_bytes = %llu\n", 1471 " offset = %llu, num_bytes = %llu\n",
1356 file_extent_item->type, 1472 file_extent_item.type,
1357 (unsigned long long) 1473 (unsigned long long)
1358 le64_to_cpu(file_extent_item->disk_bytenr), 1474 le64_to_cpu(file_extent_item.disk_bytenr),
1359 (unsigned long long) 1475 (unsigned long long)le64_to_cpu(file_extent_item.offset),
1360 le64_to_cpu(file_extent_item->offset), 1476 (unsigned long long)num_bytes);
1361 (unsigned long long)
1362 le64_to_cpu(file_extent_item->num_bytes));
1363 if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
1364 ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
1365 return 0;
1366 while (num_bytes > 0) { 1477 while (num_bytes > 0) {
1367 u32 chunk_len; 1478 u32 chunk_len;
1368 int num_copies; 1479 int num_copies;
1369 int mirror_num; 1480 int mirror_num;
1370 1481
1371 if (num_bytes > BTRFSIC_BLOCK_SIZE) 1482 if (num_bytes > state->datablock_size)
1372 chunk_len = BTRFSIC_BLOCK_SIZE; 1483 chunk_len = state->datablock_size;
1373 else 1484 else
1374 chunk_len = num_bytes; 1485 chunk_len = num_bytes;
1375 1486
1376 num_copies = 1487 num_copies =
1377 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1488 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1378 next_bytenr, PAGE_SIZE); 1489 next_bytenr, state->datablock_size);
1379 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1490 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1380 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 1491 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1381 (unsigned long long)next_bytenr, num_copies); 1492 (unsigned long long)next_bytenr, num_copies);
@@ -1475,8 +1586,9 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1475 block_ctx_out->dev_bytenr = multi->stripes[0].physical; 1586 block_ctx_out->dev_bytenr = multi->stripes[0].physical;
1476 block_ctx_out->start = bytenr; 1587 block_ctx_out->start = bytenr;
1477 block_ctx_out->len = len; 1588 block_ctx_out->len = len;
1478 block_ctx_out->data = NULL; 1589 block_ctx_out->datav = NULL;
1479 block_ctx_out->bh = NULL; 1590 block_ctx_out->pagev = NULL;
1591 block_ctx_out->mem_to_free = NULL;
1480 1592
1481 if (0 == ret) 1593 if (0 == ret)
1482 kfree(multi); 1594 kfree(multi);
@@ -1496,8 +1608,9 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1496 block_ctx_out->dev_bytenr = bytenr; 1608 block_ctx_out->dev_bytenr = bytenr;
1497 block_ctx_out->start = bytenr; 1609 block_ctx_out->start = bytenr;
1498 block_ctx_out->len = len; 1610 block_ctx_out->len = len;
1499 block_ctx_out->data = NULL; 1611 block_ctx_out->datav = NULL;
1500 block_ctx_out->bh = NULL; 1612 block_ctx_out->pagev = NULL;
1613 block_ctx_out->mem_to_free = NULL;
1501 if (NULL != block_ctx_out->dev) { 1614 if (NULL != block_ctx_out->dev) {
1502 return 0; 1615 return 0;
1503 } else { 1616 } else {
@@ -1508,38 +1621,127 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1508 1621
1509static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) 1622static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
1510{ 1623{
1511 if (NULL != block_ctx->bh) { 1624 if (block_ctx->mem_to_free) {
1512 brelse(block_ctx->bh); 1625 unsigned int num_pages;
1513 block_ctx->bh = NULL; 1626
1627 BUG_ON(!block_ctx->datav);
1628 BUG_ON(!block_ctx->pagev);
1629 num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
1630 PAGE_CACHE_SHIFT;
1631 while (num_pages > 0) {
1632 num_pages--;
1633 if (block_ctx->datav[num_pages]) {
1634 kunmap(block_ctx->pagev[num_pages]);
1635 block_ctx->datav[num_pages] = NULL;
1636 }
1637 if (block_ctx->pagev[num_pages]) {
1638 __free_page(block_ctx->pagev[num_pages]);
1639 block_ctx->pagev[num_pages] = NULL;
1640 }
1641 }
1642
1643 kfree(block_ctx->mem_to_free);
1644 block_ctx->mem_to_free = NULL;
1645 block_ctx->pagev = NULL;
1646 block_ctx->datav = NULL;
1514 } 1647 }
1515} 1648}
1516 1649
1517static int btrfsic_read_block(struct btrfsic_state *state, 1650static int btrfsic_read_block(struct btrfsic_state *state,
1518 struct btrfsic_block_data_ctx *block_ctx) 1651 struct btrfsic_block_data_ctx *block_ctx)
1519{ 1652{
1520 block_ctx->bh = NULL; 1653 unsigned int num_pages;
1521 if (block_ctx->dev_bytenr & 4095) { 1654 unsigned int i;
1655 u64 dev_bytenr;
1656 int ret;
1657
1658 BUG_ON(block_ctx->datav);
1659 BUG_ON(block_ctx->pagev);
1660 BUG_ON(block_ctx->mem_to_free);
1661 if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
1522 printk(KERN_INFO 1662 printk(KERN_INFO
1523 "btrfsic: read_block() with unaligned bytenr %llu\n", 1663 "btrfsic: read_block() with unaligned bytenr %llu\n",
1524 (unsigned long long)block_ctx->dev_bytenr); 1664 (unsigned long long)block_ctx->dev_bytenr);
1525 return -1; 1665 return -1;
1526 } 1666 }
1527 if (block_ctx->len > 4096) { 1667
1528 printk(KERN_INFO 1668 num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
1529 "btrfsic: read_block() with too huge size %d\n", 1669 PAGE_CACHE_SHIFT;
1530 block_ctx->len); 1670 block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
1671 sizeof(*block_ctx->pagev)) *
1672 num_pages, GFP_NOFS);
1673 if (!block_ctx->mem_to_free)
1531 return -1; 1674 return -1;
1675 block_ctx->datav = block_ctx->mem_to_free;
1676 block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
1677 for (i = 0; i < num_pages; i++) {
1678 block_ctx->pagev[i] = alloc_page(GFP_NOFS);
1679 if (!block_ctx->pagev[i])
1680 return -1;
1532 } 1681 }
1533 1682
1534 block_ctx->bh = __bread(block_ctx->dev->bdev, 1683 dev_bytenr = block_ctx->dev_bytenr;
1535 block_ctx->dev_bytenr >> 12, 4096); 1684 for (i = 0; i < num_pages;) {
1536 if (NULL == block_ctx->bh) 1685 struct bio *bio;
1537 return -1; 1686 unsigned int j;
1538 block_ctx->data = block_ctx->bh->b_data; 1687 DECLARE_COMPLETION_ONSTACK(complete);
1688
1689 bio = bio_alloc(GFP_NOFS, num_pages - i);
1690 if (!bio) {
1691 printk(KERN_INFO
1692 "btrfsic: bio_alloc() for %u pages failed!\n",
1693 num_pages - i);
1694 return -1;
1695 }
1696 bio->bi_bdev = block_ctx->dev->bdev;
1697 bio->bi_sector = dev_bytenr >> 9;
1698 bio->bi_end_io = btrfsic_complete_bio_end_io;
1699 bio->bi_private = &complete;
1700
1701 for (j = i; j < num_pages; j++) {
1702 ret = bio_add_page(bio, block_ctx->pagev[j],
1703 PAGE_CACHE_SIZE, 0);
1704 if (PAGE_CACHE_SIZE != ret)
1705 break;
1706 }
1707 if (j == i) {
1708 printk(KERN_INFO
1709 "btrfsic: error, failed to add a single page!\n");
1710 return -1;
1711 }
1712 submit_bio(READ, bio);
1713
1714 /* this will also unplug the queue */
1715 wait_for_completion(&complete);
1716
1717 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1718 printk(KERN_INFO
1719 "btrfsic: read error at logical %llu dev %s!\n",
1720 block_ctx->start, block_ctx->dev->name);
1721 bio_put(bio);
1722 return -1;
1723 }
1724 bio_put(bio);
1725 dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
1726 i = j;
1727 }
1728 for (i = 0; i < num_pages; i++) {
1729 block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
1730 if (!block_ctx->datav[i]) {
1731 printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n",
1732 block_ctx->dev->name);
1733 return -1;
1734 }
1735 }
1539 1736
1540 return block_ctx->len; 1737 return block_ctx->len;
1541} 1738}
1542 1739
1740static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
1741{
1742 complete((struct completion *)bio->bi_private);
1743}
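
btrfsic_read_block() now reads a whole block through one or more bios: each pass adds as many pages as bio_add_page() accepts, submits synchronously via the completion helper above, then resumes at the page where the bio filled up. A userspace analogue of that chunked-read loop, with pread() standing in for the bio machinery and an arbitrary MAX_CHUNK standing in for the per-bio page limit (names and values are illustrative, not from the patch):

#include <unistd.h>
#include <fcntl.h>

#define PAGE_SZ   4096
#define MAX_CHUNK 4	/* stand-in for how many pages one bio accepts */

/* read num_pages page-sized buffers starting at dev_bytenr, in chunks,
 * advancing the device offset by however much each chunk covered */
static int read_block_pages(int fd, off_t dev_bytenr, char **pagev,
			    unsigned int num_pages)
{
	unsigned int i = 0;

	while (i < num_pages) {
		unsigned int j, chunk = num_pages - i;

		if (chunk > MAX_CHUNK)
			chunk = MAX_CHUNK;
		for (j = 0; j < chunk; j++) {
			ssize_t ret = pread(fd, pagev[i + j], PAGE_SZ,
					    dev_bytenr + (off_t)j * PAGE_SZ);

			if (ret != PAGE_SZ)
				return -1;	/* read error or short read */
		}
		dev_bytenr += (off_t)chunk * PAGE_SZ;
		i += chunk;
	}
	return 0;
}

int main(void)
{
	static char p[8][PAGE_SZ];
	char *pagev[8];
	int fd, i, ret;

	for (i = 0; i < 8; i++)
		pagev[i] = p[i];
	fd = open("/dev/zero", O_RDONLY);
	if (fd < 0)
		return 1;
	ret = read_block_pages(fd, 0, pagev, 8);
	close(fd);
	return ret ? 1 : 0;
}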
1744
1543static void btrfsic_dump_database(struct btrfsic_state *state) 1745static void btrfsic_dump_database(struct btrfsic_state *state)
1544{ 1746{
1545 struct list_head *elem_all; 1747 struct list_head *elem_all;
@@ -1617,32 +1819,39 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
1617 * (note that this test fails for the super block) 1819 * (note that this test fails for the super block)
1618 */ 1820 */
1619static int btrfsic_test_for_metadata(struct btrfsic_state *state, 1821static int btrfsic_test_for_metadata(struct btrfsic_state *state,
1620 const u8 *data, unsigned int size) 1822 char **datav, unsigned int num_pages)
1621{ 1823{
1622 struct btrfs_header *h; 1824 struct btrfs_header *h;
1623 u8 csum[BTRFS_CSUM_SIZE]; 1825 u8 csum[BTRFS_CSUM_SIZE];
1624 u32 crc = ~(u32)0; 1826 u32 crc = ~(u32)0;
1625 int fail = 0; 1827 unsigned int i;
1626 int crc_fail = 0;
1627 1828
1628 h = (struct btrfs_header *)data; 1829 if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
1830 return 1; /* not metadata */
1831 num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
1832 h = (struct btrfs_header *)datav[0];
1629 1833
1630 if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) 1834 if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
1631 fail++; 1835 return 1;
1836
1837 for (i = 0; i < num_pages; i++) {
1838 u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
1839 size_t sublen = i ? PAGE_CACHE_SIZE :
1840 (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
1632 1841
1633 crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); 1842 crc = crc32c(crc, data, sublen);
1843 }
1634 btrfs_csum_final(crc, csum); 1844 btrfs_csum_final(crc, csum);
1635 if (memcmp(csum, h->csum, state->csum_size)) 1845 if (memcmp(csum, h->csum, state->csum_size))
1636 crc_fail++; 1846 return 1;
1637 1847
1638 return fail || crc_fail; 1848 return 0; /* is metadata */
1639} 1849}
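
The rewritten btrfsic_test_for_metadata() checksums the block page by page, skipping the BTRFS_CSUM_SIZE bytes at the front of the first page where the on-disk checksum itself is stored. A self-contained sketch of that loop with a bitwise crc32c (Castagnoli polynomial; the ~0 seed and final inversion mirror the kernel's crc32c()/btrfs_csum_final() convention):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ       4096
#define BTRFS_CSUM_SZ 32	/* size of the csum field in the header */

/* running crc32c update, no pre/post inversion: the caller seeds ~0 and
 * inverts at the end, like crc32c()/btrfs_csum_final() in the kernel */
static uint32_t crc32c_update(uint32_t crc, const uint8_t *data, size_t len)
{
	while (len--) {
		int k;

		crc ^= *data++;
		for (k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/* checksum a metadata block given as a vector of mapped pages, skipping
 * the checksum field itself at the start of page 0 */
static uint32_t csum_metablock(uint8_t **datav, unsigned int num_pages)
{
	uint32_t crc = ~(uint32_t)0;
	unsigned int i;

	for (i = 0; i < num_pages; i++) {
		const uint8_t *data = i ? datav[i] : datav[i] + BTRFS_CSUM_SZ;
		size_t sublen = i ? PAGE_SZ : PAGE_SZ - BTRFS_CSUM_SZ;

		crc = crc32c_update(crc, data, sublen);
	}
	return ~crc;	/* btrfs_csum_final() */
}

int main(void)
{
	static uint8_t page0[PAGE_SZ], page1[PAGE_SZ];
	uint8_t *datav[2] = { page0, page1 };

	memset(page0 + BTRFS_CSUM_SZ, 0xab, PAGE_SZ - BTRFS_CSUM_SZ);
	memset(page1, 0xab, PAGE_SZ);
	printf("csum = 0x%08x\n", csum_metablock(datav, 2));
	return 0;
}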
1640 1850
1641static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, 1851static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1642 u64 dev_bytenr, 1852 u64 dev_bytenr, char **mapped_datav,
1643 u8 *mapped_data, unsigned int len, 1853 unsigned int num_pages,
1644 struct bio *bio, 1854 struct bio *bio, int *bio_is_patched,
1645 int *bio_is_patched,
1646 struct buffer_head *bh, 1855 struct buffer_head *bh,
1647 int submit_bio_bh_rw) 1856 int submit_bio_bh_rw)
1648{ 1857{
@@ -1652,12 +1861,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1652 int ret; 1861 int ret;
1653 struct btrfsic_state *state = dev_state->state; 1862 struct btrfsic_state *state = dev_state->state;
1654 struct block_device *bdev = dev_state->bdev; 1863 struct block_device *bdev = dev_state->bdev;
1864 unsigned int processed_len;
1655 1865
1656 WARN_ON(len > PAGE_SIZE);
1657 is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
1658 if (NULL != bio_is_patched) 1866 if (NULL != bio_is_patched)
1659 *bio_is_patched = 0; 1867 *bio_is_patched = 0;
1660 1868
1869again:
1870 if (num_pages == 0)
1871 return;
1872
1873 processed_len = 0;
1874 is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
1875 num_pages));
1876
1661 block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, 1877 block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
1662 &state->block_hashtable); 1878 &state->block_hashtable);
1663 if (NULL != block) { 1879 if (NULL != block) {
@@ -1667,8 +1883,16 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1667 1883
1668 if (block->is_superblock) { 1884 if (block->is_superblock) {
1669 bytenr = le64_to_cpu(((struct btrfs_super_block *) 1885 bytenr = le64_to_cpu(((struct btrfs_super_block *)
1670 mapped_data)->bytenr); 1886 mapped_datav[0])->bytenr);
1887 if (num_pages * PAGE_CACHE_SIZE <
1888 BTRFS_SUPER_INFO_SIZE) {
1889 printk(KERN_INFO
1890 "btrfsic: cannot work with too short bios!\n");
1891 return;
1892 }
1671 is_metadata = 1; 1893 is_metadata = 1;
1894 BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
1895 processed_len = BTRFS_SUPER_INFO_SIZE;
1672 if (state->print_mask & 1896 if (state->print_mask &
1673 BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { 1897 BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
1674 printk(KERN_INFO 1898 printk(KERN_INFO
@@ -1678,12 +1902,18 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1678 } 1902 }
1679 if (is_metadata) { 1903 if (is_metadata) {
1680 if (!block->is_superblock) { 1904 if (!block->is_superblock) {
1905 if (num_pages * PAGE_CACHE_SIZE <
1906 state->metablock_size) {
1907 printk(KERN_INFO
1908 "btrfsic: cannot work with too short bios!\n");
1909 return;
1910 }
1911 processed_len = state->metablock_size;
1681 bytenr = le64_to_cpu(((struct btrfs_header *) 1912 bytenr = le64_to_cpu(((struct btrfs_header *)
1682 mapped_data)->bytenr); 1913 mapped_datav[0])->bytenr);
1683 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, 1914 btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
1684 dev_state, 1915 dev_state,
1685 dev_bytenr, 1916 dev_bytenr);
1686 mapped_data);
1687 } 1917 }
1688 if (block->logical_bytenr != bytenr) { 1918 if (block->logical_bytenr != bytenr) {
1689 printk(KERN_INFO 1919 printk(KERN_INFO
@@ -1710,6 +1940,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1710 block->mirror_num, 1940 block->mirror_num,
1711 btrfsic_get_block_type(state, block)); 1941 btrfsic_get_block_type(state, block));
1712 } else { 1942 } else {
1943 if (num_pages * PAGE_CACHE_SIZE <
1944 state->datablock_size) {
1945 printk(KERN_INFO
1946 "btrfsic: cannot work with too short bios!\n");
1947 return;
1948 }
1949 processed_len = state->datablock_size;
1713 bytenr = block->logical_bytenr; 1950 bytenr = block->logical_bytenr;
1714 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 1951 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1715 printk(KERN_INFO 1952 printk(KERN_INFO
@@ -1747,7 +1984,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1747 le64_to_cpu(block->disk_key.offset), 1984 le64_to_cpu(block->disk_key.offset),
1748 (unsigned long long) 1985 (unsigned long long)
1749 le64_to_cpu(((struct btrfs_header *) 1986 le64_to_cpu(((struct btrfs_header *)
1750 mapped_data)->generation), 1987 mapped_datav[0])->generation),
1751 (unsigned long long) 1988 (unsigned long long)
1752 state->max_superblock_generation); 1989 state->max_superblock_generation);
1753 btrfsic_dump_tree(state); 1990 btrfsic_dump_tree(state);
@@ -1765,10 +2002,10 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1765 (unsigned long long)block->generation, 2002 (unsigned long long)block->generation,
1766 (unsigned long long) 2003 (unsigned long long)
1767 le64_to_cpu(((struct btrfs_header *) 2004 le64_to_cpu(((struct btrfs_header *)
1768 mapped_data)->generation)); 2005 mapped_datav[0])->generation));
1769 /* it would not be safe to go on */ 2006 /* it would not be safe to go on */
1770 btrfsic_dump_tree(state); 2007 btrfsic_dump_tree(state);
1771 return; 2008 goto continue_loop;
1772 } 2009 }
1773 2010
1774 /* 2011 /*
@@ -1796,18 +2033,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1796 } 2033 }
1797 2034
1798 if (block->is_superblock) 2035 if (block->is_superblock)
1799 ret = btrfsic_map_superblock(state, bytenr, len, 2036 ret = btrfsic_map_superblock(state, bytenr,
2037 processed_len,
1800 bdev, &block_ctx); 2038 bdev, &block_ctx);
1801 else 2039 else
1802 ret = btrfsic_map_block(state, bytenr, len, 2040 ret = btrfsic_map_block(state, bytenr, processed_len,
1803 &block_ctx, 0); 2041 &block_ctx, 0);
1804 if (ret) { 2042 if (ret) {
1805 printk(KERN_INFO 2043 printk(KERN_INFO
1806 "btrfsic: btrfsic_map_block(root @%llu)" 2044 "btrfsic: btrfsic_map_block(root @%llu)"
1807 " failed!\n", (unsigned long long)bytenr); 2045 " failed!\n", (unsigned long long)bytenr);
1808 return; 2046 goto continue_loop;
1809 } 2047 }
1810 block_ctx.data = mapped_data; 2048 block_ctx.datav = mapped_datav;
1811 /* the following is required in case of writes to mirrors, 2049 /* the following is required in case of writes to mirrors,
1812 * use the same that was used for the lookup */ 2050 * use the same that was used for the lookup */
1813 block_ctx.dev = dev_state; 2051 block_ctx.dev = dev_state;
@@ -1863,11 +2101,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1863 block->logical_bytenr = bytenr; 2101 block->logical_bytenr = bytenr;
1864 block->is_metadata = 1; 2102 block->is_metadata = 1;
1865 if (block->is_superblock) { 2103 if (block->is_superblock) {
2104 BUG_ON(PAGE_CACHE_SIZE !=
2105 BTRFS_SUPER_INFO_SIZE);
1866 ret = btrfsic_process_written_superblock( 2106 ret = btrfsic_process_written_superblock(
1867 state, 2107 state,
1868 block, 2108 block,
1869 (struct btrfs_super_block *) 2109 (struct btrfs_super_block *)
1870 mapped_data); 2110 mapped_datav[0]);
1871 if (state->print_mask & 2111 if (state->print_mask &
1872 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { 2112 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
1873 printk(KERN_INFO 2113 printk(KERN_INFO
@@ -1880,8 +2120,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1880 state, 2120 state,
1881 block, 2121 block,
1882 &block_ctx, 2122 &block_ctx,
1883 (struct btrfs_header *)
1884 block_ctx.data,
1885 0, 0); 2123 0, 0);
1886 } 2124 }
1887 if (ret) 2125 if (ret)
@@ -1912,26 +2150,30 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1912 u64 bytenr; 2150 u64 bytenr;
1913 2151
1914 if (!is_metadata) { 2152 if (!is_metadata) {
2153 processed_len = state->datablock_size;
1915 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 2154 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1916 printk(KERN_INFO "Written block (%s/%llu/?)" 2155 printk(KERN_INFO "Written block (%s/%llu/?)"
1917 " !found in hash table, D.\n", 2156 " !found in hash table, D.\n",
1918 dev_state->name, 2157 dev_state->name,
1919 (unsigned long long)dev_bytenr); 2158 (unsigned long long)dev_bytenr);
1920 if (!state->include_extent_data) 2159 if (!state->include_extent_data) {
1921 return; /* ignore that written D block */ 2160 /* ignore that written D block */
2161 goto continue_loop;
2162 }
1922 2163
1923 /* this is getting ugly for the 2164 /* this is getting ugly for the
1924 * include_extent_data case... */ 2165 * include_extent_data case... */
1925 bytenr = 0; /* unknown */ 2166 bytenr = 0; /* unknown */
1926 block_ctx.start = bytenr; 2167 block_ctx.start = bytenr;
1927 block_ctx.len = len; 2168 block_ctx.len = processed_len;
1928 block_ctx.bh = NULL; 2169 block_ctx.mem_to_free = NULL;
2170 block_ctx.pagev = NULL;
1929 } else { 2171 } else {
2172 processed_len = state->metablock_size;
1930 bytenr = le64_to_cpu(((struct btrfs_header *) 2173 bytenr = le64_to_cpu(((struct btrfs_header *)
1931 mapped_data)->bytenr); 2174 mapped_datav[0])->bytenr);
1932 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, 2175 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
1933 dev_bytenr, 2176 dev_bytenr);
1934 mapped_data);
1935 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 2177 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1936 printk(KERN_INFO 2178 printk(KERN_INFO
1937 "Written block @%llu (%s/%llu/?)" 2179 "Written block @%llu (%s/%llu/?)"
@@ -1940,17 +2182,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1940 dev_state->name, 2182 dev_state->name,
1941 (unsigned long long)dev_bytenr); 2183 (unsigned long long)dev_bytenr);
1942 2184
1943 ret = btrfsic_map_block(state, bytenr, len, &block_ctx, 2185 ret = btrfsic_map_block(state, bytenr, processed_len,
1944 0); 2186 &block_ctx, 0);
1945 if (ret) { 2187 if (ret) {
1946 printk(KERN_INFO 2188 printk(KERN_INFO
1947 "btrfsic: btrfsic_map_block(root @%llu)" 2189 "btrfsic: btrfsic_map_block(root @%llu)"
1948 " failed!\n", 2190 " failed!\n",
1949 (unsigned long long)dev_bytenr); 2191 (unsigned long long)dev_bytenr);
1950 return; 2192 goto continue_loop;
1951 } 2193 }
1952 } 2194 }
1953 block_ctx.data = mapped_data; 2195 block_ctx.datav = mapped_datav;
1954 /* the following is required in case of writes to mirrors, 2196 /* the following is required in case of writes to mirrors,
1955 * use the same that was used for the lookup */ 2197 * use the same that was used for the lookup */
1956 block_ctx.dev = dev_state; 2198 block_ctx.dev = dev_state;
@@ -1960,7 +2202,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1960 if (NULL == block) { 2202 if (NULL == block) {
1961 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); 2203 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1962 btrfsic_release_block_ctx(&block_ctx); 2204 btrfsic_release_block_ctx(&block_ctx);
1963 return; 2205 goto continue_loop;
1964 } 2206 }
1965 block->dev_state = dev_state; 2207 block->dev_state = dev_state;
1966 block->dev_bytenr = dev_bytenr; 2208 block->dev_bytenr = dev_bytenr;
@@ -2020,9 +2262,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
2020 2262
2021 if (is_metadata) { 2263 if (is_metadata) {
2022 ret = btrfsic_process_metablock(state, block, 2264 ret = btrfsic_process_metablock(state, block,
2023 &block_ctx, 2265 &block_ctx, 0, 0);
2024 (struct btrfs_header *)
2025 block_ctx.data, 0, 0);
2026 if (ret) 2266 if (ret)
2027 printk(KERN_INFO 2267 printk(KERN_INFO
2028 "btrfsic: process_metablock(root @%llu)" 2268 "btrfsic: process_metablock(root @%llu)"
@@ -2031,6 +2271,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
2031 } 2271 }
2032 btrfsic_release_block_ctx(&block_ctx); 2272 btrfsic_release_block_ctx(&block_ctx);
2033 } 2273 }
2274
2275continue_loop:
2276 BUG_ON(!processed_len);
2277 dev_bytenr += processed_len;
2278 mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
2279 num_pages -= processed_len >> PAGE_CACHE_SHIFT;
2280 goto again;
2034} 2281}
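
btrfsic_process_written_block() now consumes a whole vector of written pages, one superblock-, metadata- or data-sized chunk per pass, via the again:/continue_loop: goto pair. The control flow is equivalent to the following loop sketch; handle_one_block is a hypothetical stand-in for the function body, and the real code also returns outright on its fatal "too short bios" paths:

#include <stdio.h>

#define PAGE_SZ 4096

/* hypothetical stand-in for the body: decide how many bytes the block at
 * the front of the page vector covers and handle it; 0 means stop */
static unsigned int handle_one_block(unsigned long long dev_bytenr,
				     char **mapped_datav,
				     unsigned int num_pages)
{
	printf("block at %llu, %u page(s) left\n", dev_bytenr, num_pages);
	return PAGE_SZ;	/* pretend every block covers one page */
}

int main(void)
{
	static char pages[3][PAGE_SZ];
	char *datav[3] = { pages[0], pages[1], pages[2] };
	char **mapped_datav = datav;
	unsigned int num_pages = 3;
	unsigned long long dev_bytenr = 0;

	/* the again:/continue_loop: goto pair as a plain while loop */
	while (num_pages > 0) {
		unsigned int processed_len =
			handle_one_block(dev_bytenr, mapped_datav, num_pages);

		if (!processed_len)
			break;
		dev_bytenr += processed_len;
		mapped_datav += processed_len / PAGE_SZ;
		num_pages -= processed_len / PAGE_SZ;
	}
	return 0;
}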
2035 2282
2036static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) 2283static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
@@ -2213,7 +2460,7 @@ static int btrfsic_process_written_superblock(
2213 2460
2214 num_copies = 2461 num_copies =
2215 btrfs_num_copies(&state->root->fs_info->mapping_tree, 2462 btrfs_num_copies(&state->root->fs_info->mapping_tree,
2216 next_bytenr, PAGE_SIZE); 2463 next_bytenr, BTRFS_SUPER_INFO_SIZE);
2217 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 2464 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
2218 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 2465 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
2219 (unsigned long long)next_bytenr, num_copies); 2466 (unsigned long long)next_bytenr, num_copies);
@@ -2224,7 +2471,8 @@ static int btrfsic_process_written_superblock(
2224 printk(KERN_INFO 2471 printk(KERN_INFO
2225 "btrfsic_process_written_superblock(" 2472 "btrfsic_process_written_superblock("
2226 "mirror_num=%d)\n", mirror_num); 2473 "mirror_num=%d)\n", mirror_num);
2227 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, 2474 ret = btrfsic_map_block(state, next_bytenr,
2475 BTRFS_SUPER_INFO_SIZE,
2228 &tmp_next_block_ctx, 2476 &tmp_next_block_ctx,
2229 mirror_num); 2477 mirror_num);
2230 if (ret) { 2478 if (ret) {
@@ -2689,7 +2937,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
2689static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, 2937static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2690 u64 bytenr, 2938 u64 bytenr,
2691 struct btrfsic_dev_state *dev_state, 2939 struct btrfsic_dev_state *dev_state,
2692 u64 dev_bytenr, char *data) 2940 u64 dev_bytenr)
2693{ 2941{
2694 int num_copies; 2942 int num_copies;
2695 int mirror_num; 2943 int mirror_num;
@@ -2698,10 +2946,10 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2698 int match = 0; 2946 int match = 0;
2699 2947
2700 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, 2948 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
2701 bytenr, PAGE_SIZE); 2949 bytenr, state->metablock_size);
2702 2950
2703 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { 2951 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2704 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, 2952 ret = btrfsic_map_block(state, bytenr, state->metablock_size,
2705 &block_ctx, mirror_num); 2953 &block_ctx, mirror_num);
2706 if (ret) { 2954 if (ret) {
2707 printk(KERN_INFO "btrfsic:" 2955 printk(KERN_INFO "btrfsic:"
@@ -2727,7 +2975,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2727 (unsigned long long)bytenr, dev_state->name, 2975 (unsigned long long)bytenr, dev_state->name,
2728 (unsigned long long)dev_bytenr); 2976 (unsigned long long)dev_bytenr);
2729 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { 2977 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2730 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, 2978 ret = btrfsic_map_block(state, bytenr,
2979 state->metablock_size,
2731 &block_ctx, mirror_num); 2980 &block_ctx, mirror_num);
2732 if (ret) 2981 if (ret)
2733 continue; 2982 continue;
@@ -2781,13 +3030,13 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
2781 (unsigned long)bh->b_size, bh->b_data, 3030 (unsigned long)bh->b_size, bh->b_data,
2782 bh->b_bdev); 3031 bh->b_bdev);
2783 btrfsic_process_written_block(dev_state, dev_bytenr, 3032 btrfsic_process_written_block(dev_state, dev_bytenr,
2784 bh->b_data, bh->b_size, NULL, 3033 &bh->b_data, 1, NULL,
2785 NULL, bh, rw); 3034 NULL, bh, rw);
2786 } else if (NULL != dev_state && (rw & REQ_FLUSH)) { 3035 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2787 if (dev_state->state->print_mask & 3036 if (dev_state->state->print_mask &
2788 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 3037 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2789 printk(KERN_INFO 3038 printk(KERN_INFO
2790 "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", 3039 "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
2791 rw, bh->b_bdev); 3040 rw, bh->b_bdev);
2792 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { 3041 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2793 if ((dev_state->state->print_mask & 3042 if ((dev_state->state->print_mask &
@@ -2836,6 +3085,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
2836 unsigned int i; 3085 unsigned int i;
2837 u64 dev_bytenr; 3086 u64 dev_bytenr;
2838 int bio_is_patched; 3087 int bio_is_patched;
3088 char **mapped_datav;
2839 3089
2840 dev_bytenr = 512 * bio->bi_sector; 3090 dev_bytenr = 512 * bio->bi_sector;
2841 bio_is_patched = 0; 3091 bio_is_patched = 0;
@@ -2848,35 +3098,46 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
2848 (unsigned long long)dev_bytenr, 3098 (unsigned long long)dev_bytenr,
2849 bio->bi_bdev); 3099 bio->bi_bdev);
2850 3100
3101 mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
3102 GFP_NOFS);
3103 if (!mapped_datav)
3104 goto leave;
2851 for (i = 0; i < bio->bi_vcnt; i++) { 3105 for (i = 0; i < bio->bi_vcnt; i++) {
2852 u8 *mapped_data; 3106 BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
2853 3107 mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
2854 mapped_data = kmap(bio->bi_io_vec[i].bv_page); 3108 if (!mapped_datav[i]) {
3109 while (i > 0) {
3110 i--;
3111 kunmap(bio->bi_io_vec[i].bv_page);
3112 }
3113 kfree(mapped_datav);
3114 goto leave;
3115 }
2855 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | 3116 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2856 BTRFSIC_PRINT_MASK_VERBOSE) == 3117 BTRFSIC_PRINT_MASK_VERBOSE) ==
2857 (dev_state->state->print_mask & 3118 (dev_state->state->print_mask &
2858 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | 3119 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2859 BTRFSIC_PRINT_MASK_VERBOSE))) 3120 BTRFSIC_PRINT_MASK_VERBOSE)))
2860 printk(KERN_INFO 3121 printk(KERN_INFO
2861 "#%u: page=%p, mapped=%p, len=%u," 3122 "#%u: page=%p, len=%u, offset=%u\n",
2862 " offset=%u\n",
2863 i, bio->bi_io_vec[i].bv_page, 3123 i, bio->bi_io_vec[i].bv_page,
2864 mapped_data,
2865 bio->bi_io_vec[i].bv_len, 3124 bio->bi_io_vec[i].bv_len,
2866 bio->bi_io_vec[i].bv_offset); 3125 bio->bi_io_vec[i].bv_offset);
2867 btrfsic_process_written_block(dev_state, dev_bytenr, 3126 }
2868 mapped_data, 3127 btrfsic_process_written_block(dev_state, dev_bytenr,
2869 bio->bi_io_vec[i].bv_len, 3128 mapped_datav, bio->bi_vcnt,
2870 bio, &bio_is_patched, 3129 bio, &bio_is_patched,
2871 NULL, rw); 3130 NULL, rw);
3131 while (i > 0) {
3132 i--;
2872 kunmap(bio->bi_io_vec[i].bv_page); 3133 kunmap(bio->bi_io_vec[i].bv_page);
2873 dev_bytenr += bio->bi_io_vec[i].bv_len;
2874 } 3134 }
3135 kfree(mapped_datav);
2875 } else if (NULL != dev_state && (rw & REQ_FLUSH)) { 3136 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2876 if (dev_state->state->print_mask & 3137 if (dev_state->state->print_mask &
2877 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 3138 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2878 printk(KERN_INFO 3139 printk(KERN_INFO
2879 "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", 3140 "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
2880 rw, bio->bi_bdev); 3141 rw, bio->bi_bdev);
2881 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { 3142 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2882 if ((dev_state->state->print_mask & 3143 if ((dev_state->state->print_mask &
@@ -2903,6 +3164,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
2903 bio->bi_end_io = btrfsic_bio_end_io; 3164 bio->bi_end_io = btrfsic_bio_end_io;
2904 } 3165 }
2905 } 3166 }
3167leave:
2906 mutex_unlock(&btrfsic_mutex); 3168 mutex_unlock(&btrfsic_mutex);
2907 3169
2908 submit_bio(rw, bio); 3170 submit_bio(rw, bio);
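
btrfsic_submit_bio() now maps every page of the bio up front and, should a kmap() in the middle of the loop fail, unwinds the mappings already taken in reverse order before giving up. The same acquire-all-or-unwind pattern in a userspace sketch, with malloc/free standing in for kmap/kunmap:

#include <stdlib.h>

#define PAGE_SZ 4096

/* acquire one mapping per page; if one fails, release those already
 * acquired in reverse order (the kunmap() unwind loop in the patch) */
static char **map_all(unsigned int n)
{
	char **v = malloc(n * sizeof(*v));
	unsigned int i;

	if (!v)
		return NULL;
	for (i = 0; i < n; i++) {
		v[i] = malloc(PAGE_SZ);	/* stand-in for kmap(page) */
		if (!v[i]) {
			while (i > 0) {
				i--;
				free(v[i]);	/* stand-in for kunmap() */
			}
			free(v);
			return NULL;
		}
	}
	return v;
}

int main(void)
{
	char **v = map_all(4);
	unsigned int i;

	if (!v)
		return 1;
	for (i = 4; i > 0; i--)
		free(v[i - 1]);
	free(v);
	return 0;
}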
@@ -2917,6 +3179,30 @@ int btrfsic_mount(struct btrfs_root *root,
2917 struct list_head *dev_head = &fs_devices->devices; 3179 struct list_head *dev_head = &fs_devices->devices;
2918 struct btrfs_device *device; 3180 struct btrfs_device *device;
2919 3181
3182 if (root->nodesize != root->leafsize) {
3183 printk(KERN_INFO
3184 "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
3185 root->nodesize, root->leafsize);
3186 return -1;
3187 }
3188 if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
3189 printk(KERN_INFO
3190 "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
3191 root->nodesize, (unsigned long)PAGE_CACHE_SIZE);
3192 return -1;
3193 }
3194 if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
3195 printk(KERN_INFO
3196 "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
3197 root->leafsize, (unsigned long)PAGE_CACHE_SIZE);
3198 return -1;
3199 }
3200 if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
3201 printk(KERN_INFO
3202 "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
3203 root->sectorsize, (unsigned long)PAGE_CACHE_SIZE);
3204 return -1;
3205 }
2920 state = kzalloc(sizeof(*state), GFP_NOFS); 3206 state = kzalloc(sizeof(*state), GFP_NOFS);
2921 if (NULL == state) { 3207 if (NULL == state) {
2922 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); 3208 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
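
The new btrfsic_mount() checks reject any nodesize/leafsize/sectorsize that is not a whole multiple of PAGE_CACHE_SIZE, using the usual power-of-two trick: for a power-of-two p, x is a multiple of p exactly when (x & (p - 1)) == 0. A tiny demonstration:

#include <stdio.h>

/* for a power-of-two p, (x & (p - 1)) == 0 iff x is a multiple of p */
static int is_multiple_of_pow2(unsigned long x, unsigned long p)
{
	return (x & (p - 1)) == 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       is_multiple_of_pow2(16384, 4096),	/* 1: accepted */
	       is_multiple_of_pow2(4096, 4096),		/* 1: accepted */
	       is_multiple_of_pow2(6144, 4096));	/* 0: rejected */
	return 0;
}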
@@ -2933,6 +3219,8 @@ int btrfsic_mount(struct btrfs_root *root,
2933 state->print_mask = print_mask; 3219 state->print_mask = print_mask;
2934 state->include_extent_data = including_extent_data; 3220 state->include_extent_data = including_extent_data;
2935 state->csum_size = 0; 3221 state->csum_size = 0;
3222 state->metablock_size = root->nodesize;
3223 state->datablock_size = root->sectorsize;
2936 INIT_LIST_HEAD(&state->all_blocks_list); 3224 INIT_LIST_HEAD(&state->all_blocks_list);
2937 btrfsic_block_hashtable_init(&state->block_hashtable); 3225 btrfsic_block_hashtable_init(&state->block_hashtable);
2938 btrfsic_block_link_hashtable_init(&state->block_link_hashtable); 3226 btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
@@ -3049,7 +3337,7 @@ void btrfsic_unmount(struct btrfs_root *root,
3049 btrfsic_block_link_free(l); 3337 btrfsic_block_link_free(l);
3050 } 3338 }
3051 3339
3052 if (b_all->is_iodone) 3340 if (b_all->is_iodone || b_all->never_written)
3053 btrfsic_block_free(b_all); 3341 btrfsic_block_free(b_all);
3054 else 3342 else
3055 printk(KERN_INFO "btrfs: attempt to free %c-block" 3343 printk(KERN_INFO "btrfs: attempt to free %c-block"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4106264fbc65..d7a96cfdc50a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/rbtree.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "transaction.h" 24#include "transaction.h"
@@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
37 struct extent_buffer *dst_buf, 38 struct extent_buffer *dst_buf,
38 struct extent_buffer *src_buf); 39 struct extent_buffer *src_buf);
39static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
40 struct btrfs_path *path, int level, int slot); 41 struct btrfs_path *path, int level, int slot,
42 int tree_mod_log);
43static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
44 struct extent_buffer *eb);
45struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
46 u32 blocksize, u64 parent_transid,
47 u64 time_seq);
48struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
49 u64 bytenr, u32 blocksize,
50 u64 time_seq);
41 51
42struct btrfs_path *btrfs_alloc_path(void) 52struct btrfs_path *btrfs_alloc_path(void)
43{ 53{
@@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
255 265
256 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 266 cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
257 new_root_objectid, &disk_key, level, 267 new_root_objectid, &disk_key, level,
258 buf->start, 0, 1); 268 buf->start, 0);
259 if (IS_ERR(cow)) 269 if (IS_ERR(cow))
260 return PTR_ERR(cow); 270 return PTR_ERR(cow);
261 271
@@ -288,6 +298,434 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
288 return 0; 298 return 0;
289} 299}
290 300
301enum mod_log_op {
302 MOD_LOG_KEY_REPLACE,
303 MOD_LOG_KEY_ADD,
304 MOD_LOG_KEY_REMOVE,
305 MOD_LOG_KEY_REMOVE_WHILE_FREEING,
306 MOD_LOG_KEY_REMOVE_WHILE_MOVING,
307 MOD_LOG_MOVE_KEYS,
308 MOD_LOG_ROOT_REPLACE,
309};
310
311struct tree_mod_move {
312 int dst_slot;
313 int nr_items;
314};
315
316struct tree_mod_root {
317 u64 logical;
318 u8 level;
319};
320
321struct tree_mod_elem {
322 struct rb_node node;
323 u64 index; /* shifted logical */
324 struct seq_list elem;
325 enum mod_log_op op;
326
327 /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
328 int slot;
329
330 /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
331 u64 generation;
332
333 /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
334 struct btrfs_disk_key key;
335 u64 blockptr;
336
337 /* this is used for op == MOD_LOG_MOVE_KEYS */
338 struct tree_mod_move move;
339
340 /* this is used for op == MOD_LOG_ROOT_REPLACE */
341 struct tree_mod_root old_root;
342};
343
344static inline void
345__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
346{
347 elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
348 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
349}
350
351void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
352 struct seq_list *elem)
353{
354 elem->flags = 1;
355 spin_lock(&fs_info->tree_mod_seq_lock);
356 __get_tree_mod_seq(fs_info, elem);
357 spin_unlock(&fs_info->tree_mod_seq_lock);
358}
359
360void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
361 struct seq_list *elem)
362{
363 struct rb_root *tm_root;
364 struct rb_node *node;
365 struct rb_node *next;
366 struct seq_list *cur_elem;
367 struct tree_mod_elem *tm;
368 u64 min_seq = (u64)-1;
369 u64 seq_putting = elem->seq;
370
371 if (!seq_putting)
372 return;
373
374 BUG_ON(!(elem->flags & 1));
375 spin_lock(&fs_info->tree_mod_seq_lock);
376 list_del(&elem->list);
377
378 list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
379 if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
380 if (seq_putting > cur_elem->seq) {
381 /*
382 * blocker with lower sequence number exists, we
383 * cannot remove anything from the log
384 */
385 goto out;
386 }
387 min_seq = cur_elem->seq;
388 }
389 }
390
391 /*
392 * anything that's lower than the lowest existing (read: blocked)
393 * sequence number can be removed from the tree.
394 */
395 write_lock(&fs_info->tree_mod_log_lock);
396 tm_root = &fs_info->tree_mod_log;
397 for (node = rb_first(tm_root); node; node = next) {
398 next = rb_next(node);
399 tm = container_of(node, struct tree_mod_elem, node);
400 if (tm->elem.seq > min_seq)
401 continue;
402 rb_erase(node, tm_root);
403 list_del(&tm->elem.list);
404 kfree(tm);
405 }
406 write_unlock(&fs_info->tree_mod_log_lock);
407out:
408 spin_unlock(&fs_info->tree_mod_seq_lock);
409}
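
btrfs_put_tree_mod_seq() may only prune tree-mod-log entries that no remaining blocker can still need: it computes the minimum sequence number over the active blockers and erases every log element at or below it. A sketch of that pruning rule on a plain array, with the locking and the rbtree omitted:

#include <stdio.h>

#define N_LOG 6

/* drop every log entry with seq <= the smallest active blocker seq,
 * keeping the rest in order; returns the number of entries kept */
static unsigned int prune_log(unsigned long long *log_seq,
			      unsigned int n_log,
			      const unsigned long long *blockers,
			      unsigned int n_blk)
{
	unsigned long long min_seq = ~0ULL;
	unsigned int i, kept = 0;

	for (i = 0; i < n_blk; i++)
		if (blockers[i] < min_seq)
			min_seq = blockers[i];
	for (i = 0; i < n_log; i++)
		if (log_seq[i] > min_seq)	/* still needed, keep it */
			log_seq[kept++] = log_seq[i];
	return kept;
}

int main(void)
{
	unsigned long long log[N_LOG] = { 1, 2, 3, 4, 5, 6 };
	unsigned long long blockers[] = { 4 };	/* one blocker at seq 4 */
	unsigned int i, kept = prune_log(log, N_LOG, blockers, 1);

	for (i = 0; i < kept; i++)
		printf("%llu ", log[i]);	/* prints: 5 6 */
	printf("\n");
	return 0;
}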
410
411/*
412 * key order of the log:
413 * index -> sequence
414 *
415 * the index is the shifted logical of the *new* root node for root replace
416 * operations, or the shifted logical of the affected block for all other
417 * operations.
418 */
419static noinline int
420__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
421{
422 struct rb_root *tm_root;
423 struct rb_node **new;
424 struct rb_node *parent = NULL;
425 struct tree_mod_elem *cur;
426 int ret = 0;
427
428 BUG_ON(!tm || !tm->elem.seq);
429
430 write_lock(&fs_info->tree_mod_log_lock);
431 tm_root = &fs_info->tree_mod_log;
432 new = &tm_root->rb_node;
433 while (*new) {
434 cur = container_of(*new, struct tree_mod_elem, node);
435 parent = *new;
436 if (cur->index < tm->index)
437 new = &((*new)->rb_left);
438 else if (cur->index > tm->index)
439 new = &((*new)->rb_right);
440 else if (cur->elem.seq < tm->elem.seq)
441 new = &((*new)->rb_left);
442 else if (cur->elem.seq > tm->elem.seq)
443 new = &((*new)->rb_right);
444 else {
445 kfree(tm);
446 ret = -EEXIST;
447 goto unlock;
448 }
449 }
450
451 rb_link_node(&tm->node, parent, new);
452 rb_insert_color(&tm->node, tm_root);
453unlock:
454 write_unlock(&fs_info->tree_mod_log_lock);
455 return ret;
456}
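
__tree_mod_log_insert() keeps the log in an rbtree keyed by the composite (index, seq) pair, with exact duplicates rejected as -EEXIST. The descent loop above inlines the comparison; expressed as a conventional three-way comparator it looks like this (a sketch of the ordering only, ignoring which child direction the kernel's walk happens to pick):

#include <stdio.h>

struct mod_elem {
	unsigned long long index;	/* logical address >> PAGE_CACHE_SHIFT */
	unsigned long long seq;		/* time sequence number */
};

/* primary key: index; secondary key: seq; 0 means duplicate (-EEXIST) */
static int mod_elem_cmp(const struct mod_elem *a, const struct mod_elem *b)
{
	if (a->index != b->index)
		return a->index < b->index ? -1 : 1;
	if (a->seq != b->seq)
		return a->seq < b->seq ? -1 : 1;
	return 0;
}

int main(void)
{
	struct mod_elem x = { 10, 3 }, y = { 10, 5 }, z = { 10, 3 };

	printf("%d %d\n", mod_elem_cmp(&x, &y),	/* -1: x sorts first */
	       mod_elem_cmp(&x, &z));		/*  0: duplicate */
	return 0;
}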
457
458static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
459 struct extent_buffer *eb) {
460 smp_mb();
461 if (list_empty(&(fs_info)->tree_mod_seq_list))
462 return 1;
463 if (!eb)
464 return 0;
465 if (btrfs_header_level(eb) == 0)
466 return 1;
467 return 0;
468}
469
470static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
471 struct tree_mod_elem **tm_ret)
472{
473 struct tree_mod_elem *tm;
474 int seq;
475
476 if (tree_mod_dont_log(fs_info, NULL))
477 return 0;
478
479 tm = *tm_ret = kzalloc(sizeof(*tm), flags);
480 if (!tm)
481 return -ENOMEM;
482
483 tm->elem.flags = 0;
484 spin_lock(&fs_info->tree_mod_seq_lock);
485 if (list_empty(&fs_info->tree_mod_seq_list)) {
486 /*
487 * someone emptied the list while we were waiting for the lock.
488 * we must not add to the list, because no blocker exists. items
489 * are removed from the list only when the existing blocker is
490 * removed from the list.
491 */
492 kfree(tm);
493 seq = 0;
494 } else {
495 __get_tree_mod_seq(fs_info, &tm->elem);
496 seq = tm->elem.seq;
497 }
498 spin_unlock(&fs_info->tree_mod_seq_lock);
499
500 return seq;
501}
502
503static noinline int
504tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
505 struct extent_buffer *eb, int slot,
506 enum mod_log_op op, gfp_t flags)
507{
508 struct tree_mod_elem *tm;
509 int ret;
510
511 ret = tree_mod_alloc(fs_info, flags, &tm);
512 if (ret <= 0)
513 return ret;
514
515 tm->index = eb->start >> PAGE_CACHE_SHIFT;
516 if (op != MOD_LOG_KEY_ADD) {
517 btrfs_node_key(eb, &tm->key, slot);
518 tm->blockptr = btrfs_node_blockptr(eb, slot);
519 }
520 tm->op = op;
521 tm->slot = slot;
522 tm->generation = btrfs_node_ptr_generation(eb, slot);
523
524 return __tree_mod_log_insert(fs_info, tm);
525}
526
527static noinline int
528tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
529 int slot, enum mod_log_op op)
530{
531 return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
532}
533
534static noinline int
535tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
536 struct extent_buffer *eb, int dst_slot, int src_slot,
537 int nr_items, gfp_t flags)
538{
539 struct tree_mod_elem *tm;
540 int ret;
541 int i;
542
543 if (tree_mod_dont_log(fs_info, eb))
544 return 0;
545
546 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
547 ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
548 MOD_LOG_KEY_REMOVE_WHILE_MOVING);
549 BUG_ON(ret < 0);
550 }
551
552 ret = tree_mod_alloc(fs_info, flags, &tm);
553 if (ret <= 0)
554 return ret;
555
556 tm->index = eb->start >> PAGE_CACHE_SHIFT;
557 tm->slot = src_slot;
558 tm->move.dst_slot = dst_slot;
559 tm->move.nr_items = nr_items;
560 tm->op = MOD_LOG_MOVE_KEYS;
561
562 return __tree_mod_log_insert(fs_info, tm);
563}
564
565static noinline int
566tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
567 struct extent_buffer *old_root,
568 struct extent_buffer *new_root, gfp_t flags)
569{
570 struct tree_mod_elem *tm;
571 int ret;
572
573 ret = tree_mod_alloc(fs_info, flags, &tm);
574 if (ret <= 0)
575 return ret;
576
577 tm->index = new_root->start >> PAGE_CACHE_SHIFT;
578 tm->old_root.logical = old_root->start;
579 tm->old_root.level = btrfs_header_level(old_root);
580 tm->generation = btrfs_header_generation(old_root);
581 tm->op = MOD_LOG_ROOT_REPLACE;
582
583 return __tree_mod_log_insert(fs_info, tm);
584}
585
586static struct tree_mod_elem *
587__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
588 int smallest)
589{
590 struct rb_root *tm_root;
591 struct rb_node *node;
592 struct tree_mod_elem *cur = NULL;
593 struct tree_mod_elem *found = NULL;
594 u64 index = start >> PAGE_CACHE_SHIFT;
595
596 read_lock(&fs_info->tree_mod_log_lock);
597 tm_root = &fs_info->tree_mod_log;
598 node = tm_root->rb_node;
599 while (node) {
600 cur = container_of(node, struct tree_mod_elem, node);
601 if (cur->index < index) {
602 node = node->rb_left;
603 } else if (cur->index > index) {
604 node = node->rb_right;
605 } else if (cur->elem.seq < min_seq) {
606 node = node->rb_left;
607 } else if (!smallest) {
608 /* we want the node with the highest seq */
609 if (found)
610 BUG_ON(found->elem.seq > cur->elem.seq);
611 found = cur;
612 node = node->rb_left;
613 } else if (cur->elem.seq > min_seq) {
614 /* we want the node with the smallest seq */
615 if (found)
616 BUG_ON(found->elem.seq < cur->elem.seq);
617 found = cur;
618 node = node->rb_right;
619 } else {
620 found = cur;
621 break;
622 }
623 }
624 read_unlock(&fs_info->tree_mod_log_lock);
625
626 return found;
627}
628
629/*
630 * this returns the element from the log with the smallest time sequence
631 * value that's in the log (the oldest log item). any element with a time
632 * sequence lower than min_seq will be ignored.
633 */
634static struct tree_mod_elem *
635tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
636 u64 min_seq)
637{
638 return __tree_mod_log_search(fs_info, start, min_seq, 1);
639}
640
641/*
642 * this returns the element from the log with the largest time sequence
643 * value that's in the log (the most recent log item). any element with
644 * a time sequence lower than min_seq will be ignored.
645 */
646static struct tree_mod_elem *
647tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
648{
649 return __tree_mod_log_search(fs_info, start, min_seq, 0);
650}
651
652static inline void
653tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
654 struct extent_buffer *src, unsigned long dst_offset,
655 unsigned long src_offset, int nr_items)
656{
657 int ret;
658 int i;
659
660 if (tree_mod_dont_log(fs_info, NULL))
661 return;
662
663 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
664 return;
665
666 /* could this be sped up by using a single seq for all operations? */
667 for (i = 0; i < nr_items; i++) {
668 ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
669 MOD_LOG_KEY_REMOVE);
670 BUG_ON(ret < 0);
671 ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
672 MOD_LOG_KEY_ADD);
673 BUG_ON(ret < 0);
674 }
675}
676
677static inline void
678tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
679 int dst_offset, int src_offset, int nr_items)
680{
681 int ret;
682 ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
683 nr_items, GFP_NOFS);
684 BUG_ON(ret < 0);
685}
686
687static inline void
688tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
689 struct extent_buffer *eb,
690 struct btrfs_disk_key *disk_key, int slot, int atomic)
691{
692 int ret;
693
694 ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
695 MOD_LOG_KEY_REPLACE,
696 atomic ? GFP_ATOMIC : GFP_NOFS);
697 BUG_ON(ret < 0);
698}
699
700static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
701 struct extent_buffer *eb)
702{
703 int i;
704 int ret;
705 u32 nritems;
706
707 if (tree_mod_dont_log(fs_info, eb))
708 return;
709
710 nritems = btrfs_header_nritems(eb);
711 for (i = nritems - 1; i >= 0; i--) {
712 ret = tree_mod_log_insert_key(fs_info, eb, i,
713 MOD_LOG_KEY_REMOVE_WHILE_FREEING);
714 BUG_ON(ret < 0);
715 }
716}
717
718static inline void
719tree_mod_log_set_root_pointer(struct btrfs_root *root,
720 struct extent_buffer *new_root_node)
721{
722 int ret;
723 tree_mod_log_free_eb(root->fs_info, root->node);
724 ret = tree_mod_log_insert_root(root->fs_info, root->node,
725 new_root_node, GFP_NOFS);
726 BUG_ON(ret < 0);
727}
728
291/* 729/*
292 * check if the tree block can be shared by multiple trees 730 * check if the tree block can be shared by multiple trees
293 */ 731 */
@@ -409,6 +847,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
409 ret = btrfs_dec_ref(trans, root, buf, 1, 1); 847 ret = btrfs_dec_ref(trans, root, buf, 1, 1);
410 BUG_ON(ret); /* -ENOMEM */ 848 BUG_ON(ret); /* -ENOMEM */
411 } 849 }
850 /*
851 * don't log freeing in case we're freeing the root node, this
852 * is done by tree_mod_log_set_root_pointer later
853 */
854 if (buf != root->node && btrfs_header_level(buf) != 0)
855 tree_mod_log_free_eb(root->fs_info, buf);
412 clean_tree_block(trans, root, buf); 856 clean_tree_block(trans, root, buf);
413 *last_ref = 1; 857 *last_ref = 1;
414 } 858 }
@@ -467,7 +911,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
467 911
468 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 912 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
469 root->root_key.objectid, &disk_key, 913 root->root_key.objectid, &disk_key,
470 level, search_start, empty_size, 1); 914 level, search_start, empty_size);
471 if (IS_ERR(cow)) 915 if (IS_ERR(cow))
472 return PTR_ERR(cow); 916 return PTR_ERR(cow);
473 917
@@ -506,10 +950,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
506 parent_start = 0; 950 parent_start = 0;
507 951
508 extent_buffer_get(cow); 952 extent_buffer_get(cow);
953 tree_mod_log_set_root_pointer(root, cow);
509 rcu_assign_pointer(root->node, cow); 954 rcu_assign_pointer(root->node, cow);
510 955
511 btrfs_free_tree_block(trans, root, buf, parent_start, 956 btrfs_free_tree_block(trans, root, buf, parent_start,
512 last_ref, 1); 957 last_ref);
513 free_extent_buffer(buf); 958 free_extent_buffer(buf);
514 add_root_to_dirty_list(root); 959 add_root_to_dirty_list(root);
515 } else { 960 } else {
@@ -519,13 +964,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
519 parent_start = 0; 964 parent_start = 0;
520 965
521 WARN_ON(trans->transid != btrfs_header_generation(parent)); 966 WARN_ON(trans->transid != btrfs_header_generation(parent));
967 tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
968 MOD_LOG_KEY_REPLACE);
522 btrfs_set_node_blockptr(parent, parent_slot, 969 btrfs_set_node_blockptr(parent, parent_slot,
523 cow->start); 970 cow->start);
524 btrfs_set_node_ptr_generation(parent, parent_slot, 971 btrfs_set_node_ptr_generation(parent, parent_slot,
525 trans->transid); 972 trans->transid);
526 btrfs_mark_buffer_dirty(parent); 973 btrfs_mark_buffer_dirty(parent);
527 btrfs_free_tree_block(trans, root, buf, parent_start, 974 btrfs_free_tree_block(trans, root, buf, parent_start,
528 last_ref, 1); 975 last_ref);
529 } 976 }
530 if (unlock_orig) 977 if (unlock_orig)
531 btrfs_tree_unlock(buf); 978 btrfs_tree_unlock(buf);
@@ -535,6 +982,210 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
535 return 0; 982 return 0;
536} 983}
537 984
985/*
986 * returns the logical address of the oldest predecessor of the given root.
987 * entries older than time_seq are ignored.
988 */
989static struct tree_mod_elem *
990__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
991 struct btrfs_root *root, u64 time_seq)
992{
993 struct tree_mod_elem *tm;
994 struct tree_mod_elem *found = NULL;
995 u64 root_logical = root->node->start;
996 int looped = 0;
997
998 if (!time_seq)
999 return 0;
1000
1001 /*
1002 * the very last operation that's logged for a root is the replacement
1003 * operation (if it is replaced at all). this has the index of the *new*
1004 * root, making it the very first operation that's logged for this root.
1005 */
1006 while (1) {
1007 tm = tree_mod_log_search_oldest(fs_info, root_logical,
1008 time_seq);
1009 if (!looped && !tm)
1010 return 0;
1011 /*
1012 * we must have key remove operations in the log before the
1013 * replace operation.
1014 */
1015 BUG_ON(!tm);
1016
1017 if (tm->op != MOD_LOG_ROOT_REPLACE)
1018 break;
1019
1020 found = tm;
1021 root_logical = tm->old_root.logical;
1022 BUG_ON(root_logical == root->node->start);
1023 looped = 1;
1024 }
1025
1026 return found;
1027}
1028
1029/*
1030 * tm is a pointer to the first operation to rewind within eb. then, all
1031 * previous operations will be rewound (until we reach something older than
1032 * time_seq).
1033 */
1034static void
1035__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1036 struct tree_mod_elem *first_tm)
1037{
1038 u32 n;
1039 struct rb_node *next;
1040 struct tree_mod_elem *tm = first_tm;
1041 unsigned long o_dst;
1042 unsigned long o_src;
1043 unsigned long p_size = sizeof(struct btrfs_key_ptr);
1044
1045 n = btrfs_header_nritems(eb);
1046 while (tm && tm->elem.seq >= time_seq) {
1047 /*
1048 * all the operations are recorded with the operator used for
1049 * the modification. as we're going backwards, we do the
1050 * opposite of each operation here.
1051 */
1052 switch (tm->op) {
1053 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1054 BUG_ON(tm->slot < n);
1055 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1056 case MOD_LOG_KEY_REMOVE:
1057 btrfs_set_node_key(eb, &tm->key, tm->slot);
1058 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1059 btrfs_set_node_ptr_generation(eb, tm->slot,
1060 tm->generation);
1061 n++;
1062 break;
1063 case MOD_LOG_KEY_REPLACE:
1064 BUG_ON(tm->slot >= n);
1065 btrfs_set_node_key(eb, &tm->key, tm->slot);
1066 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1067 btrfs_set_node_ptr_generation(eb, tm->slot,
1068 tm->generation);
1069 break;
1070 case MOD_LOG_KEY_ADD:
1071 if (tm->slot != n - 1) {
1072 o_dst = btrfs_node_key_ptr_offset(tm->slot);
1073 o_src = btrfs_node_key_ptr_offset(tm->slot + 1);
1074 memmove_extent_buffer(eb, o_dst, o_src, p_size);
1075 }
1076 n--;
1077 break;
1078 case MOD_LOG_MOVE_KEYS:
1079 o_dst = btrfs_node_key_ptr_offset(tm->slot);
1080 o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1081 memmove_extent_buffer(eb, o_dst, o_src,
1082 tm->move.nr_items * p_size);
1083 break;
1084 case MOD_LOG_ROOT_REPLACE:
1085 /*
1086 * this operation is special. for roots, this must be
1087 * handled explicitly before rewinding.
1088 * for non-roots, this operation may exist if the node
1089 * was a root: root A -> child B; then A becomes empty and
1090 * B is promoted to the new root. in the mod log, we'll
1091 * have a root-replace operation for B, a tree block
1092 * that is not a root. we simply ignore that operation.
1093 */
1094 break;
1095 }
1096 next = rb_next(&tm->node);
1097 if (!next)
1098 break;
1099 tm = container_of(next, struct tree_mod_elem, node);
1100 if (tm->index != first_tm->index)
1101 break;
1102 }
1103 btrfs_set_header_nritems(eb, n);
1104}
1105
1106static struct extent_buffer *
1107tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1108 u64 time_seq)
1109{
1110 struct extent_buffer *eb_rewin;
1111 struct tree_mod_elem *tm;
1112
1113 if (!time_seq)
1114 return eb;
1115
1116 if (btrfs_header_level(eb) == 0)
1117 return eb;
1118
1119 tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1120 if (!tm)
1121 return eb;
1122
1123 if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1124 BUG_ON(tm->slot != 0);
1125 eb_rewin = alloc_dummy_extent_buffer(eb->start,
1126 fs_info->tree_root->nodesize);
1127 BUG_ON(!eb_rewin);
1128 btrfs_set_header_bytenr(eb_rewin, eb->start);
1129 btrfs_set_header_backref_rev(eb_rewin,
1130 btrfs_header_backref_rev(eb));
1131 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1132 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1133 } else {
1134 eb_rewin = btrfs_clone_extent_buffer(eb);
1135 BUG_ON(!eb_rewin);
1136 }
1137
1138 extent_buffer_get(eb_rewin);
1139 free_extent_buffer(eb);
1140
1141 __tree_mod_log_rewind(eb_rewin, time_seq, tm);
1142
1143 return eb_rewin;
1144}
1145
1146static inline struct extent_buffer *
1147get_old_root(struct btrfs_root *root, u64 time_seq)
1148{
1149 struct tree_mod_elem *tm;
1150 struct extent_buffer *eb;
1151 struct tree_mod_root *old_root;
1152 u64 old_generation;
1153
1154 tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
1155 if (!tm)
1156 return root->node;
1157
1158 old_root = &tm->old_root;
1159 old_generation = tm->generation;
1160
1161 tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq);
1162 /*
1163 * there was an item in the log when __tree_mod_log_oldest_root
1164 * returned. this one must not go away, because the time_seq passed to
1165 * us must be blocking its removal.
1166 */
1167 BUG_ON(!tm);
1168
1169 if (old_root->logical == root->node->start) {
1170 /* there are logged operations for the current root */
1171 eb = btrfs_clone_extent_buffer(root->node);
1172 } else {
1173 /* there's a root replace operation for the current root */
1174 eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT,
1175 root->nodesize);
1176 btrfs_set_header_bytenr(eb, eb->start);
1177 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1178 btrfs_set_header_owner(eb, root->root_key.objectid);
1179 }
1180 if (!eb)
1181 return NULL;
1182 btrfs_set_header_level(eb, old_root->level);
1183 btrfs_set_header_generation(eb, old_generation);
1184 __tree_mod_log_rewind(eb, time_seq, tm);
1185
1186 return eb;
1187}
1188
538static inline int should_cow_block(struct btrfs_trans_handle *trans, 1189static inline int should_cow_block(struct btrfs_trans_handle *trans,
539 struct btrfs_root *root, 1190 struct btrfs_root *root,
540 struct extent_buffer *buf) 1191 struct extent_buffer *buf)
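[Annotation] The rewind loop above is the heart of the feature: each logged operation is undone by its inverse, newest record first, until the buffer matches its state at time_seq; get_old_root() feeds it either a clone of the live root or a dummy buffer standing in for a replaced one. (Note that the dummy-buffer branch of get_old_root() sets header fields on eb before the !eb check, so a NULL return from alloc_dummy_extent_buffer() would be dereferenced first.) Below is a reduced userspace sketch of the inverse pairing on a plain array standing in for a node's key pointers; struct rec and its constants are assumptions for the example, with key, blockptr and generation collapsed into one value.

#include <string.h>

enum { REC_KEY_REPLACE, REC_KEY_ADD, REC_KEY_REMOVE, REC_MOVE_KEYS };

struct rec {
	int op;
	int slot;
	int dst_slot;	/* REC_MOVE_KEYS: where the keys were moved to */
	int nr_items;	/* REC_MOVE_KEYS: how many keys were moved */
	long payload;	/* saved key/blockptr/generation, collapsed */
};

/* undo one logged operation on a toy node of n slots; returns the new n */
static unsigned int undo_one(long *node, unsigned int n, const struct rec *tm)
{
	switch (tm->op) {
	case REC_KEY_REMOVE:
		node[tm->slot] = tm->payload;	/* re-add what was removed */
		n++;
		break;
	case REC_KEY_REPLACE:
		node[tm->slot] = tm->payload;	/* restore the old pointer */
		break;
	case REC_KEY_ADD:
		/* an add is undone by dropping the slot again (the kernel
		 * version also shifts trailing pointers down) */
		n--;
		break;
	case REC_MOVE_KEYS:
		/* the recorded move went slot -> dst_slot; move it back */
		memmove(&node[tm->slot], &node[tm->dst_slot],
			tm->nr_items * sizeof(*node));
		break;
	}
	return n;
}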
@@ -739,7 +1390,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
739 if (!cur) 1390 if (!cur)
740 return -EIO; 1391 return -EIO;
741 } else if (!uptodate) { 1392 } else if (!uptodate) {
742 btrfs_read_buffer(cur, gen); 1393 err = btrfs_read_buffer(cur, gen);
1394 if (err) {
1395 free_extent_buffer(cur);
1396 return err;
1397 }
743 } 1398 }
744 } 1399 }
745 if (search_start == 0) 1400 if (search_start == 0)
@@ -854,20 +1509,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
854static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, 1509static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
855 int level, int *slot) 1510 int level, int *slot)
856{ 1511{
857 if (level == 0) { 1512 if (level == 0)
858 return generic_bin_search(eb, 1513 return generic_bin_search(eb,
859 offsetof(struct btrfs_leaf, items), 1514 offsetof(struct btrfs_leaf, items),
860 sizeof(struct btrfs_item), 1515 sizeof(struct btrfs_item),
861 key, btrfs_header_nritems(eb), 1516 key, btrfs_header_nritems(eb),
862 slot); 1517 slot);
863 } else { 1518 else
864 return generic_bin_search(eb, 1519 return generic_bin_search(eb,
865 offsetof(struct btrfs_node, ptrs), 1520 offsetof(struct btrfs_node, ptrs),
866 sizeof(struct btrfs_key_ptr), 1521 sizeof(struct btrfs_key_ptr),
867 key, btrfs_header_nritems(eb), 1522 key, btrfs_header_nritems(eb),
868 slot); 1523 slot);
869 }
870 return -1;
871} 1524}
872 1525
873int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 1526int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
@@ -974,6 +1627,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
974 goto enospc; 1627 goto enospc;
975 } 1628 }
976 1629
1630 tree_mod_log_set_root_pointer(root, child);
977 rcu_assign_pointer(root->node, child); 1631 rcu_assign_pointer(root->node, child);
978 1632
979 add_root_to_dirty_list(root); 1633 add_root_to_dirty_list(root);
@@ -987,7 +1641,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
987 free_extent_buffer(mid); 1641 free_extent_buffer(mid);
988 1642
989 root_sub_used(root, mid->len); 1643 root_sub_used(root, mid->len);
990 btrfs_free_tree_block(trans, root, mid, 0, 1, 0); 1644 btrfs_free_tree_block(trans, root, mid, 0, 1);
991 /* once for the root ptr */ 1645 /* once for the root ptr */
992 free_extent_buffer_stale(mid); 1646 free_extent_buffer_stale(mid);
993 return 0; 1647 return 0;
@@ -1040,14 +1694,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1040 if (btrfs_header_nritems(right) == 0) { 1694 if (btrfs_header_nritems(right) == 0) {
1041 clean_tree_block(trans, root, right); 1695 clean_tree_block(trans, root, right);
1042 btrfs_tree_unlock(right); 1696 btrfs_tree_unlock(right);
1043 del_ptr(trans, root, path, level + 1, pslot + 1); 1697 del_ptr(trans, root, path, level + 1, pslot + 1, 1);
1044 root_sub_used(root, right->len); 1698 root_sub_used(root, right->len);
1045 btrfs_free_tree_block(trans, root, right, 0, 1, 0); 1699 btrfs_free_tree_block(trans, root, right, 0, 1);
1046 free_extent_buffer_stale(right); 1700 free_extent_buffer_stale(right);
1047 right = NULL; 1701 right = NULL;
1048 } else { 1702 } else {
1049 struct btrfs_disk_key right_key; 1703 struct btrfs_disk_key right_key;
1050 btrfs_node_key(right, &right_key, 0); 1704 btrfs_node_key(right, &right_key, 0);
1705 tree_mod_log_set_node_key(root->fs_info, parent,
1706 &right_key, pslot + 1, 0);
1051 btrfs_set_node_key(parent, &right_key, pslot + 1); 1707 btrfs_set_node_key(parent, &right_key, pslot + 1);
1052 btrfs_mark_buffer_dirty(parent); 1708 btrfs_mark_buffer_dirty(parent);
1053 } 1709 }
@@ -1082,15 +1738,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1082 if (btrfs_header_nritems(mid) == 0) { 1738 if (btrfs_header_nritems(mid) == 0) {
1083 clean_tree_block(trans, root, mid); 1739 clean_tree_block(trans, root, mid);
1084 btrfs_tree_unlock(mid); 1740 btrfs_tree_unlock(mid);
1085 del_ptr(trans, root, path, level + 1, pslot); 1741 del_ptr(trans, root, path, level + 1, pslot, 1);
1086 root_sub_used(root, mid->len); 1742 root_sub_used(root, mid->len);
1087 btrfs_free_tree_block(trans, root, mid, 0, 1, 0); 1743 btrfs_free_tree_block(trans, root, mid, 0, 1);
1088 free_extent_buffer_stale(mid); 1744 free_extent_buffer_stale(mid);
1089 mid = NULL; 1745 mid = NULL;
1090 } else { 1746 } else {
1091 /* update the parent key to reflect our changes */ 1747 /* update the parent key to reflect our changes */
1092 struct btrfs_disk_key mid_key; 1748 struct btrfs_disk_key mid_key;
1093 btrfs_node_key(mid, &mid_key, 0); 1749 btrfs_node_key(mid, &mid_key, 0);
1750 tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
1751 pslot, 0);
1094 btrfs_set_node_key(parent, &mid_key, pslot); 1752 btrfs_set_node_key(parent, &mid_key, pslot);
1095 btrfs_mark_buffer_dirty(parent); 1753 btrfs_mark_buffer_dirty(parent);
1096 } 1754 }
@@ -1188,6 +1846,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1188 struct btrfs_disk_key disk_key; 1846 struct btrfs_disk_key disk_key;
1189 orig_slot += left_nr; 1847 orig_slot += left_nr;
1190 btrfs_node_key(mid, &disk_key, 0); 1848 btrfs_node_key(mid, &disk_key, 0);
1849 tree_mod_log_set_node_key(root->fs_info, parent,
1850 &disk_key, pslot, 0);
1191 btrfs_set_node_key(parent, &disk_key, pslot); 1851 btrfs_set_node_key(parent, &disk_key, pslot);
1192 btrfs_mark_buffer_dirty(parent); 1852 btrfs_mark_buffer_dirty(parent);
1193 if (btrfs_header_nritems(left) > orig_slot) { 1853 if (btrfs_header_nritems(left) > orig_slot) {
@@ -1239,6 +1899,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1239 struct btrfs_disk_key disk_key; 1899 struct btrfs_disk_key disk_key;
1240 1900
1241 btrfs_node_key(right, &disk_key, 0); 1901 btrfs_node_key(right, &disk_key, 0);
1902 tree_mod_log_set_node_key(root->fs_info, parent,
1903 &disk_key, pslot + 1, 0);
1242 btrfs_set_node_key(parent, &disk_key, pslot + 1); 1904 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1243 btrfs_mark_buffer_dirty(parent); 1905 btrfs_mark_buffer_dirty(parent);
1244 1906
@@ -1496,7 +2158,7 @@ static int
1496read_block_for_search(struct btrfs_trans_handle *trans, 2158read_block_for_search(struct btrfs_trans_handle *trans,
1497 struct btrfs_root *root, struct btrfs_path *p, 2159 struct btrfs_root *root, struct btrfs_path *p,
1498 struct extent_buffer **eb_ret, int level, int slot, 2160 struct extent_buffer **eb_ret, int level, int slot,
1499 struct btrfs_key *key) 2161 struct btrfs_key *key, u64 time_seq)
1500{ 2162{
1501 u64 blocknr; 2163 u64 blocknr;
1502 u64 gen; 2164 u64 gen;
@@ -1850,7 +2512,7 @@ cow_done:
1850 } 2512 }
1851 2513
1852 err = read_block_for_search(trans, root, p, 2514 err = read_block_for_search(trans, root, p,
1853 &b, level, slot, key); 2515 &b, level, slot, key, 0);
1854 if (err == -EAGAIN) 2516 if (err == -EAGAIN)
1855 goto again; 2517 goto again;
1856 if (err) { 2518 if (err) {
@@ -1922,6 +2584,115 @@ done:
1922} 2584}
1923 2585
1924/* 2586/*
2587 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2588 * current state of the tree together with the operations recorded in the tree
2589 * modification log to search for the key in a previous version of this tree, as
2590 * denoted by the time_seq parameter.
2591 *
2592 * Naturally, there is no support for insert, delete or cow operations.
2593 *
2594 * The resulting path and return value will be set up as if we called
2595 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2596 */
2597int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2598 struct btrfs_path *p, u64 time_seq)
2599{
2600 struct extent_buffer *b;
2601 int slot;
2602 int ret;
2603 int err;
2604 int level;
2605 int lowest_unlock = 1;
2606 u8 lowest_level = 0;
2607
2608 lowest_level = p->lowest_level;
2609 WARN_ON(p->nodes[0] != NULL);
2610
2611 if (p->search_commit_root) {
2612 BUG_ON(time_seq);
2613 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2614 }
2615
2616again:
2617 b = get_old_root(root, time_seq);
2618 extent_buffer_get(b);
2619 level = btrfs_header_level(b);
2620 btrfs_tree_read_lock(b);
2621 p->locks[level] = BTRFS_READ_LOCK;
2622
2623 while (b) {
2624 level = btrfs_header_level(b);
2625 p->nodes[level] = b;
2626 btrfs_clear_path_blocking(p, NULL, 0);
2627
2628 /*
2629 * we have a lock on b and as long as we aren't changing
2630 * the tree, there is no way for the items in b to change.
2631 * It is safe to drop the lock on our parent before we
2632 * go through the expensive btree search on b.
2633 */
2634 btrfs_unlock_up_safe(p, level + 1);
2635
2636 ret = bin_search(b, key, level, &slot);
2637
2638 if (level != 0) {
2639 int dec = 0;
2640 if (ret && slot > 0) {
2641 dec = 1;
2642 slot -= 1;
2643 }
2644 p->slots[level] = slot;
2645 unlock_up(p, level, lowest_unlock, 0, NULL);
2646
2647 if (level == lowest_level) {
2648 if (dec)
2649 p->slots[level]++;
2650 goto done;
2651 }
2652
2653 err = read_block_for_search(NULL, root, p, &b, level,
2654 slot, key, time_seq);
2655 if (err == -EAGAIN)
2656 goto again;
2657 if (err) {
2658 ret = err;
2659 goto done;
2660 }
2661
2662 level = btrfs_header_level(b);
2663 err = btrfs_try_tree_read_lock(b);
2664 if (!err) {
2665 btrfs_set_path_blocking(p);
2666 btrfs_tree_read_lock(b);
2667 btrfs_clear_path_blocking(p, b,
2668 BTRFS_READ_LOCK);
2669 }
2670 p->locks[level] = BTRFS_READ_LOCK;
2671 p->nodes[level] = b;
2672 b = tree_mod_log_rewind(root->fs_info, b, time_seq);
2673 if (b != p->nodes[level]) {
2674 btrfs_tree_unlock_rw(p->nodes[level],
2675 p->locks[level]);
2676 p->locks[level] = 0;
2677 p->nodes[level] = b;
2678 }
2679 } else {
2680 p->slots[level] = slot;
2681 unlock_up(p, level, lowest_unlock, 0, NULL);
2682 goto done;
2683 }
2684 }
2685 ret = 1;
2686done:
2687 if (!p->leave_spinning)
2688 btrfs_set_path_blocking(p);
2689 if (ret < 0)
2690 btrfs_release_path(p);
2691
2692 return ret;
2693}
2694
2695/*
1925 * adjust the pointers going up the tree, starting at level 2696 * adjust the pointers going up the tree, starting at level
1926 * making sure the right key of each node points to 'key'. 2697
1927 * This is used after shifting pointers to the left, so it stops 2698 * This is used after shifting pointers to the left, so it stops
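[Annotation] A hypothetical caller for the new entry point, invented purely for illustration: pin a sequence number with btrfs_get_tree_mod_seq() (declared in the ctree.h hunk below), let concurrent writers run, then read the tree as it looked when the sequence was taken. Error handling and the real call sites' locking context are elided.

static int lookup_old_version(struct btrfs_root *root, struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct seq_list elem = {};
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* blocks removal of all log entries with seq >= elem.seq */
	btrfs_get_tree_mod_seq(fs_info, &elem);

	/* ... concurrent writers may modify the tree here ... */

	/* rewinds any modification with seq >= elem.seq out of the view */
	ret = btrfs_search_old_slot(root, key, path, elem.seq);
	/* path->nodes[0] / path->slots[0] now point into the rewound view */

	btrfs_put_tree_mod_seq(fs_info, &elem);
	btrfs_free_path(path);
	return ret;
}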
@@ -1941,6 +2712,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
1941 if (!path->nodes[i]) 2712 if (!path->nodes[i])
1942 break; 2713 break;
1943 t = path->nodes[i]; 2714 t = path->nodes[i];
2715 tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
1944 btrfs_set_node_key(t, key, tslot); 2716 btrfs_set_node_key(t, key, tslot);
1945 btrfs_mark_buffer_dirty(path->nodes[i]); 2717 btrfs_mark_buffer_dirty(path->nodes[i]);
1946 if (tslot != 0) 2718 if (tslot != 0)
@@ -2023,12 +2795,16 @@ static int push_node_left(struct btrfs_trans_handle *trans,
2023 } else 2795 } else
2024 push_items = min(src_nritems - 8, push_items); 2796 push_items = min(src_nritems - 8, push_items);
2025 2797
2798 tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
2799 push_items);
2026 copy_extent_buffer(dst, src, 2800 copy_extent_buffer(dst, src,
2027 btrfs_node_key_ptr_offset(dst_nritems), 2801 btrfs_node_key_ptr_offset(dst_nritems),
2028 btrfs_node_key_ptr_offset(0), 2802 btrfs_node_key_ptr_offset(0),
2029 push_items * sizeof(struct btrfs_key_ptr)); 2803 push_items * sizeof(struct btrfs_key_ptr));
2030 2804
2031 if (push_items < src_nritems) { 2805 if (push_items < src_nritems) {
2806 tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
2807 src_nritems - push_items);
2032 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), 2808 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2033 btrfs_node_key_ptr_offset(push_items), 2809 btrfs_node_key_ptr_offset(push_items),
2034 (src_nritems - push_items) * 2810 (src_nritems - push_items) *
@@ -2082,11 +2858,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
2082 if (max_push < push_items) 2858 if (max_push < push_items)
2083 push_items = max_push; 2859 push_items = max_push;
2084 2860
2861 tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
2085 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), 2862 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2086 btrfs_node_key_ptr_offset(0), 2863 btrfs_node_key_ptr_offset(0),
2087 (dst_nritems) * 2864 (dst_nritems) *
2088 sizeof(struct btrfs_key_ptr)); 2865 sizeof(struct btrfs_key_ptr));
2089 2866
2867 tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
2868 src_nritems - push_items, push_items);
2090 copy_extent_buffer(dst, src, 2869 copy_extent_buffer(dst, src,
2091 btrfs_node_key_ptr_offset(0), 2870 btrfs_node_key_ptr_offset(0),
2092 btrfs_node_key_ptr_offset(src_nritems - push_items), 2871 btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -2129,7 +2908,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2129 2908
2130 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2909 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2131 root->root_key.objectid, &lower_key, 2910 root->root_key.objectid, &lower_key,
2132 level, root->node->start, 0, 0); 2911 level, root->node->start, 0);
2133 if (IS_ERR(c)) 2912 if (IS_ERR(c))
2134 return PTR_ERR(c); 2913 return PTR_ERR(c);
2135 2914
@@ -2161,6 +2940,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2161 btrfs_mark_buffer_dirty(c); 2940 btrfs_mark_buffer_dirty(c);
2162 2941
2163 old = root->node; 2942 old = root->node;
2943 tree_mod_log_set_root_pointer(root, c);
2164 rcu_assign_pointer(root->node, c); 2944 rcu_assign_pointer(root->node, c);
2165 2945
2166 /* the super has an extra ref to root->node */ 2946 /* the super has an extra ref to root->node */
@@ -2184,10 +2964,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2184static void insert_ptr(struct btrfs_trans_handle *trans, 2964static void insert_ptr(struct btrfs_trans_handle *trans,
2185 struct btrfs_root *root, struct btrfs_path *path, 2965 struct btrfs_root *root, struct btrfs_path *path,
2186 struct btrfs_disk_key *key, u64 bytenr, 2966 struct btrfs_disk_key *key, u64 bytenr,
2187 int slot, int level) 2967 int slot, int level, int tree_mod_log)
2188{ 2968{
2189 struct extent_buffer *lower; 2969 struct extent_buffer *lower;
2190 int nritems; 2970 int nritems;
2971 int ret;
2191 2972
2192 BUG_ON(!path->nodes[level]); 2973 BUG_ON(!path->nodes[level]);
2193 btrfs_assert_tree_locked(path->nodes[level]); 2974 btrfs_assert_tree_locked(path->nodes[level]);
@@ -2196,11 +2977,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
2196 BUG_ON(slot > nritems); 2977 BUG_ON(slot > nritems);
2197 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); 2978 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
2198 if (slot != nritems) { 2979 if (slot != nritems) {
2980 if (tree_mod_log && level)
2981 tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
2982 slot, nritems - slot);
2199 memmove_extent_buffer(lower, 2983 memmove_extent_buffer(lower,
2200 btrfs_node_key_ptr_offset(slot + 1), 2984 btrfs_node_key_ptr_offset(slot + 1),
2201 btrfs_node_key_ptr_offset(slot), 2985 btrfs_node_key_ptr_offset(slot),
2202 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2986 (nritems - slot) * sizeof(struct btrfs_key_ptr));
2203 } 2987 }
2988 if (tree_mod_log && level) {
2989 ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
2990 MOD_LOG_KEY_ADD);
2991 BUG_ON(ret < 0);
2992 }
2204 btrfs_set_node_key(lower, key, slot); 2993 btrfs_set_node_key(lower, key, slot);
2205 btrfs_set_node_blockptr(lower, slot, bytenr); 2994 btrfs_set_node_blockptr(lower, slot, bytenr);
2206 WARN_ON(trans->transid == 0); 2995 WARN_ON(trans->transid == 0);
@@ -2252,7 +3041,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2252 3041
2253 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3042 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2254 root->root_key.objectid, 3043 root->root_key.objectid,
2255 &disk_key, level, c->start, 0, 0); 3044 &disk_key, level, c->start, 0);
2256 if (IS_ERR(split)) 3045 if (IS_ERR(split))
2257 return PTR_ERR(split); 3046 return PTR_ERR(split);
2258 3047
@@ -2271,7 +3060,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2271 (unsigned long)btrfs_header_chunk_tree_uuid(split), 3060 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2272 BTRFS_UUID_SIZE); 3061 BTRFS_UUID_SIZE);
2273 3062
2274 3063 tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
2275 copy_extent_buffer(split, c, 3064 copy_extent_buffer(split, c,
2276 btrfs_node_key_ptr_offset(0), 3065 btrfs_node_key_ptr_offset(0),
2277 btrfs_node_key_ptr_offset(mid), 3066 btrfs_node_key_ptr_offset(mid),
@@ -2284,7 +3073,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2284 btrfs_mark_buffer_dirty(split); 3073 btrfs_mark_buffer_dirty(split);
2285 3074
2286 insert_ptr(trans, root, path, &disk_key, split->start, 3075 insert_ptr(trans, root, path, &disk_key, split->start,
2287 path->slots[level + 1] + 1, level + 1); 3076 path->slots[level + 1] + 1, level + 1, 1);
2288 3077
2289 if (path->slots[level] >= mid) { 3078 if (path->slots[level] >= mid) {
2290 path->slots[level] -= mid; 3079 path->slots[level] -= mid;
@@ -2821,7 +3610,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
2821 btrfs_set_header_nritems(l, mid); 3610 btrfs_set_header_nritems(l, mid);
2822 btrfs_item_key(right, &disk_key, 0); 3611 btrfs_item_key(right, &disk_key, 0);
2823 insert_ptr(trans, root, path, &disk_key, right->start, 3612 insert_ptr(trans, root, path, &disk_key, right->start,
2824 path->slots[1] + 1, 1); 3613 path->slots[1] + 1, 1, 0);
2825 3614
2826 btrfs_mark_buffer_dirty(right); 3615 btrfs_mark_buffer_dirty(right);
2827 btrfs_mark_buffer_dirty(l); 3616 btrfs_mark_buffer_dirty(l);
@@ -3004,7 +3793,7 @@ again:
3004 3793
3005 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 3794 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
3006 root->root_key.objectid, 3795 root->root_key.objectid,
3007 &disk_key, 0, l->start, 0, 0); 3796 &disk_key, 0, l->start, 0);
3008 if (IS_ERR(right)) 3797 if (IS_ERR(right))
3009 return PTR_ERR(right); 3798 return PTR_ERR(right);
3010 3799
@@ -3028,7 +3817,7 @@ again:
3028 if (mid <= slot) { 3817 if (mid <= slot) {
3029 btrfs_set_header_nritems(right, 0); 3818 btrfs_set_header_nritems(right, 0);
3030 insert_ptr(trans, root, path, &disk_key, right->start, 3819 insert_ptr(trans, root, path, &disk_key, right->start,
3031 path->slots[1] + 1, 1); 3820 path->slots[1] + 1, 1, 0);
3032 btrfs_tree_unlock(path->nodes[0]); 3821 btrfs_tree_unlock(path->nodes[0]);
3033 free_extent_buffer(path->nodes[0]); 3822 free_extent_buffer(path->nodes[0]);
3034 path->nodes[0] = right; 3823 path->nodes[0] = right;
@@ -3037,7 +3826,7 @@ again:
3037 } else { 3826 } else {
3038 btrfs_set_header_nritems(right, 0); 3827 btrfs_set_header_nritems(right, 0);
3039 insert_ptr(trans, root, path, &disk_key, right->start, 3828 insert_ptr(trans, root, path, &disk_key, right->start,
3040 path->slots[1], 1); 3829 path->slots[1], 1, 0);
3041 btrfs_tree_unlock(path->nodes[0]); 3830 btrfs_tree_unlock(path->nodes[0]);
3042 free_extent_buffer(path->nodes[0]); 3831 free_extent_buffer(path->nodes[0]);
3043 path->nodes[0] = right; 3832 path->nodes[0] = right;
@@ -3749,19 +4538,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3749 * empty a node. 4538 * empty a node.
3750 */ 4539 */
3751static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4540static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3752 struct btrfs_path *path, int level, int slot) 4541 struct btrfs_path *path, int level, int slot,
4542 int tree_mod_log)
3753{ 4543{
3754 struct extent_buffer *parent = path->nodes[level]; 4544 struct extent_buffer *parent = path->nodes[level];
3755 u32 nritems; 4545 u32 nritems;
4546 int ret;
3756 4547
3757 nritems = btrfs_header_nritems(parent); 4548 nritems = btrfs_header_nritems(parent);
3758 if (slot != nritems - 1) { 4549 if (slot != nritems - 1) {
4550 if (tree_mod_log && level)
4551 tree_mod_log_eb_move(root->fs_info, parent, slot,
4552 slot + 1, nritems - slot - 1);
3759 memmove_extent_buffer(parent, 4553 memmove_extent_buffer(parent,
3760 btrfs_node_key_ptr_offset(slot), 4554 btrfs_node_key_ptr_offset(slot),
3761 btrfs_node_key_ptr_offset(slot + 1), 4555 btrfs_node_key_ptr_offset(slot + 1),
3762 sizeof(struct btrfs_key_ptr) * 4556 sizeof(struct btrfs_key_ptr) *
3763 (nritems - slot - 1)); 4557 (nritems - slot - 1));
4558 } else if (tree_mod_log && level) {
4559 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4560 MOD_LOG_KEY_REMOVE);
4561 BUG_ON(ret < 0);
3764 } 4562 }
4563
3765 nritems--; 4564 nritems--;
3766 btrfs_set_header_nritems(parent, nritems); 4565 btrfs_set_header_nritems(parent, nritems);
3767 if (nritems == 0 && parent == root->node) { 4566 if (nritems == 0 && parent == root->node) {
@@ -3793,7 +4592,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
3793 struct extent_buffer *leaf) 4592 struct extent_buffer *leaf)
3794{ 4593{
3795 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4594 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
3796 del_ptr(trans, root, path, 1, path->slots[1]); 4595 del_ptr(trans, root, path, 1, path->slots[1], 1);
3797 4596
3798 /* 4597 /*
3799 * btrfs_free_extent is expensive, we want to make sure we 4598 * btrfs_free_extent is expensive, we want to make sure we
@@ -3804,7 +4603,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
3804 root_sub_used(root, leaf->len); 4603 root_sub_used(root, leaf->len);
3805 4604
3806 extent_buffer_get(leaf); 4605 extent_buffer_get(leaf);
3807 btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); 4606 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3808 free_extent_buffer_stale(leaf); 4607 free_extent_buffer_stale(leaf);
3809} 4608}
3810/* 4609/*
@@ -4271,7 +5070,7 @@ again:
4271 next = c; 5070 next = c;
4272 next_rw_lock = path->locks[level]; 5071 next_rw_lock = path->locks[level];
4273 ret = read_block_for_search(NULL, root, path, &next, level, 5072 ret = read_block_for_search(NULL, root, path, &next, level,
4274 slot, &key); 5073 slot, &key, 0);
4275 if (ret == -EAGAIN) 5074 if (ret == -EAGAIN)
4276 goto again; 5075 goto again;
4277 5076
@@ -4308,7 +5107,7 @@ again:
4308 break; 5107 break;
4309 5108
4310 ret = read_block_for_search(NULL, root, path, &next, level, 5109 ret = read_block_for_search(NULL, root, path, &next, level,
4311 0, &key); 5110 0, &key, 0);
4312 if (ret == -EAGAIN) 5111 if (ret == -EAGAIN)
4313 goto again; 5112 goto again;
4314 5113
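[Annotation] The int tree_mod_log parameter threaded through insert_ptr() and del_ptr() above lets callers opt out of logging, and even when logging is requested it only happens for interior nodes (level > 0): the log replays node key-pointer updates, which is what old-version searches rewind, while leaf items are never rewound; the leaf-split paths accordingly pass 0. A hypothetical helper condensing the repeated guard, not part of the patch:

static inline int want_tree_mod_log(int tree_mod_log, int level)
{
	/* leaves (level 0) hold items, not key pointers; the log only
	 * tracks the node-level pointer arrays that rewinds replay */
	return tree_mod_log && level > 0;
}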
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fd72331d600..0151ca1ac657 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -173,6 +173,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
173#define BTRFS_FT_XATTR 8 173#define BTRFS_FT_XATTR 8
174#define BTRFS_FT_MAX 9 174#define BTRFS_FT_MAX 9
175 175
176/* ioprio of readahead is set to idle */
177#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
178
176/* 179/*
177 * The key defines the order in the tree, and so it also defines (optimal) 180 * The key defines the order in the tree, and so it also defines (optimal)
178 * block layout. 181 * block layout.
@@ -823,6 +826,14 @@ struct btrfs_csum_item {
823 u8 csum; 826 u8 csum;
824} __attribute__ ((__packed__)); 827} __attribute__ ((__packed__));
825 828
829struct btrfs_dev_stats_item {
830 /*
831 * grow this item struct at the end for future enhancements and keep
832 * the existing values unchanged
833 */
834 __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
835} __attribute__ ((__packed__));
836
826/* different types of block groups (and chunks) */ 837/* different types of block groups (and chunks) */
827#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) 838#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
828#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) 839#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -1129,6 +1140,15 @@ struct btrfs_fs_info {
1129 spinlock_t delayed_iput_lock; 1140 spinlock_t delayed_iput_lock;
1130 struct list_head delayed_iputs; 1141 struct list_head delayed_iputs;
1131 1142
1143 /* this protects tree_mod_seq_list */
1144 spinlock_t tree_mod_seq_lock;
1145 atomic_t tree_mod_seq;
1146 struct list_head tree_mod_seq_list;
1147
1148 /* this protects tree_mod_log */
1149 rwlock_t tree_mod_log_lock;
1150 struct rb_root tree_mod_log;
1151
1132 atomic_t nr_async_submits; 1152 atomic_t nr_async_submits;
1133 atomic_t async_submit_draining; 1153 atomic_t async_submit_draining;
1134 atomic_t nr_async_bios; 1154 atomic_t nr_async_bios;
@@ -1375,7 +1395,7 @@ struct btrfs_root {
1375 struct list_head root_list; 1395 struct list_head root_list;
1376 1396
1377 spinlock_t orphan_lock; 1397 spinlock_t orphan_lock;
1378 struct list_head orphan_list; 1398 atomic_t orphan_inodes;
1379 struct btrfs_block_rsv *orphan_block_rsv; 1399 struct btrfs_block_rsv *orphan_block_rsv;
1380 int orphan_item_inserted; 1400 int orphan_item_inserted;
1381 int orphan_cleanup_state; 1401 int orphan_cleanup_state;
@@ -1508,6 +1528,12 @@ struct btrfs_ioctl_defrag_range_args {
1508#define BTRFS_BALANCE_ITEM_KEY 248 1528#define BTRFS_BALANCE_ITEM_KEY 248
1509 1529
1510/* 1530/*
1531 * Persistently stores the I/O stats in the device tree.
1532 * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
1533 */
1534#define BTRFS_DEV_STATS_KEY 249
1535
1536/*
1511 * string items are for debugging. They just store a short string of 1537 * string items are for debugging. They just store a short string of
1512 * data in the FS 1538 * data in the FS
1513 */ 1539 */
@@ -2415,6 +2441,30 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
2415 return btrfs_item_size(eb, e) - offset; 2441 return btrfs_item_size(eb, e) - offset;
2416} 2442}
2417 2443
2444/* btrfs_dev_stats_item */
2445static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
2446 struct btrfs_dev_stats_item *ptr,
2447 int index)
2448{
2449 u64 val;
2450
2451 read_extent_buffer(eb, &val,
2452 offsetof(struct btrfs_dev_stats_item, values) +
2453 ((unsigned long)ptr) + (index * sizeof(u64)),
2454 sizeof(val));
2455 return val;
2456}
2457
2458static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
2459 struct btrfs_dev_stats_item *ptr,
2460 int index, u64 val)
2461{
2462 write_extent_buffer(eb, &val,
2463 offsetof(struct btrfs_dev_stats_item, values) +
2464 ((unsigned long)ptr) + (index * sizeof(u64)),
2465 sizeof(val));
2466}
2467
2418static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2468static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2419{ 2469{
2420 return sb->s_fs_info; 2470 return sb->s_fs_info;
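[Annotation] The new accessors read and write one u64 of the values[] array inside the leaf. A sketch of consuming the item once a search has positioned a path at (0, BTRFS_DEV_STATS_KEY, devid); the lookup and error handling are elided and the function name is invented for illustration.

static void print_dev_stats_item(struct btrfs_path *path)
{
	struct extent_buffer *eb = path->nodes[0];
	int slot = path->slots[0];
	struct btrfs_dev_stats_item *item;
	int i;

	item = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		printk(KERN_INFO "btrfs: dev stat %d = %llu\n", i,
		       (unsigned long long)btrfs_dev_stats_value(eb, item, i));
}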
@@ -2496,11 +2546,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2496 struct btrfs_root *root, u32 blocksize, 2546 struct btrfs_root *root, u32 blocksize,
2497 u64 parent, u64 root_objectid, 2547 u64 parent, u64 root_objectid,
2498 struct btrfs_disk_key *key, int level, 2548 struct btrfs_disk_key *key, int level,
2499 u64 hint, u64 empty_size, int for_cow); 2549 u64 hint, u64 empty_size);
2500void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2550void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
2501 struct btrfs_root *root, 2551 struct btrfs_root *root,
2502 struct extent_buffer *buf, 2552 struct extent_buffer *buf,
2503 u64 parent, int last_ref, int for_cow); 2553 u64 parent, int last_ref);
2504struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2554struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2505 struct btrfs_root *root, 2555 struct btrfs_root *root,
2506 u64 bytenr, u32 blocksize, 2556 u64 bytenr, u32 blocksize,
@@ -2659,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
2659int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root 2709int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2660 *root, struct btrfs_key *key, struct btrfs_path *p, int 2710 *root, struct btrfs_key *key, struct btrfs_path *p, int
2661 ins_len, int cow); 2711 ins_len, int cow);
2712int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2713 struct btrfs_path *p, u64 time_seq);
2662int btrfs_realloc_node(struct btrfs_trans_handle *trans, 2714int btrfs_realloc_node(struct btrfs_trans_handle *trans,
2663 struct btrfs_root *root, struct extent_buffer *parent, 2715 struct btrfs_root *root, struct extent_buffer *parent,
2664 int start_slot, int cache_only, u64 *last_ret, 2716 int start_slot, int cache_only, u64 *last_ret,
@@ -3098,4 +3150,23 @@ void btrfs_reada_detach(void *handle);
3098int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 3150int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
3099 u64 start, int err); 3151 u64 start, int err);
3100 3152
3153/* delayed seq elem */
3154struct seq_list {
3155 struct list_head list;
3156 u64 seq;
3157 u32 flags;
3158};
3159
3160void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
3161 struct seq_list *elem);
3162void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
3163 struct seq_list *elem);
3164
3165static inline int is_fstree(u64 rootid)
3166{
3167 if (rootid == BTRFS_FS_TREE_OBJECTID ||
3168 (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
3169 return 1;
3170 return 0;
3171}
3101#endif 3172#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 03e3748d84d0..c18d0442ae6d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -669,8 +669,8 @@ static int btrfs_delayed_inode_reserve_metadata(
669 return ret; 669 return ret;
670 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 670 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
671 spin_lock(&BTRFS_I(inode)->lock); 671 spin_lock(&BTRFS_I(inode)->lock);
672 if (BTRFS_I(inode)->delalloc_meta_reserved) { 672 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
673 BTRFS_I(inode)->delalloc_meta_reserved = 0; 673 &BTRFS_I(inode)->runtime_flags)) {
674 spin_unlock(&BTRFS_I(inode)->lock); 674 spin_unlock(&BTRFS_I(inode)->lock);
675 release = true; 675 release = true;
676 goto migrate; 676 goto migrate;
@@ -1706,7 +1706,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
1706 btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); 1706 btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
1707 btrfs_set_stack_inode_generation(inode_item, 1707 btrfs_set_stack_inode_generation(inode_item,
1708 BTRFS_I(inode)->generation); 1708 BTRFS_I(inode)->generation);
1709 btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence); 1709 btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
1710 btrfs_set_stack_inode_transid(inode_item, trans->transid); 1710 btrfs_set_stack_inode_transid(inode_item, trans->transid);
1711 btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); 1711 btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
1712 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); 1712 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
@@ -1754,7 +1754,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1754 set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); 1754 set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
1755 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); 1755 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
1756 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); 1756 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
1757 BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); 1757 inode->i_version = btrfs_stack_inode_sequence(inode_item);
1758 inode->i_rdev = 0; 1758 inode->i_rdev = 0;
1759 *rdev = btrfs_stack_inode_rdev(inode_item); 1759 *rdev = btrfs_stack_inode_rdev(inode_item);
1760 BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); 1760 BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 69f22e3ab3bc..13ae7b04790e 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
525 ref->is_head = 0; 525 ref->is_head = 0;
526 ref->in_tree = 1; 526 ref->in_tree = 1;
527 527
528 if (need_ref_seq(for_cow, ref_root)) 528 if (is_fstree(ref_root))
529 seq = inc_delayed_seq(delayed_refs); 529 seq = inc_delayed_seq(delayed_refs);
530 ref->seq = seq; 530 ref->seq = seq;
531 531
@@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
584 ref->is_head = 0; 584 ref->is_head = 0;
585 ref->in_tree = 1; 585 ref->in_tree = 1;
586 586
587 if (need_ref_seq(for_cow, ref_root)) 587 if (is_fstree(ref_root))
588 seq = inc_delayed_seq(delayed_refs); 588 seq = inc_delayed_seq(delayed_refs);
589 ref->seq = seq; 589 ref->seq = seq;
590 590
@@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
658 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, 658 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
659 num_bytes, parent, ref_root, level, action, 659 num_bytes, parent, ref_root, level, action,
660 for_cow); 660 for_cow);
661 if (!need_ref_seq(for_cow, ref_root) && 661 if (!is_fstree(ref_root) &&
662 waitqueue_active(&delayed_refs->seq_wait)) 662 waitqueue_active(&delayed_refs->seq_wait))
663 wake_up(&delayed_refs->seq_wait); 663 wake_up(&delayed_refs->seq_wait);
664 spin_unlock(&delayed_refs->lock); 664 spin_unlock(&delayed_refs->lock);
665
665 return 0; 666 return 0;
666} 667}
667 668
@@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
706 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, 707 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
707 num_bytes, parent, ref_root, owner, offset, 708 num_bytes, parent, ref_root, owner, offset,
708 action, for_cow); 709 action, for_cow);
709 if (!need_ref_seq(for_cow, ref_root) && 710 if (!is_fstree(ref_root) &&
710 waitqueue_active(&delayed_refs->seq_wait)) 711 waitqueue_active(&delayed_refs->seq_wait))
711 wake_up(&delayed_refs->seq_wait); 712 wake_up(&delayed_refs->seq_wait);
712 spin_unlock(&delayed_refs->lock); 713 spin_unlock(&delayed_refs->lock);
714
713 return 0; 715 return 0;
714} 716}
715 717
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d8f244d94925..413927fb9957 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
196 struct list_head *cluster, u64 search_start); 196 struct list_head *cluster, u64 search_start);
197 197
198struct seq_list {
199 struct list_head list;
200 u64 seq;
201};
202
203static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) 198static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
204{ 199{
205 assert_spin_locked(&delayed_refs->lock); 200 assert_spin_locked(&delayed_refs->lock);
@@ -230,25 +225,6 @@ int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
230 u64 seq); 225 u64 seq);
231 226
232/* 227/*
233 * delayed refs with a ref_seq > 0 must be held back during backref walking.
234 * this only applies to items in one of the fs-trees. for_cow items never need
235 * to be held back, so they won't get a ref_seq number.
236 */
237static inline int need_ref_seq(int for_cow, u64 rootid)
238{
239 if (for_cow)
240 return 0;
241
242 if (rootid == BTRFS_FS_TREE_OBJECTID)
243 return 1;
244
245 if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
246 return 1;
247
248 return 0;
249}
250
251/*
252 * a node might live in a head or a regular ref, this lets you 228 * a node might live in a head or a regular ref, this lets you
253 * test for the proper type to use. 229 * test for the proper type to use.
254 */ 230 */
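[Annotation] With need_ref_seq() removed, its callers in delayed-ref.c use the plain is_fstree() test: for for_cow == 0 the two predicates were already identical, and the practical change is that for_cow references in fs-trees are now sequenced as well, which the mod-log-based backref walking appears to require. An illustrative-only restatement of the removed helper in terms of the new shared test, not part of the patch:

static inline int old_need_ref_seq(int for_cow, u64 rootid)
{
	return !for_cow && is_fstree(rootid);
}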
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e1fe74a2ce16..7ae51decf6d3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1153 root->orphan_block_rsv = NULL; 1153 root->orphan_block_rsv = NULL;
1154 1154
1155 INIT_LIST_HEAD(&root->dirty_list); 1155 INIT_LIST_HEAD(&root->dirty_list);
1156 INIT_LIST_HEAD(&root->orphan_list);
1157 INIT_LIST_HEAD(&root->root_list); 1156 INIT_LIST_HEAD(&root->root_list);
1158 spin_lock_init(&root->orphan_lock); 1157 spin_lock_init(&root->orphan_lock);
1159 spin_lock_init(&root->inode_lock); 1158 spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1166 atomic_set(&root->log_commit[0], 0); 1165 atomic_set(&root->log_commit[0], 0);
1167 atomic_set(&root->log_commit[1], 0); 1166 atomic_set(&root->log_commit[1], 0);
1168 atomic_set(&root->log_writers, 0); 1167 atomic_set(&root->log_writers, 0);
1168 atomic_set(&root->orphan_inodes, 0);
1169 root->log_batch = 0; 1169 root->log_batch = 0;
1170 root->log_transid = 0; 1170 root->log_transid = 0;
1171 root->last_log_commit = 0; 1171 root->last_log_commit = 0;
@@ -1252,7 +1252,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1252 1252
1253 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1253 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1254 BTRFS_TREE_LOG_OBJECTID, NULL, 1254 BTRFS_TREE_LOG_OBJECTID, NULL,
1255 0, 0, 0, 0); 1255 0, 0, 0);
1256 if (IS_ERR(leaf)) { 1256 if (IS_ERR(leaf)) {
1257 kfree(root); 1257 kfree(root);
1258 return ERR_CAST(leaf); 1258 return ERR_CAST(leaf);
@@ -1914,11 +1914,14 @@ int open_ctree(struct super_block *sb,
1914 spin_lock_init(&fs_info->delayed_iput_lock); 1914 spin_lock_init(&fs_info->delayed_iput_lock);
1915 spin_lock_init(&fs_info->defrag_inodes_lock); 1915 spin_lock_init(&fs_info->defrag_inodes_lock);
1916 spin_lock_init(&fs_info->free_chunk_lock); 1916 spin_lock_init(&fs_info->free_chunk_lock);
1917 spin_lock_init(&fs_info->tree_mod_seq_lock);
1918 rwlock_init(&fs_info->tree_mod_log_lock);
1917 mutex_init(&fs_info->reloc_mutex); 1919 mutex_init(&fs_info->reloc_mutex);
1918 1920
1919 init_completion(&fs_info->kobj_unregister); 1921 init_completion(&fs_info->kobj_unregister);
1920 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1922 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1921 INIT_LIST_HEAD(&fs_info->space_info); 1923 INIT_LIST_HEAD(&fs_info->space_info);
1924 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
1922 btrfs_mapping_init(&fs_info->mapping_tree); 1925 btrfs_mapping_init(&fs_info->mapping_tree);
1923 btrfs_init_block_rsv(&fs_info->global_block_rsv); 1926 btrfs_init_block_rsv(&fs_info->global_block_rsv);
1924 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); 1927 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
@@ -1931,12 +1934,14 @@ int open_ctree(struct super_block *sb,
1931 atomic_set(&fs_info->async_submit_draining, 0); 1934 atomic_set(&fs_info->async_submit_draining, 0);
1932 atomic_set(&fs_info->nr_async_bios, 0); 1935 atomic_set(&fs_info->nr_async_bios, 0);
1933 atomic_set(&fs_info->defrag_running, 0); 1936 atomic_set(&fs_info->defrag_running, 0);
1937 atomic_set(&fs_info->tree_mod_seq, 0);
1934 fs_info->sb = sb; 1938 fs_info->sb = sb;
1935 fs_info->max_inline = 8192 * 1024; 1939 fs_info->max_inline = 8192 * 1024;
1936 fs_info->metadata_ratio = 0; 1940 fs_info->metadata_ratio = 0;
1937 fs_info->defrag_inodes = RB_ROOT; 1941 fs_info->defrag_inodes = RB_ROOT;
1938 fs_info->trans_no_join = 0; 1942 fs_info->trans_no_join = 0;
1939 fs_info->free_chunk_space = 0; 1943 fs_info->free_chunk_space = 0;
1944 fs_info->tree_mod_log = RB_ROOT;
1940 1945
1941 /* readahead state */ 1946 /* readahead state */
1942 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); 1947 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
@@ -2001,7 +2006,8 @@ int open_ctree(struct super_block *sb,
2001 BTRFS_I(fs_info->btree_inode)->root = tree_root; 2006 BTRFS_I(fs_info->btree_inode)->root = tree_root;
2002 memset(&BTRFS_I(fs_info->btree_inode)->location, 0, 2007 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
2003 sizeof(struct btrfs_key)); 2008 sizeof(struct btrfs_key));
2004 BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; 2009 set_bit(BTRFS_INODE_DUMMY,
2010 &BTRFS_I(fs_info->btree_inode)->runtime_flags);
2005 insert_inode_hash(fs_info->btree_inode); 2011 insert_inode_hash(fs_info->btree_inode);
2006 2012
2007 spin_lock_init(&fs_info->block_group_cache_lock); 2013 spin_lock_init(&fs_info->block_group_cache_lock);
@@ -2353,6 +2359,13 @@ retry_root_backup:
2353 fs_info->generation = generation; 2359 fs_info->generation = generation;
2354 fs_info->last_trans_committed = generation; 2360 fs_info->last_trans_committed = generation;
2355 2361
2362 ret = btrfs_init_dev_stats(fs_info);
2363 if (ret) {
2364 printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
2365 ret);
2366 goto fail_block_groups;
2367 }
2368
2356 ret = btrfs_init_space_info(fs_info); 2369 ret = btrfs_init_space_info(fs_info);
2357 if (ret) { 2370 if (ret) {
2358 printk(KERN_ERR "Failed to initialize space info: %d\n", ret); 2371
@@ -2556,18 +2569,19 @@ recovery_tree_root:
2556 2569
2557static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2570static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2558{ 2571{
2559 char b[BDEVNAME_SIZE];
2560
2561 if (uptodate) { 2572 if (uptodate) {
2562 set_buffer_uptodate(bh); 2573 set_buffer_uptodate(bh);
2563 } else { 2574 } else {
2575 struct btrfs_device *device = (struct btrfs_device *)
2576 bh->b_private;
2577
2564 printk_ratelimited(KERN_WARNING "lost page write due to " 2578 printk_ratelimited(KERN_WARNING "lost page write due to "
2565 "I/O error on %s\n", 2579 "I/O error on %s\n", device->name);
2566 bdevname(bh->b_bdev, b));
2567 /* note, we don't set_buffer_write_io_error because we have 2580
2568 * our own ways of dealing with the IO errors 2581 * our own ways of dealing with the IO errors
2569 */ 2582 */
2570 clear_buffer_uptodate(bh); 2583 clear_buffer_uptodate(bh);
2584 btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
2571 } 2585 }
2572 unlock_buffer(bh); 2586 unlock_buffer(bh);
2573 put_bh(bh); 2587 put_bh(bh);
@@ -2682,6 +2696,7 @@ static int write_dev_supers(struct btrfs_device *device,
2682 set_buffer_uptodate(bh); 2696 set_buffer_uptodate(bh);
2683 lock_buffer(bh); 2697 lock_buffer(bh);
2684 bh->b_end_io = btrfs_end_buffer_write_sync; 2698 bh->b_end_io = btrfs_end_buffer_write_sync;
2699 bh->b_private = device;
2685 } 2700 }
2686 2701
2687 /* 2702 /*
@@ -2740,6 +2755,9 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2740 } 2755 }
2741 if (!bio_flagged(bio, BIO_UPTODATE)) { 2756 if (!bio_flagged(bio, BIO_UPTODATE)) {
2742 ret = -EIO; 2757 ret = -EIO;
2758 if (!bio_flagged(bio, BIO_EOPNOTSUPP))
2759 btrfs_dev_stat_inc_and_print(device,
2760 BTRFS_DEV_STAT_FLUSH_ERRS);
2743 } 2761 }
2744 2762
2745 /* drop the reference from the wait == 0 run */ 2763 /* drop the reference from the wait == 0 run */
@@ -2902,19 +2920,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
2902 return ret; 2920 return ret;
2903} 2921}
2904 2922
2905/* Kill all outstanding I/O */
2906void btrfs_abort_devices(struct btrfs_root *root)
2907{
2908 struct list_head *head;
2909 struct btrfs_device *dev;
2910 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2911 head = &root->fs_info->fs_devices->devices;
2912 list_for_each_entry_rcu(dev, head, dev_list) {
2913 blk_abort_queue(dev->bdev->bd_disk->queue);
2914 }
2915 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2916}
2917
2918void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 2923void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2919{ 2924{
2920 spin_lock(&fs_info->fs_roots_radix_lock); 2925 spin_lock(&fs_info->fs_roots_radix_lock);
@@ -3671,17 +3676,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3671 return 0; 3676 return 0;
3672} 3677}
3673 3678
3674static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page,
3675 u64 start, u64 end,
3676 struct extent_state *state)
3677{
3678 struct super_block *sb = page->mapping->host->i_sb;
3679 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
3680 btrfs_error(fs_info, -EIO,
3681 "Error occured while writing out btree at %llu", start);
3682 return -EIO;
3683}
3684
3685static struct extent_io_ops btree_extent_io_ops = { 3679static struct extent_io_ops btree_extent_io_ops = {
3686 .write_cache_pages_lock_hook = btree_lock_page_hook, 3680 .write_cache_pages_lock_hook = btree_lock_page_hook,
3687 .readpage_end_io_hook = btree_readpage_end_io_hook, 3681 .readpage_end_io_hook = btree_readpage_end_io_hook,
@@ -3689,5 +3683,4 @@ static struct extent_io_ops btree_extent_io_ops = {
3689 .submit_bio_hook = btree_submit_bio_hook, 3683 .submit_bio_hook = btree_submit_bio_hook,
3690 /* note we're sharing with inode.c for the merge bio hook */ 3684 /* note we're sharing with inode.c for the merge bio hook */
3691 .merge_bio_hook = btrfs_merge_bio_hook, 3685 .merge_bio_hook = btrfs_merge_bio_hook,
3692 .writepage_io_failed_hook = btree_writepage_io_failed_hook,
3693}; 3686};
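[Annotation] The write-error accounting works by stashing the device in bh->b_private at submit time so the completion handler can attribute a lost write to the right device and bump BTRFS_DEV_STAT_WRITE_ERRS. A condensed sketch of that wiring, assuming the plain WRITE_SYNC submission; the real write_dev_supers() also handles the barrier/FUA variants and retries:

static void submit_super_bh(struct btrfs_device *device,
			    struct buffer_head *bh)
{
	lock_buffer(bh);
	bh->b_end_io = btrfs_end_buffer_write_sync;
	bh->b_private = device;		/* read back in the completion handler */
	submit_bh(WRITE_SYNC, bh);	/* on failure, the handler increments
					   BTRFS_DEV_STAT_WRITE_ERRS */
}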
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index ab1830aaf0ed..05b3fab39f7e 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -89,7 +89,6 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
89int btrfs_cleanup_transaction(struct btrfs_root *root); 89int btrfs_cleanup_transaction(struct btrfs_root *root);
90void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, 90void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
91 struct btrfs_root *root); 91 struct btrfs_root *root);
92void btrfs_abort_devices(struct btrfs_root *root);
93 92
94#ifdef CONFIG_DEBUG_LOCK_ALLOC 93#ifdef CONFIG_DEBUG_LOCK_ALLOC
95void btrfs_init_lockdep(void); 94void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 49fd7b66d57b..4b5a1e1bdefb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3578,7 +3578,7 @@ again:
3578 space_info->chunk_alloc = 0; 3578 space_info->chunk_alloc = 0;
3579 spin_unlock(&space_info->lock); 3579 spin_unlock(&space_info->lock);
3580out: 3580out:
3581 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3581 mutex_unlock(&fs_info->chunk_mutex);
3582 return ret; 3582 return ret;
3583} 3583}
3584 3584
@@ -4355,10 +4355,9 @@ static unsigned drop_outstanding_extent(struct inode *inode)
4355 BTRFS_I(inode)->outstanding_extents--; 4355 BTRFS_I(inode)->outstanding_extents--;
4356 4356
4357 if (BTRFS_I(inode)->outstanding_extents == 0 && 4357 if (BTRFS_I(inode)->outstanding_extents == 0 &&
4358 BTRFS_I(inode)->delalloc_meta_reserved) { 4358 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4359 &BTRFS_I(inode)->runtime_flags))
4359 drop_inode_space = 1; 4360 drop_inode_space = 1;
4360 BTRFS_I(inode)->delalloc_meta_reserved = 0;
4361 }
4362 4361
4363 /* 4362 /*
4364 * If we have more or the same amount of outstanding extents than we have 4363
@@ -4465,7 +4464,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4465 * Add an item to reserve for updating the inode when we complete the 4464 * Add an item to reserve for updating the inode when we complete the
4466 * delalloc io. 4465 * delalloc io.
4467 */ 4466 */
4468 if (!BTRFS_I(inode)->delalloc_meta_reserved) { 4467 if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4468 &BTRFS_I(inode)->runtime_flags)) {
4469 nr_extents++; 4469 nr_extents++;
4470 extra_reserve = 1; 4470 extra_reserve = 1;
4471 } 4471 }
@@ -4511,7 +4511,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4511 4511
4512 spin_lock(&BTRFS_I(inode)->lock); 4512 spin_lock(&BTRFS_I(inode)->lock);
4513 if (extra_reserve) { 4513 if (extra_reserve) {
4514 BTRFS_I(inode)->delalloc_meta_reserved = 1; 4514 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4515 &BTRFS_I(inode)->runtime_flags);
4515 nr_extents--; 4516 nr_extents--;
4516 } 4517 }
4517 BTRFS_I(inode)->reserved_extents += nr_extents; 4518 BTRFS_I(inode)->reserved_extents += nr_extents;
@@ -5217,7 +5218,7 @@ out:
5217void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5218void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5218 struct btrfs_root *root, 5219 struct btrfs_root *root,
5219 struct extent_buffer *buf, 5220 struct extent_buffer *buf,
5220 u64 parent, int last_ref, int for_cow) 5221 u64 parent, int last_ref)
5221{ 5222{
5222 struct btrfs_block_group_cache *cache = NULL; 5223 struct btrfs_block_group_cache *cache = NULL;
5223 int ret; 5224 int ret;
@@ -5227,7 +5228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5227 buf->start, buf->len, 5228 buf->start, buf->len,
5228 parent, root->root_key.objectid, 5229 parent, root->root_key.objectid,
5229 btrfs_header_level(buf), 5230 btrfs_header_level(buf),
5230 BTRFS_DROP_DELAYED_REF, NULL, for_cow); 5231 BTRFS_DROP_DELAYED_REF, NULL, 0);
5231 BUG_ON(ret); /* -ENOMEM */ 5232 BUG_ON(ret); /* -ENOMEM */
5232 } 5233 }
5233 5234
@@ -6249,7 +6250,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6249 struct btrfs_root *root, u32 blocksize, 6250 struct btrfs_root *root, u32 blocksize,
6250 u64 parent, u64 root_objectid, 6251 u64 parent, u64 root_objectid,
6251 struct btrfs_disk_key *key, int level, 6252 struct btrfs_disk_key *key, int level,
6252 u64 hint, u64 empty_size, int for_cow) 6253 u64 hint, u64 empty_size)
6253{ 6254{
6254 struct btrfs_key ins; 6255 struct btrfs_key ins;
6255 struct btrfs_block_rsv *block_rsv; 6256 struct btrfs_block_rsv *block_rsv;
@@ -6297,7 +6298,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6297 ins.objectid, 6298 ins.objectid,
6298 ins.offset, parent, root_objectid, 6299 ins.offset, parent, root_objectid,
6299 level, BTRFS_ADD_DELAYED_EXTENT, 6300 level, BTRFS_ADD_DELAYED_EXTENT,
6300 extent_op, for_cow); 6301 extent_op, 0);
6301 BUG_ON(ret); /* -ENOMEM */ 6302 BUG_ON(ret); /* -ENOMEM */
6302 } 6303 }
6303 return buf; 6304 return buf;
@@ -6715,7 +6716,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6715 btrfs_header_owner(path->nodes[level + 1])); 6716 btrfs_header_owner(path->nodes[level + 1]));
6716 } 6717 }
6717 6718
6718 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); 6719 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6719out: 6720out:
6720 wc->refs[level] = 0; 6721 wc->refs[level] = 0;
6721 wc->flags[level] = 0; 6722 wc->flags[level] = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c9018a05036e..2c8f7b204617 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -186,7 +186,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
186 return parent; 186 return parent;
187 } 187 }
188 188
189 entry = rb_entry(node, struct tree_entry, rb_node);
190 rb_link_node(node, parent, p); 189 rb_link_node(node, parent, p);
191 rb_insert_color(node, root); 190 rb_insert_color(node, root);
192 return NULL; 191 return NULL;
@@ -413,7 +412,7 @@ static struct extent_state *next_state(struct extent_state *state)
413 412
414/* 413/*
415 * utility function to clear some bits in an extent state struct. 414 * utility function to clear some bits in an extent state struct.
416 * it will optionally wake up anyone waiting on this state (wake == 1) 415 * it will optionally wake up anyone waiting on this state (wake == 1).
417 * 416 *
418 * If no bits are set on the state struct after clearing things, the 417 * If no bits are set on the state struct after clearing things, the
419 * struct is freed and removed from the tree 418 * struct is freed and removed from the tree
@@ -570,10 +569,8 @@ hit_next:
570 if (err) 569 if (err)
571 goto out; 570 goto out;
572 if (state->end <= end) { 571 if (state->end <= end) {
573 clear_state_bit(tree, state, &bits, wake); 572 state = clear_state_bit(tree, state, &bits, wake);
574 if (last_end == (u64)-1) 573 goto next;
575 goto out;
576 start = last_end + 1;
577 } 574 }
578 goto search_again; 575 goto search_again;
579 } 576 }
@@ -781,7 +778,6 @@ hit_next:
781 * Just lock what we found and keep going 778 * Just lock what we found and keep going
782 */ 779 */
783 if (state->start == start && state->end <= end) { 780 if (state->start == start && state->end <= end) {
784 struct rb_node *next_node;
785 if (state->state & exclusive_bits) { 781 if (state->state & exclusive_bits) {
786 *failed_start = state->start; 782 *failed_start = state->start;
787 err = -EEXIST; 783 err = -EEXIST;
@@ -789,20 +785,15 @@ hit_next:
789 } 785 }
790 786
791 set_state_bits(tree, state, &bits); 787 set_state_bits(tree, state, &bits);
792
793 cache_state(state, cached_state); 788 cache_state(state, cached_state);
794 merge_state(tree, state); 789 merge_state(tree, state);
795 if (last_end == (u64)-1) 790 if (last_end == (u64)-1)
796 goto out; 791 goto out;
797
798 start = last_end + 1; 792 start = last_end + 1;
799 next_node = rb_next(&state->rb_node); 793 state = next_state(state);
800 if (next_node && start < end && prealloc && !need_resched()) { 794 if (start < end && state && state->start == start &&
801 state = rb_entry(next_node, struct extent_state, 795 !need_resched())
802 rb_node); 796 goto hit_next;
803 if (state->start == start)
804 goto hit_next;
805 }
806 goto search_again; 797 goto search_again;
807 } 798 }
808 799
@@ -845,6 +836,10 @@ hit_next:
845 if (last_end == (u64)-1) 836 if (last_end == (u64)-1)
846 goto out; 837 goto out;
847 start = last_end + 1; 838 start = last_end + 1;
839 state = next_state(state);
840 if (start < end && state && state->start == start &&
841 !need_resched())
842 goto hit_next;
848 } 843 }
849 goto search_again; 844 goto search_again;
850 } 845 }
@@ -994,21 +989,14 @@ hit_next:
994 * Just lock what we found and keep going 989 * Just lock what we found and keep going
995 */ 990 */
996 if (state->start == start && state->end <= end) { 991 if (state->start == start && state->end <= end) {
997 struct rb_node *next_node;
998
999 set_state_bits(tree, state, &bits); 992 set_state_bits(tree, state, &bits);
1000 clear_state_bit(tree, state, &clear_bits, 0); 993 state = clear_state_bit(tree, state, &clear_bits, 0);
1001 if (last_end == (u64)-1) 994 if (last_end == (u64)-1)
1002 goto out; 995 goto out;
1003
1004 start = last_end + 1; 996 start = last_end + 1;
1005 next_node = rb_next(&state->rb_node); 997 if (start < end && state && state->start == start &&
1006 if (next_node && start < end && prealloc && !need_resched()) { 998 !need_resched())
1007 state = rb_entry(next_node, struct extent_state, 999 goto hit_next;
1008 rb_node);
1009 if (state->start == start)
1010 goto hit_next;
1011 }
1012 goto search_again; 1000 goto search_again;
1013 } 1001 }
1014 1002
@@ -1042,10 +1030,13 @@ hit_next:
1042 goto out; 1030 goto out;
1043 if (state->end <= end) { 1031 if (state->end <= end) {
1044 set_state_bits(tree, state, &bits); 1032 set_state_bits(tree, state, &bits);
1045 clear_state_bit(tree, state, &clear_bits, 0); 1033 state = clear_state_bit(tree, state, &clear_bits, 0);
1046 if (last_end == (u64)-1) 1034 if (last_end == (u64)-1)
1047 goto out; 1035 goto out;
1048 start = last_end + 1; 1036 start = last_end + 1;
1037 if (start < end && state && state->start == start &&
1038 !need_resched())
1039 goto hit_next;
1049 } 1040 }
1050 goto search_again; 1041 goto search_again;
1051 } 1042 }
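Several hunks above replace the open-coded rb_next()/rb_entry() lookups with next_state() plus a "goto hit_next" that keeps walking as long as the neighbouring state starts exactly where the previous one ended. A rough, self-contained sketch of that contiguity test, using a plain sorted list in place of the rbtree (names are illustrative):

#include <stdio.h>

struct state {
        unsigned long start, end;       /* inclusive range */
        struct state *next;             /* sorted, non-overlapping */
};

/* process [start, end], hopping to the neighbour only while contiguous */
static void walk(struct state *s, unsigned long start, unsigned long end)
{
        while (s && s->start <= end) {
                printf("touch [%lu, %lu]\n", s->start, s->end);
                start = s->end + 1;
                s = s->next;
                /* the diff's "goto hit_next": continue without a fresh
                 * tree search only if the next state begins at start */
                if (!s || s->start != start)
                        break;          /* would fall back to search_again */
        }
}

int main(void)
{
        struct state c = { 30, 39, NULL }, b = { 20, 29, &c }, a = { 10, 19, &b };

        walk(&a, 10, 39);
        return 0;
}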
@@ -1173,9 +1164,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1173 cached_state, mask); 1164 cached_state, mask);
1174} 1165}
1175 1166
1176static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 1167int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1177 u64 end, struct extent_state **cached_state, 1168 struct extent_state **cached_state, gfp_t mask)
1178 gfp_t mask)
1179{ 1169{
1180 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 1170 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1181 cached_state, mask); 1171 cached_state, mask);
@@ -1293,7 +1283,7 @@ out:
1293 * returned if we find something, and *start_ret and *end_ret are 1283 * returned if we find something, and *start_ret and *end_ret are
1294 * set to reflect the state struct that was found. 1284 * set to reflect the state struct that was found.
1295 * 1285 *
1296 * If nothing was found, 1 is returned, < 0 on error 1286 * If nothing was found, 1 is returned. If something was found, 0 is returned.
1297 */ 1287 */
1298int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1288int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1299 u64 *start_ret, u64 *end_ret, int bits) 1289 u64 *start_ret, u64 *end_ret, int bits)
@@ -1923,6 +1913,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1923 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 1913 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1924 /* try to remap that extent elsewhere? */ 1914 /* try to remap that extent elsewhere? */
1925 bio_put(bio); 1915 bio_put(bio);
1916 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
1926 return -EIO; 1917 return -EIO;
1927 } 1918 }
1928 1919
@@ -2222,17 +2213,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2222 uptodate = 0; 2213 uptodate = 0;
2223 } 2214 }
2224 2215
2225 if (!uptodate && tree->ops &&
2226 tree->ops->writepage_io_failed_hook) {
2227 ret = tree->ops->writepage_io_failed_hook(NULL, page,
2228 start, end, NULL);
2229 /* Writeback already completed */
2230 if (ret == 0)
2231 return 1;
2232 }
2233
2234 if (!uptodate) { 2216 if (!uptodate) {
2235 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
2236 ClearPageUptodate(page); 2217 ClearPageUptodate(page);
2237 SetPageError(page); 2218 SetPageError(page);
2238 } 2219 }
@@ -2347,10 +2328,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2347 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 2328 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2348 ret = tree->ops->readpage_end_io_hook(page, start, end, 2329 ret = tree->ops->readpage_end_io_hook(page, start, end,
2349 state, mirror); 2330 state, mirror);
2350 if (ret) 2331 if (ret) {
2332 /* no I/O error was indicated, but software detected
2333 * errors in the block: either checksum errors or
2334 * issues with the contents */
2335 struct btrfs_root *root =
2336 BTRFS_I(page->mapping->host)->root;
2337 struct btrfs_device *device;
2338
2351 uptodate = 0; 2339 uptodate = 0;
2352 else 2340 device = btrfs_find_device_for_logical(
2341 root, start, mirror);
2342 if (device)
2343 btrfs_dev_stat_inc_and_print(device,
2344 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2345 } else {
2353 clean_io_failure(start, page); 2346 clean_io_failure(start, page);
2347 }
2354 } 2348 }
2355 2349
2356 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { 2350 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
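The read-completion path now distinguishes a clean bio whose contents fail verification (a corruption) from a true I/O error, and charges it to the owning device. A simplified userspace analogue of such per-device, per-type error counters; this is a sketch of the pattern, not the btrfs_dev_stat API:

#include <stdatomic.h>
#include <stdio.h>

enum dev_stat { STAT_WRITE_ERRS, STAT_READ_ERRS, STAT_CORRUPTION_ERRS, STAT_MAX };

struct device {
        const char *name;
        atomic_long stats[STAT_MAX];
};

static void dev_stat_inc_and_print(struct device *dev, enum dev_stat s)
{
        long v = atomic_fetch_add(&dev->stats[s], 1) + 1;

        fprintf(stderr, "%s: error stat %d is now %ld\n", dev->name, s, v);
}

int main(void)
{
        static struct device d = { .name = "sda" };

        /* clean bio but checksum mismatch: corruption, not an I/O error */
        dev_stat_inc_and_print(&d, STAT_CORRUPTION_ERRS);
        return 0;
}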
@@ -3164,7 +3158,7 @@ static int write_one_eb(struct extent_buffer *eb,
3164 u64 offset = eb->start; 3158 u64 offset = eb->start;
3165 unsigned long i, num_pages; 3159 unsigned long i, num_pages;
3166 int rw = (epd->sync_io ? WRITE_SYNC : WRITE); 3160 int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
3167 int ret; 3161 int ret = 0;
3168 3162
3169 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3163 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3170 num_pages = num_extent_pages(eb->start, eb->len); 3164 num_pages = num_extent_pages(eb->start, eb->len);
@@ -3930,6 +3924,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3930 eb->start = start; 3924 eb->start = start;
3931 eb->len = len; 3925 eb->len = len;
3932 eb->tree = tree; 3926 eb->tree = tree;
3927 eb->bflags = 0;
3933 rwlock_init(&eb->lock); 3928 rwlock_init(&eb->lock);
3934 atomic_set(&eb->write_locks, 0); 3929 atomic_set(&eb->write_locks, 0);
3935 atomic_set(&eb->read_locks, 0); 3930 atomic_set(&eb->read_locks, 0);
@@ -3967,6 +3962,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3967 return eb; 3962 return eb;
3968} 3963}
3969 3964
3965struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
3966{
3967 unsigned long i;
3968 struct page *p;
3969 struct extent_buffer *new;
3970 unsigned long num_pages = num_extent_pages(src->start, src->len);
3971
3972 new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
3973 if (new == NULL)
3974 return NULL;
3975
3976 for (i = 0; i < num_pages; i++) {
3977 p = alloc_page(GFP_ATOMIC);
3978 BUG_ON(!p);
3979 attach_extent_buffer_page(new, p);
3980 WARN_ON(PageDirty(p));
3981 SetPageUptodate(p);
3982 new->pages[i] = p;
3983 }
3984
3985 copy_extent_buffer(new, src, 0, 0, src->len);
3986 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
3987 set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
3988
3989 return new;
3990}
3991
3992struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
3993{
3994 struct extent_buffer *eb;
3995 unsigned long num_pages = num_extent_pages(0, len);
3996 unsigned long i;
3997
3998 eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
3999 if (!eb)
4000 return NULL;
4001
4002 for (i = 0; i < num_pages; i++) {
4003 eb->pages[i] = alloc_page(GFP_ATOMIC);
4004 if (!eb->pages[i])
4005 goto err;
4006 }
4007 set_extent_buffer_uptodate(eb);
4008 btrfs_set_header_nritems(eb, 0);
4009 set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4010
4011 return eb;
4012err:
4013 for (; i > 0; i--)
4014 __free_page(eb->pages[i - 1]);
4015 __free_extent_buffer(eb);
4016 return NULL;
4017}
4018
3970static int extent_buffer_under_io(struct extent_buffer *eb) 4019static int extent_buffer_under_io(struct extent_buffer *eb)
3971{ 4020{
3972 return (atomic_read(&eb->io_pages) || 4021 return (atomic_read(&eb->io_pages) ||
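btrfs_clone_extent_buffer and alloc_dummy_extent_buffer above build detached buffers whose pages are never inserted into the page cache or the buffer radix tree; that is what the EXTENT_BUFFER_DUMMY bit lets the later release hunks skip. A stripped-down sketch of the cloning idea, with plain malloc/memcpy standing in for per-page allocation plus copy_extent_buffer:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUF_DUMMY (1UL << 0)    /* no backing cache, never looked up */

struct buffer {
        unsigned long start, len;
        unsigned long flags;
        unsigned char *data;
};

/* copy into private memory so the clone can be read without any locks */
static struct buffer *clone_buffer(const struct buffer *src)
{
        struct buffer *new = malloc(sizeof(*new));

        if (!new)
                return NULL;
        new->start = src->start;
        new->len = src->len;
        new->flags = BUF_DUMMY;
        new->data = malloc(src->len);
        if (!new->data) {
                free(new);
                return NULL;
        }
        memcpy(new->data, src->data, src->len);
        return new;
}

int main(void)
{
        unsigned char raw[16] = "leaf data";
        struct buffer src = { 0, sizeof(raw), 0, raw };
        struct buffer *copy = clone_buffer(&src);

        if (copy) {
                printf("cloned %lu bytes: %s\n", copy->len, copy->data);
                free(copy->data);
                free(copy);
        }
        return 0;
}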
@@ -3981,18 +4030,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3981 unsigned long start_idx) 4030 unsigned long start_idx)
3982{ 4031{
3983 unsigned long index; 4032 unsigned long index;
4033 unsigned long num_pages;
3984 struct page *page; 4034 struct page *page;
4035 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
3985 4036
3986 BUG_ON(extent_buffer_under_io(eb)); 4037 BUG_ON(extent_buffer_under_io(eb));
3987 4038
3988 index = num_extent_pages(eb->start, eb->len); 4039 num_pages = num_extent_pages(eb->start, eb->len);
4040 index = start_idx + num_pages;
3989 if (start_idx >= index) 4041 if (start_idx >= index)
3990 return; 4042 return;
3991 4043
3992 do { 4044 do {
3993 index--; 4045 index--;
3994 page = extent_buffer_page(eb, index); 4046 page = extent_buffer_page(eb, index);
3995 if (page) { 4047 if (page && mapped) {
3996 spin_lock(&page->mapping->private_lock); 4048 spin_lock(&page->mapping->private_lock);
3997 /* 4049 /*
3998 * We do this since we'll remove the pages after we've 4050 * We do this since we'll remove the pages after we've
@@ -4017,6 +4069,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4017 } 4069 }
4018 spin_unlock(&page->mapping->private_lock); 4070 spin_unlock(&page->mapping->private_lock);
4019 4071
4072 }
4073 if (page) {
4020 /* One for when we alloced the page */ 4074 /* One for when we alloced the page */
4021 page_cache_release(page); 4075 page_cache_release(page);
4022 } 4076 }
@@ -4235,14 +4289,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4235{ 4289{
4236 WARN_ON(atomic_read(&eb->refs) == 0); 4290 WARN_ON(atomic_read(&eb->refs) == 0);
4237 if (atomic_dec_and_test(&eb->refs)) { 4291 if (atomic_dec_and_test(&eb->refs)) {
4238 struct extent_io_tree *tree = eb->tree; 4292 if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
4293 spin_unlock(&eb->refs_lock);
4294 } else {
4295 struct extent_io_tree *tree = eb->tree;
4239 4296
4240 spin_unlock(&eb->refs_lock); 4297 spin_unlock(&eb->refs_lock);
4241 4298
4242 spin_lock(&tree->buffer_lock); 4299 spin_lock(&tree->buffer_lock);
4243 radix_tree_delete(&tree->buffer, 4300 radix_tree_delete(&tree->buffer,
4244 eb->start >> PAGE_CACHE_SHIFT); 4301 eb->start >> PAGE_CACHE_SHIFT);
4245 spin_unlock(&tree->buffer_lock); 4302 spin_unlock(&tree->buffer_lock);
4303 }
4246 4304
4247 /* Should be safe to release our pages at this point */ 4305 /* Should be safe to release our pages at this point */
4248 btrfs_release_extent_buffer_page(eb, 0); 4306 btrfs_release_extent_buffer_page(eb, 0);
@@ -4260,6 +4318,10 @@ void free_extent_buffer(struct extent_buffer *eb)
4260 4318
4261 spin_lock(&eb->refs_lock); 4319 spin_lock(&eb->refs_lock);
4262 if (atomic_read(&eb->refs) == 2 && 4320 if (atomic_read(&eb->refs) == 2 &&
4321 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
4322 atomic_dec(&eb->refs);
4323
4324 if (atomic_read(&eb->refs) == 2 &&
4263 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 4325 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
4264 !extent_buffer_under_io(eb) && 4326 !extent_buffer_under_io(eb) &&
4265 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4327 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b516c3b8dec6..25900af5b15d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -39,6 +39,7 @@
39#define EXTENT_BUFFER_STALE 6 39#define EXTENT_BUFFER_STALE 6
40#define EXTENT_BUFFER_WRITEBACK 7 40#define EXTENT_BUFFER_WRITEBACK 7
41#define EXTENT_BUFFER_IOERR 8 41#define EXTENT_BUFFER_IOERR 8
42#define EXTENT_BUFFER_DUMMY 9
42 43
43/* these are flags for extent_clear_unlock_delalloc */ 44/* these are flags for extent_clear_unlock_delalloc */
44#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 45#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -75,9 +76,6 @@ struct extent_io_ops {
75 unsigned long bio_flags); 76 unsigned long bio_flags);
76 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 77 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
77 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); 78 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
78 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
79 u64 start, u64 end,
80 struct extent_state *state);
81 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, 79 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
82 struct extent_state *state, int mirror); 80 struct extent_state *state, int mirror);
83 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 81 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
@@ -225,6 +223,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
225 struct extent_state **cached_state, gfp_t mask); 223 struct extent_state **cached_state, gfp_t mask);
226int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 224int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
227 struct extent_state **cached_state, gfp_t mask); 225 struct extent_state **cached_state, gfp_t mask);
226int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
227 struct extent_state **cached_state, gfp_t mask);
228int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 228int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
229 gfp_t mask); 229 gfp_t mask);
230int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 230int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -265,6 +265,8 @@ void set_page_extent_mapped(struct page *page);
265 265
266struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 266struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
267 u64 start, unsigned long len); 267 u64 start, unsigned long len);
268struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
269struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
268struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 270struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
269 u64 start, unsigned long len); 271 u64 start, unsigned long len);
270void free_extent_buffer(struct extent_buffer *eb); 272void free_extent_buffer(struct extent_buffer *eb);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 53bf2d764bbc..876cddd6b2f0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -65,6 +65,21 @@ struct inode_defrag {
65 int cycled; 65 int cycled;
66}; 66};
67 67
68static int __compare_inode_defrag(struct inode_defrag *defrag1,
69 struct inode_defrag *defrag2)
70{
71 if (defrag1->root > defrag2->root)
72 return 1;
73 else if (defrag1->root < defrag2->root)
74 return -1;
75 else if (defrag1->ino > defrag2->ino)
76 return 1;
77 else if (defrag1->ino < defrag2->ino)
78 return -1;
79 else
80 return 0;
81}
82
68/* put a record for an inode into the defrag tree. The lock 83/* put a record for an inode into the defrag tree. The lock
69 * must be held already 84 * must be held already
70 * 85 *
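With __compare_inode_defrag in place, defrag records are totally ordered by (root, ino), which is what lets btrfs_run_defrag_inodes further down resume its scan from a (root_objectid, first_ino) cursor. A tiny standalone illustration of the same two-level comparison; the struct and qsort usage here are invented for the example:

#include <stdio.h>
#include <stdlib.h>

struct key { unsigned long long root, ino; };

/* same ordering as __compare_inode_defrag: root first, then ino */
static int cmp_key(const void *a, const void *b)
{
        const struct key *x = a, *y = b;

        if (x->root != y->root)
                return x->root > y->root ? 1 : -1;
        if (x->ino != y->ino)
                return x->ino > y->ino ? 1 : -1;
        return 0;
}

int main(void)
{
        struct key k[] = { { 5, 300 }, { 2, 900 }, { 5, 100 } };

        qsort(k, 3, sizeof(k[0]), cmp_key);
        for (int i = 0; i < 3; i++)     /* (2,900) (5,100) (5,300) */
                printf("root %llu ino %llu\n", k[i].root, k[i].ino);
        return 0;
}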
@@ -81,15 +96,17 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
81 struct inode_defrag *entry; 96 struct inode_defrag *entry;
82 struct rb_node **p; 97 struct rb_node **p;
83 struct rb_node *parent = NULL; 98 struct rb_node *parent = NULL;
99 int ret;
84 100
85 p = &root->fs_info->defrag_inodes.rb_node; 101 p = &root->fs_info->defrag_inodes.rb_node;
86 while (*p) { 102 while (*p) {
87 parent = *p; 103 parent = *p;
88 entry = rb_entry(parent, struct inode_defrag, rb_node); 104 entry = rb_entry(parent, struct inode_defrag, rb_node);
89 105
90 if (defrag->ino < entry->ino) 106 ret = __compare_inode_defrag(defrag, entry);
107 if (ret < 0)
91 p = &parent->rb_left; 108 p = &parent->rb_left;
92 else if (defrag->ino > entry->ino) 109 else if (ret > 0)
93 p = &parent->rb_right; 110 p = &parent->rb_right;
94 else { 111 else {
95 /* if we're reinserting an entry for 112 /* if we're reinserting an entry for
@@ -103,7 +120,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
103 goto exists; 120 goto exists;
104 } 121 }
105 } 122 }
106 BTRFS_I(inode)->in_defrag = 1; 123 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
107 rb_link_node(&defrag->rb_node, parent, p); 124 rb_link_node(&defrag->rb_node, parent, p);
108 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 125 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
109 return; 126 return;
@@ -131,7 +148,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
131 if (btrfs_fs_closing(root->fs_info)) 148 if (btrfs_fs_closing(root->fs_info))
132 return 0; 149 return 0;
133 150
134 if (BTRFS_I(inode)->in_defrag) 151 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
135 return 0; 152 return 0;
136 153
137 if (trans) 154 if (trans)
@@ -148,7 +165,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
148 defrag->root = root->root_key.objectid; 165 defrag->root = root->root_key.objectid;
149 166
150 spin_lock(&root->fs_info->defrag_inodes_lock); 167 spin_lock(&root->fs_info->defrag_inodes_lock);
151 if (!BTRFS_I(inode)->in_defrag) 168 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
152 __btrfs_add_inode_defrag(inode, defrag); 169 __btrfs_add_inode_defrag(inode, defrag);
153 else 170 else
154 kfree(defrag); 171 kfree(defrag);
@@ -159,28 +176,35 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
159/* 176/*
160 * must be called with the defrag_inodes lock held 177 * must be called with the defrag_inodes lock held
161 */ 178 */
162struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino, 179struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
180 u64 root, u64 ino,
163 struct rb_node **next) 181 struct rb_node **next)
164{ 182{
165 struct inode_defrag *entry = NULL; 183 struct inode_defrag *entry = NULL;
184 struct inode_defrag tmp;
166 struct rb_node *p; 185 struct rb_node *p;
167 struct rb_node *parent = NULL; 186 struct rb_node *parent = NULL;
187 int ret;
188
189 tmp.ino = ino;
190 tmp.root = root;
168 191
169 p = info->defrag_inodes.rb_node; 192 p = info->defrag_inodes.rb_node;
170 while (p) { 193 while (p) {
171 parent = p; 194 parent = p;
172 entry = rb_entry(parent, struct inode_defrag, rb_node); 195 entry = rb_entry(parent, struct inode_defrag, rb_node);
173 196
174 if (ino < entry->ino) 197 ret = __compare_inode_defrag(&tmp, entry);
198 if (ret < 0)
175 p = parent->rb_left; 199 p = parent->rb_left;
176 else if (ino > entry->ino) 200 else if (ret > 0)
177 p = parent->rb_right; 201 p = parent->rb_right;
178 else 202 else
179 return entry; 203 return entry;
180 } 204 }
181 205
182 if (next) { 206 if (next) {
183 while (parent && ino > entry->ino) { 207 while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
184 parent = rb_next(parent); 208 parent = rb_next(parent);
185 entry = rb_entry(parent, struct inode_defrag, rb_node); 209 entry = rb_entry(parent, struct inode_defrag, rb_node);
186 } 210 }
@@ -202,6 +226,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
202 struct btrfs_key key; 226 struct btrfs_key key;
203 struct btrfs_ioctl_defrag_range_args range; 227 struct btrfs_ioctl_defrag_range_args range;
204 u64 first_ino = 0; 228 u64 first_ino = 0;
229 u64 root_objectid = 0;
205 int num_defrag; 230 int num_defrag;
206 int defrag_batch = 1024; 231 int defrag_batch = 1024;
207 232
@@ -214,11 +239,14 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
214 n = NULL; 239 n = NULL;
215 240
216 /* find an inode to defrag */ 241 /* find an inode to defrag */
217 defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n); 242 defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
243 first_ino, &n);
218 if (!defrag) { 244 if (!defrag) {
219 if (n) 245 if (n) {
220 defrag = rb_entry(n, struct inode_defrag, rb_node); 246 defrag = rb_entry(n, struct inode_defrag,
221 else if (first_ino) { 247 rb_node);
248 } else if (root_objectid || first_ino) {
249 root_objectid = 0;
222 first_ino = 0; 250 first_ino = 0;
223 continue; 251 continue;
224 } else { 252 } else {
@@ -228,6 +256,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
228 256
229 /* remove it from the rbtree */ 257 /* remove it from the rbtree */
230 first_ino = defrag->ino + 1; 258 first_ino = defrag->ino + 1;
259 root_objectid = defrag->root;
231 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); 260 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
232 261
233 if (btrfs_fs_closing(fs_info)) 262 if (btrfs_fs_closing(fs_info))
@@ -252,7 +281,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
252 goto next; 281 goto next;
253 282
254 /* do a chunk of defrag */ 283 /* do a chunk of defrag */
255 BTRFS_I(inode)->in_defrag = 0; 284 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
256 range.start = defrag->last_offset; 285 range.start = defrag->last_offset;
257 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 286 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
258 defrag_batch); 287 defrag_batch);
@@ -1409,7 +1438,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1409 mutex_unlock(&inode->i_mutex); 1438 mutex_unlock(&inode->i_mutex);
1410 goto out; 1439 goto out;
1411 } 1440 }
1412 BTRFS_I(inode)->sequence++;
1413 1441
1414 start_pos = round_down(pos, root->sectorsize); 1442 start_pos = round_down(pos, root->sectorsize);
1415 if (start_pos > i_size_read(inode)) { 1443 if (start_pos > i_size_read(inode)) {
@@ -1466,8 +1494,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1466 * flush down new bytes that may have been written if the 1494 * flush down new bytes that may have been written if the
1467 * application were using truncate to replace a file in place. 1495 * application were using truncate to replace a file in place.
1468 */ 1496 */
1469 if (BTRFS_I(inode)->ordered_data_close) { 1497 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1470 BTRFS_I(inode)->ordered_data_close = 0; 1498 &BTRFS_I(inode)->runtime_flags)) {
1471 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); 1499 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1472 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 1500 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1473 filemap_flush(inode->i_mapping); 1501 filemap_flush(inode->i_mapping);
@@ -1498,14 +1526,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1498 1526
1499 trace_btrfs_sync_file(file, datasync); 1527 trace_btrfs_sync_file(file, datasync);
1500 1528
1501 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1502 if (ret)
1503 return ret;
1504 mutex_lock(&inode->i_mutex); 1529 mutex_lock(&inode->i_mutex);
1505 1530
1506 /* we wait first, since the writeback may change the inode */ 1531 /*
1532 * we wait first, since the writeback may change the inode; also,
1533 * btrfs_wait_ordered_range does a filemap_write_and_wait_range, which
1534 * is why we don't do it above like other file systems.
1535 */
1507 root->log_batch++; 1536 root->log_batch++;
1508 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1537 btrfs_wait_ordered_range(inode, start, end);
1509 root->log_batch++; 1538 root->log_batch++;
1510 1539
1511 /* 1540 /*
@@ -1523,7 +1552,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1523 * syncing 1552 * syncing
1524 */ 1553 */
1525 smp_mb(); 1554 smp_mb();
1526 if (BTRFS_I(inode)->last_trans <= 1555 if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
1556 BTRFS_I(inode)->last_trans <=
1527 root->fs_info->last_trans_committed) { 1557 root->fs_info->last_trans_committed) {
1528 BTRFS_I(inode)->last_trans = 0; 1558 BTRFS_I(inode)->last_trans = 0;
1529 mutex_unlock(&inode->i_mutex); 1559 mutex_unlock(&inode->i_mutex);
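The rewritten fsync waits on exactly the [start, end] range and can bail out early when the inode is already captured by the current log or unchanged since the last committed transaction. A hedged sketch of that skip test, with made-up field and helper names:

#include <stdbool.h>
#include <stdio.h>

struct inode_meta {
        unsigned long long last_trans;   /* transaction that last changed it */
        unsigned long long logged_trans; /* transaction it was last logged in */
};

/* hypothetical helper mirroring the skip test: nothing to sync if the last
 * change is already committed, or the inode is already in the current log */
static bool fsync_can_skip(const struct inode_meta *im,
                           unsigned long long last_committed,
                           unsigned long long cur_generation)
{
        if (im->logged_trans == cur_generation)
                return true;
        return im->last_trans <= last_committed;
}

int main(void)
{
        struct inode_meta im = { .last_trans = 7, .logged_trans = 0 };

        printf("%d\n", fsync_can_skip(&im, 9, 10));   /* 1: nothing to do */
        return 0;
}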
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 202008ec367d..19a0d85b451c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -33,6 +33,8 @@
33 33
34static int link_free_space(struct btrfs_free_space_ctl *ctl, 34static int link_free_space(struct btrfs_free_space_ctl *ctl,
35 struct btrfs_free_space *info); 35 struct btrfs_free_space *info);
36static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
37 struct btrfs_free_space *info);
36 38
37static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 39static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
38 struct btrfs_path *path, 40 struct btrfs_path *path,
@@ -584,6 +586,44 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
584 return 0; 586 return 0;
585} 587}
586 588
589/*
590 * Since we attach pinned extents after the fact, we can have contiguous
591 * sections of free space that are split up into separate entries. This poses
592 * a problem for tree logging: an allocation could span what now looks like
593 * two entries, since the entries would have been merged when the pinned
594 * extents were added back to the free space cache. So run through the space
595 * cache we just loaded and merge contiguous entries. This keeps log replay
596 * from blowing up and makes for nicer allocator behavior.
597 */
598static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
599{
600 struct btrfs_free_space *e, *prev = NULL;
601 struct rb_node *n;
602
603again:
604 spin_lock(&ctl->tree_lock);
605 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
606 e = rb_entry(n, struct btrfs_free_space, offset_index);
607 if (!prev)
608 goto next;
609 if (e->bitmap || prev->bitmap)
610 goto next;
611 if (prev->offset + prev->bytes == e->offset) {
612 unlink_free_space(ctl, prev);
613 unlink_free_space(ctl, e);
614 prev->bytes += e->bytes;
615 kmem_cache_free(btrfs_free_space_cachep, e);
616 link_free_space(ctl, prev);
617 prev = NULL;
618 spin_unlock(&ctl->tree_lock);
619 goto again;
620 }
621next:
622 prev = e;
623 }
624 spin_unlock(&ctl->tree_lock);
625}
626
587int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 627int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
588 struct btrfs_free_space_ctl *ctl, 628 struct btrfs_free_space_ctl *ctl,
589 struct btrfs_path *path, u64 offset) 629 struct btrfs_path *path, u64 offset)
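merge_space_tree restarts its rbtree walk from the top after every merge because it drops tree_lock; the merge condition itself is simply that one entry ends exactly where the next begins. The same coalescing over a sorted array, as a self-contained sketch:

#include <stdio.h>

struct space { unsigned long long offset, bytes; };

/* coalesce adjacent entries of a sorted array in place; returns new count */
static int merge_entries(struct space *s, int n)
{
        int out = 0;

        for (int i = 1; i < n; i++) {
                if (s[out].offset + s[out].bytes == s[i].offset)
                        s[out].bytes += s[i].bytes;     /* contiguous: merge */
                else
                        s[++out] = s[i];
        }
        return n ? out + 1 : 0;
}

int main(void)
{
        struct space s[] = { { 0, 4096 }, { 4096, 4096 }, { 16384, 4096 } };
        int n = merge_entries(s, 3);

        for (int i = 0; i < n; i++)     /* [0,+8192] [16384,+4096] */
                printf("[%llu, +%llu]\n", s[i].offset, s[i].bytes);
        return 0;
}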
@@ -726,6 +766,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
726 } 766 }
727 767
728 io_ctl_drop_pages(&io_ctl); 768 io_ctl_drop_pages(&io_ctl);
769 merge_space_tree(ctl);
729 ret = 1; 770 ret = 1;
730out: 771out:
731 io_ctl_free(&io_ctl); 772 io_ctl_free(&io_ctl);
@@ -972,9 +1013,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
972 goto out; 1013 goto out;
973 1014
974 1015
975 ret = filemap_write_and_wait(inode->i_mapping); 1016 btrfs_wait_ordered_range(inode, 0, (u64)-1);
976 if (ret)
977 goto out;
978 1017
979 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1018 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
980 key.offset = offset; 1019 key.offset = offset;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ceb7b9c9edcc..e9991adc0960 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -89,7 +89,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
89 89
90static int btrfs_setsize(struct inode *inode, loff_t newsize); 90static int btrfs_setsize(struct inode *inode, loff_t newsize);
91static int btrfs_truncate(struct inode *inode); 91static int btrfs_truncate(struct inode *inode);
92static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); 92static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
93static noinline int cow_file_range(struct inode *inode, 93static noinline int cow_file_range(struct inode *inode,
94 struct page *locked_page, 94 struct page *locked_page,
95 u64 start, u64 end, int *page_started, 95 u64 start, u64 end, int *page_started,
@@ -257,10 +257,13 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
257 ret = insert_inline_extent(trans, root, inode, start, 257 ret = insert_inline_extent(trans, root, inode, start,
258 inline_len, compressed_size, 258 inline_len, compressed_size,
259 compress_type, compressed_pages); 259 compress_type, compressed_pages);
260 if (ret) { 260 if (ret && ret != -ENOSPC) {
261 btrfs_abort_transaction(trans, root, ret); 261 btrfs_abort_transaction(trans, root, ret);
262 return ret; 262 return ret;
263 } else if (ret == -ENOSPC) {
264 return 1;
263 } 265 }
266
264 btrfs_delalloc_release_metadata(inode, end + 1 - start); 267 btrfs_delalloc_release_metadata(inode, end + 1 - start);
265 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 268 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
266 return 0; 269 return 0;
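Note the new convention in cow_file_range_inline: -ENOSPC from the inline insertion is swallowed and reported as 1, telling the caller to fall back to a regular extent rather than aborting the transaction. A compact restatement of that tri-state return, assuming only <errno.h>:

#include <errno.h>
#include <stdio.h>

/* tri-state from the hunk: <0 hard error, 0 inlined, 1 fall back */
static int classify_inline_result(int insert_ret)
{
        if (insert_ret && insert_ret != -ENOSPC)
                return insert_ret;      /* real failure: caller aborts */
        if (insert_ret == -ENOSPC)
                return 1;               /* no room inline: normal extent */
        return 0;                       /* data now lives in the btree leaf */
}

int main(void)
{
        printf("%d %d %d\n", classify_inline_result(0),
               classify_inline_result(-ENOSPC), classify_inline_result(-EIO));
        return 0;
}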
@@ -1572,11 +1575,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1572 if (btrfs_is_free_space_inode(root, inode)) 1575 if (btrfs_is_free_space_inode(root, inode))
1573 metadata = 2; 1576 metadata = 2;
1574 1577
1575 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1576 if (ret)
1577 return ret;
1578
1579 if (!(rw & REQ_WRITE)) { 1578 if (!(rw & REQ_WRITE)) {
1579 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1580 if (ret)
1581 return ret;
1582
1580 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1583 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1581 return btrfs_submit_compressed_read(inode, bio, 1584 return btrfs_submit_compressed_read(inode, bio,
1582 mirror_num, bio_flags); 1585 mirror_num, bio_flags);
@@ -1815,25 +1818,24 @@ out:
1815 * an ordered extent if the range of bytes in the file it covers are 1818 * an ordered extent if the range of bytes in the file it covers are
1816 * fully written. 1819 * fully written.
1817 */ 1820 */
1818static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1821static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1819{ 1822{
1823 struct inode *inode = ordered_extent->inode;
1820 struct btrfs_root *root = BTRFS_I(inode)->root; 1824 struct btrfs_root *root = BTRFS_I(inode)->root;
1821 struct btrfs_trans_handle *trans = NULL; 1825 struct btrfs_trans_handle *trans = NULL;
1822 struct btrfs_ordered_extent *ordered_extent = NULL;
1823 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1826 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1824 struct extent_state *cached_state = NULL; 1827 struct extent_state *cached_state = NULL;
1825 int compress_type = 0; 1828 int compress_type = 0;
1826 int ret; 1829 int ret;
1827 bool nolock; 1830 bool nolock;
1828 1831
1829 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1830 end - start + 1);
1831 if (!ret)
1832 return 0;
1833 BUG_ON(!ordered_extent); /* Logic error */
1834
1835 nolock = btrfs_is_free_space_inode(root, inode); 1832 nolock = btrfs_is_free_space_inode(root, inode);
1836 1833
1834 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
1835 ret = -EIO;
1836 goto out;
1837 }
1838
1837 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1839 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1838 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1840 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
1839 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1841 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
@@ -1889,12 +1891,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1889 ordered_extent->file_offset, 1891 ordered_extent->file_offset,
1890 ordered_extent->len); 1892 ordered_extent->len);
1891 } 1893 }
1892 unlock_extent_cached(io_tree, ordered_extent->file_offset, 1894
1893 ordered_extent->file_offset +
1894 ordered_extent->len - 1, &cached_state, GFP_NOFS);
1895 if (ret < 0) { 1895 if (ret < 0) {
1896 btrfs_abort_transaction(trans, root, ret); 1896 btrfs_abort_transaction(trans, root, ret);
1897 goto out; 1897 goto out_unlock;
1898 } 1898 }
1899 1899
1900 add_pending_csums(trans, inode, ordered_extent->file_offset, 1900 add_pending_csums(trans, inode, ordered_extent->file_offset,
@@ -1905,10 +1905,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1905 ret = btrfs_update_inode_fallback(trans, root, inode); 1905 ret = btrfs_update_inode_fallback(trans, root, inode);
1906 if (ret) { /* -ENOMEM or corruption */ 1906 if (ret) { /* -ENOMEM or corruption */
1907 btrfs_abort_transaction(trans, root, ret); 1907 btrfs_abort_transaction(trans, root, ret);
1908 goto out; 1908 goto out_unlock;
1909 } 1909 }
1910 } 1910 }
1911 ret = 0; 1911 ret = 0;
1912out_unlock:
1913 unlock_extent_cached(io_tree, ordered_extent->file_offset,
1914 ordered_extent->file_offset +
1915 ordered_extent->len - 1, &cached_state, GFP_NOFS);
1912out: 1916out:
1913 if (root != root->fs_info->tree_root) 1917 if (root != root->fs_info->tree_root)
1914 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1918 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
@@ -1919,26 +1923,57 @@ out:
1919 btrfs_end_transaction(trans, root); 1923 btrfs_end_transaction(trans, root);
1920 } 1924 }
1921 1925
1926 if (ret)
1927 clear_extent_uptodate(io_tree, ordered_extent->file_offset,
1928 ordered_extent->file_offset +
1929 ordered_extent->len - 1, NULL, GFP_NOFS);
1930
1931 /*
1932 * This needs to be done to make sure anybody waiting knows we are done
1933 * updating everything for this ordered extent.
1934 */
1935 btrfs_remove_ordered_extent(inode, ordered_extent);
1936
1922 /* once for us */ 1937 /* once for us */
1923 btrfs_put_ordered_extent(ordered_extent); 1938 btrfs_put_ordered_extent(ordered_extent);
1924 /* once for the tree */ 1939 /* once for the tree */
1925 btrfs_put_ordered_extent(ordered_extent); 1940 btrfs_put_ordered_extent(ordered_extent);
1926 1941
1927 return 0; 1942 return ret;
1928out_unlock: 1943}
1929 unlock_extent_cached(io_tree, ordered_extent->file_offset, 1944
1930 ordered_extent->file_offset + 1945static void finish_ordered_fn(struct btrfs_work *work)
1931 ordered_extent->len - 1, &cached_state, GFP_NOFS); 1946{
1932 goto out; 1947 struct btrfs_ordered_extent *ordered_extent;
1948 ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
1949 btrfs_finish_ordered_io(ordered_extent);
1933} 1950}
1934 1951
1935static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1952static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1936 struct extent_state *state, int uptodate) 1953 struct extent_state *state, int uptodate)
1937{ 1954{
1955 struct inode *inode = page->mapping->host;
1956 struct btrfs_root *root = BTRFS_I(inode)->root;
1957 struct btrfs_ordered_extent *ordered_extent = NULL;
1958 struct btrfs_workers *workers;
1959
1938 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 1960 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
1939 1961
1940 ClearPagePrivate2(page); 1962 ClearPagePrivate2(page);
1941 return btrfs_finish_ordered_io(page->mapping->host, start, end); 1963 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1964 end - start + 1, uptodate))
1965 return 0;
1966
1967 ordered_extent->work.func = finish_ordered_fn;
1968 ordered_extent->work.flags = 0;
1969
1970 if (btrfs_is_free_space_inode(root, inode))
1971 workers = &root->fs_info->endio_freespace_worker;
1972 else
1973 workers = &root->fs_info->endio_write_workers;
1974 btrfs_queue_worker(workers, &ordered_extent->work);
1975
1976 return 0;
1942} 1977}
1943 1978
1944/* 1979/*
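The end-io hook above no longer calls btrfs_finish_ordered_io directly; it stuffs a work function into the ordered extent and queues it, since finishing may need to join a transaction, which is not safe from bio completion context. A userspace sketch of the embedded-work-item / container_of pattern the diff relies on:

#include <stddef.h>
#include <stdio.h>

struct work_item { void (*func)(struct work_item *); };

struct ordered_extent {
        unsigned long long file_offset, len;
        struct work_item work;          /* embedded work item */
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static void finish_ordered_fn(struct work_item *w)
{
        struct ordered_extent *oe =
                container_of(w, struct ordered_extent, work);

        printf("finishing [%llu, +%llu] in worker context\n",
               oe->file_offset, oe->len);
}

int main(void)
{
        struct ordered_extent oe = { 0, 4096, { finish_ordered_fn } };

        /* a real queue would hand this off to a worker thread */
        oe.work.func(&oe.work);
        return 0;
}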
@@ -2072,12 +2107,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2072 struct btrfs_block_rsv *block_rsv; 2107 struct btrfs_block_rsv *block_rsv;
2073 int ret; 2108 int ret;
2074 2109
2075 if (!list_empty(&root->orphan_list) || 2110 if (atomic_read(&root->orphan_inodes) ||
2076 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 2111 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2077 return; 2112 return;
2078 2113
2079 spin_lock(&root->orphan_lock); 2114 spin_lock(&root->orphan_lock);
2080 if (!list_empty(&root->orphan_list)) { 2115 if (atomic_read(&root->orphan_inodes)) {
2081 spin_unlock(&root->orphan_lock); 2116 spin_unlock(&root->orphan_lock);
2082 return; 2117 return;
2083 } 2118 }
@@ -2134,8 +2169,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2134 block_rsv = NULL; 2169 block_rsv = NULL;
2135 } 2170 }
2136 2171
2137 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2172 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2138 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2173 &BTRFS_I(inode)->runtime_flags)) {
2139#if 0 2174#if 0
2140 /* 2175 /*
2141 * For proper ENOSPC handling, we should do orphan 2176 * For proper ENOSPC handling, we should do orphan
@@ -2148,12 +2183,12 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2148 insert = 1; 2183 insert = 1;
2149#endif 2184#endif
2150 insert = 1; 2185 insert = 1;
2186 atomic_dec(&root->orphan_inodes);
2151 } 2187 }
2152 2188
2153 if (!BTRFS_I(inode)->orphan_meta_reserved) { 2189 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2154 BTRFS_I(inode)->orphan_meta_reserved = 1; 2190 &BTRFS_I(inode)->runtime_flags))
2155 reserve = 1; 2191 reserve = 1;
2156 }
2157 spin_unlock(&root->orphan_lock); 2192 spin_unlock(&root->orphan_lock);
2158 2193
2159 /* grab metadata reservation from transaction handle */ 2194 /* grab metadata reservation from transaction handle */
@@ -2166,6 +2201,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2166 if (insert >= 1) { 2201 if (insert >= 1) {
2167 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2202 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2168 if (ret && ret != -EEXIST) { 2203 if (ret && ret != -EEXIST) {
2204 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2205 &BTRFS_I(inode)->runtime_flags);
2169 btrfs_abort_transaction(trans, root, ret); 2206 btrfs_abort_transaction(trans, root, ret);
2170 return ret; 2207 return ret;
2171 } 2208 }
@@ -2196,15 +2233,13 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2196 int ret = 0; 2233 int ret = 0;
2197 2234
2198 spin_lock(&root->orphan_lock); 2235 spin_lock(&root->orphan_lock);
2199 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2236 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2200 list_del_init(&BTRFS_I(inode)->i_orphan); 2237 &BTRFS_I(inode)->runtime_flags))
2201 delete_item = 1; 2238 delete_item = 1;
2202 }
2203 2239
2204 if (BTRFS_I(inode)->orphan_meta_reserved) { 2240 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2205 BTRFS_I(inode)->orphan_meta_reserved = 0; 2241 &BTRFS_I(inode)->runtime_flags))
2206 release_rsv = 1; 2242 release_rsv = 1;
2207 }
2208 spin_unlock(&root->orphan_lock); 2243 spin_unlock(&root->orphan_lock);
2209 2244
2210 if (trans && delete_item) { 2245 if (trans && delete_item) {
@@ -2212,8 +2247,10 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2212 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2247 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
2213 } 2248 }
2214 2249
2215 if (release_rsv) 2250 if (release_rsv) {
2216 btrfs_orphan_release_metadata(inode); 2251 btrfs_orphan_release_metadata(inode);
2252 atomic_dec(&root->orphan_inodes);
2253 }
2217 2254
2218 return 0; 2255 return 0;
2219} 2256}
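Orphan bookkeeping moves from a per-root list to an atomic orphan_inodes count paired with per-inode bits, so add and delete become idempotent. The sketch below shows only the generic count-plus-flag pairing; the commit's actual increment/decrement sites span code outside these hunks:

#include <stdatomic.h>
#include <stdio.h>

#define HAS_ORPHAN_ITEM 0

static atomic_int orphan_inodes;        /* per-root count in the diff */
static atomic_ulong inode_flags;

static void orphan_add(void)
{
        unsigned long m = 1UL << HAS_ORPHAN_ITEM;

        if (!(atomic_fetch_or(&inode_flags, m) & m))
                atomic_fetch_add(&orphan_inodes, 1);    /* first add only */
}

static void orphan_del(void)
{
        unsigned long m = 1UL << HAS_ORPHAN_ITEM;

        if (atomic_fetch_and(&inode_flags, ~m) & m)
                atomic_fetch_sub(&orphan_inodes, 1);    /* paired release */
}

int main(void)
{
        orphan_add();
        orphan_add();                   /* idempotent: counted once */
        orphan_del();
        printf("%d\n", atomic_load(&orphan_inodes));    /* 0 */
        return 0;
}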
@@ -2341,6 +2378,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2341 ret = PTR_ERR(trans); 2378 ret = PTR_ERR(trans);
2342 goto out; 2379 goto out;
2343 } 2380 }
2381 printk(KERN_ERR "auto deleting %Lu\n",
2382 found_key.objectid);
2344 ret = btrfs_del_orphan_item(trans, root, 2383 ret = btrfs_del_orphan_item(trans, root,
2345 found_key.objectid); 2384 found_key.objectid);
2346 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2385 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
@@ -2352,9 +2391,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2352 * add this inode to the orphan list so btrfs_orphan_del does 2391 * add this inode to the orphan list so btrfs_orphan_del does
2353 * the proper thing when we hit it 2392 * the proper thing when we hit it
2354 */ 2393 */
2355 spin_lock(&root->orphan_lock); 2394 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2356 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2395 &BTRFS_I(inode)->runtime_flags);
2357 spin_unlock(&root->orphan_lock);
2358 2396
2359 /* if we have links, this was a truncate, lets do that */ 2397 /* if we have links, this was a truncate, lets do that */
2360 if (inode->i_nlink) { 2398 if (inode->i_nlink) {
@@ -2510,7 +2548,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
2510 2548
2511 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2549 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2512 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2550 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2513 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); 2551 inode->i_version = btrfs_inode_sequence(leaf, inode_item);
2514 inode->i_generation = BTRFS_I(inode)->generation; 2552 inode->i_generation = BTRFS_I(inode)->generation;
2515 inode->i_rdev = 0; 2553 inode->i_rdev = 0;
2516 rdev = btrfs_inode_rdev(leaf, inode_item); 2554 rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -2594,7 +2632,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2594 2632
2595 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2633 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2596 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 2634 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2597 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); 2635 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2598 btrfs_set_inode_transid(leaf, item, trans->transid); 2636 btrfs_set_inode_transid(leaf, item, trans->transid);
2599 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2637 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2600 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2638 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
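The on-disk sequence field now persists the VFS i_version, and the hunks that follow bump it with inode_inc_iversion on every metadata change (unlink, link, setattr, and so on). Semantically it is nothing more than a monotonic change counter, as this minimal sketch shows:

#include <stdio.h>

struct vfs_inode { unsigned long long i_version; };

/* bump on every metadata change so clients (NFS, for one) can detect
 * staleness; btrfs now persists this in the inode item's sequence field */
static void inc_iversion(struct vfs_inode *inode)
{
        inode->i_version++;
}

int main(void)
{
        struct vfs_inode ino = { 0 };

        inc_iversion(&ino);             /* e.g. after an unlink */
        inc_iversion(&ino);             /* e.g. after a setattr */
        printf("i_version %llu\n", ino.i_version);
        return 0;
}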
@@ -2752,6 +2790,8 @@ err:
2752 goto out; 2790 goto out;
2753 2791
2754 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2792 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2793 inode_inc_iversion(inode);
2794 inode_inc_iversion(dir);
2755 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2795 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2756 btrfs_update_inode(trans, root, dir); 2796 btrfs_update_inode(trans, root, dir);
2757out: 2797out:
@@ -3089,6 +3129,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3089 } 3129 }
3090 3130
3091 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3131 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3132 inode_inc_iversion(dir);
3092 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3133 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3093 ret = btrfs_update_inode(trans, root, dir); 3134 ret = btrfs_update_inode(trans, root, dir);
3094 if (ret) 3135 if (ret)
@@ -3607,7 +3648,8 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3607 * any new writes get down to disk quickly. 3648 * any new writes get down to disk quickly.
3608 */ 3649 */
3609 if (newsize == 0) 3650 if (newsize == 0)
3610 BTRFS_I(inode)->ordered_data_close = 1; 3651 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
3652 &BTRFS_I(inode)->runtime_flags);
3611 3653
3612 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3654 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3613 truncate_setsize(inode, newsize); 3655 truncate_setsize(inode, newsize);
@@ -3638,6 +3680,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3638 3680
3639 if (attr->ia_valid) { 3681 if (attr->ia_valid) {
3640 setattr_copy(inode, attr); 3682 setattr_copy(inode, attr);
3683 inode_inc_iversion(inode);
3641 err = btrfs_dirty_inode(inode); 3684 err = btrfs_dirty_inode(inode);
3642 3685
3643 if (!err && attr->ia_valid & ATTR_MODE) 3686 if (!err && attr->ia_valid & ATTR_MODE)
@@ -3671,7 +3714,8 @@ void btrfs_evict_inode(struct inode *inode)
3671 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3714 btrfs_wait_ordered_range(inode, 0, (u64)-1);
3672 3715
3673 if (root->fs_info->log_root_recovering) { 3716 if (root->fs_info->log_root_recovering) {
3674 BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); 3717 BUG_ON(!test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3718 &BTRFS_I(inode)->runtime_flags));
3675 goto no_delete; 3719 goto no_delete;
3676 } 3720 }
3677 3721
@@ -4066,7 +4110,7 @@ static struct inode *new_simple_dir(struct super_block *s,
4066 4110
4067 BTRFS_I(inode)->root = root; 4111 BTRFS_I(inode)->root = root;
4068 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4112 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
4069 BTRFS_I(inode)->dummy_inode = 1; 4113 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
4070 4114
4071 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 4115 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
4072 inode->i_op = &btrfs_dir_ro_inode_operations; 4116 inode->i_op = &btrfs_dir_ro_inode_operations;
@@ -4370,7 +4414,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4370 int ret = 0; 4414 int ret = 0;
4371 bool nolock = false; 4415 bool nolock = false;
4372 4416
4373 if (BTRFS_I(inode)->dummy_inode) 4417 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
4374 return 0; 4418 return 0;
4375 4419
4376 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) 4420 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
@@ -4403,7 +4447,7 @@ int btrfs_dirty_inode(struct inode *inode)
4403 struct btrfs_trans_handle *trans; 4447 struct btrfs_trans_handle *trans;
4404 int ret; 4448 int ret;
4405 4449
4406 if (BTRFS_I(inode)->dummy_inode) 4450 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
4407 return 0; 4451 return 0;
4408 4452
4409 trans = btrfs_join_transaction(root); 4453 trans = btrfs_join_transaction(root);
@@ -4730,6 +4774,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4730 4774
4731 btrfs_i_size_write(parent_inode, parent_inode->i_size + 4775 btrfs_i_size_write(parent_inode, parent_inode->i_size +
4732 name_len * 2); 4776 name_len * 2);
4777 inode_inc_iversion(parent_inode);
4733 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 4778 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
4734 ret = btrfs_update_inode(trans, root, parent_inode); 4779 ret = btrfs_update_inode(trans, root, parent_inode);
4735 if (ret) 4780 if (ret)
@@ -4937,6 +4982,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4937 } 4982 }
4938 4983
4939 btrfs_inc_nlink(inode); 4984 btrfs_inc_nlink(inode);
4985 inode_inc_iversion(inode);
4940 inode->i_ctime = CURRENT_TIME; 4986 inode->i_ctime = CURRENT_TIME;
4941 ihold(inode); 4987 ihold(inode);
4942 4988
@@ -5903,9 +5949,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5903 struct btrfs_dio_private *dip = bio->bi_private; 5949 struct btrfs_dio_private *dip = bio->bi_private;
5904 struct inode *inode = dip->inode; 5950 struct inode *inode = dip->inode;
5905 struct btrfs_root *root = BTRFS_I(inode)->root; 5951 struct btrfs_root *root = BTRFS_I(inode)->root;
5906 struct btrfs_trans_handle *trans;
5907 struct btrfs_ordered_extent *ordered = NULL; 5952 struct btrfs_ordered_extent *ordered = NULL;
5908 struct extent_state *cached_state = NULL;
5909 u64 ordered_offset = dip->logical_offset; 5953 u64 ordered_offset = dip->logical_offset;
5910 u64 ordered_bytes = dip->bytes; 5954 u64 ordered_bytes = dip->bytes;
5911 int ret; 5955 int ret;
@@ -5915,73 +5959,14 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5915again: 5959again:
5916 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, 5960 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
5917 &ordered_offset, 5961 &ordered_offset,
5918 ordered_bytes); 5962 ordered_bytes, !err);
5919 if (!ret) 5963 if (!ret)
5920 goto out_test; 5964 goto out_test;
5921 5965
5922 BUG_ON(!ordered); 5966 ordered->work.func = finish_ordered_fn;
5923 5967 ordered->work.flags = 0;
5924 trans = btrfs_join_transaction(root); 5968 btrfs_queue_worker(&root->fs_info->endio_write_workers,
5925 if (IS_ERR(trans)) { 5969 &ordered->work);
5926 err = -ENOMEM;
5927 goto out;
5928 }
5929 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5930
5931 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5932 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5933 if (!ret)
5934 err = btrfs_update_inode_fallback(trans, root, inode);
5935 goto out;
5936 }
5937
5938 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5939 ordered->file_offset + ordered->len - 1, 0,
5940 &cached_state);
5941
5942 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5943 ret = btrfs_mark_extent_written(trans, inode,
5944 ordered->file_offset,
5945 ordered->file_offset +
5946 ordered->len);
5947 if (ret) {
5948 err = ret;
5949 goto out_unlock;
5950 }
5951 } else {
5952 ret = insert_reserved_file_extent(trans, inode,
5953 ordered->file_offset,
5954 ordered->start,
5955 ordered->disk_len,
5956 ordered->len,
5957 ordered->len,
5958 0, 0, 0,
5959 BTRFS_FILE_EXTENT_REG);
5960 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5961 ordered->file_offset, ordered->len);
5962 if (ret) {
5963 err = ret;
5964 WARN_ON(1);
5965 goto out_unlock;
5966 }
5967 }
5968
5969 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5970 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5971 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5972 btrfs_update_inode_fallback(trans, root, inode);
5973 ret = 0;
5974out_unlock:
5975 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5976 ordered->file_offset + ordered->len - 1,
5977 &cached_state, GFP_NOFS);
5978out:
5979 btrfs_delalloc_release_metadata(inode, ordered->len);
5980 btrfs_end_transaction(trans, root);
5981 ordered_offset = ordered->file_offset + ordered->len;
5982 btrfs_put_ordered_extent(ordered);
5983 btrfs_put_ordered_extent(ordered);
5984
5985out_test: 5970out_test:
5986 /* 5971 /*
5987 * our bio might span multiple ordered extents. If we haven't 5972 * our bio might span multiple ordered extents. If we haven't
@@ -5990,12 +5975,12 @@ out_test:
5990 if (ordered_offset < dip->logical_offset + dip->bytes) { 5975 if (ordered_offset < dip->logical_offset + dip->bytes) {
5991 ordered_bytes = dip->logical_offset + dip->bytes - 5976 ordered_bytes = dip->logical_offset + dip->bytes -
5992 ordered_offset; 5977 ordered_offset;
5978 ordered = NULL;
5993 goto again; 5979 goto again;
5994 } 5980 }
5995out_done: 5981out_done:
5996 bio->bi_private = dip->private; 5982 bio->bi_private = dip->private;
5997 5983
5998 kfree(dip->csums);
5999 kfree(dip); 5984 kfree(dip);
6000 5985
6001 /* If we had an error make sure to clear the uptodate flag */ 5986 /* If we had an error make sure to clear the uptodate flag */
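A single direct-IO bio can complete several back-to-back ordered extents, which is what the "goto again" loop above handles: after each completed extent, ordered is reset to NULL and the remaining range is retried. A toy, self-contained version of that loop; the 4096-byte stride and the helper name are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

/* stub: pretend every ordered extent covers 4096 bytes of the bio */
static bool dec_test_first_pending(unsigned long long *off,
                                   unsigned long long end)
{
        if (*off >= end)
                return false;
        printf("queueing completion for extent at %llu\n", *off);
        *off += 4096;           /* advance past the finished extent */
        return true;
}

int main(void)
{
        unsigned long long off = 0, end = 12288;

        /* the diff's "goto again" loop: one bio, three ordered extents */
        while (dec_test_first_pending(&off, end))
                ;
        return 0;
}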
@@ -6063,9 +6048,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6063 int ret; 6048 int ret;
6064 6049
6065 bio_get(bio); 6050 bio_get(bio);
6066 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 6051
6067 if (ret) 6052 if (!write) {
6068 goto err; 6053 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
6054 if (ret)
6055 goto err;
6056 }
6069 6057
6070 if (skip_sum) 6058 if (skip_sum)
6071 goto map; 6059 goto map;
@@ -6485,13 +6473,13 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
6485 6473
6486static void btrfs_invalidatepage(struct page *page, unsigned long offset) 6474static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6487{ 6475{
6476 struct inode *inode = page->mapping->host;
6488 struct extent_io_tree *tree; 6477 struct extent_io_tree *tree;
6489 struct btrfs_ordered_extent *ordered; 6478 struct btrfs_ordered_extent *ordered;
6490 struct extent_state *cached_state = NULL; 6479 struct extent_state *cached_state = NULL;
6491 u64 page_start = page_offset(page); 6480 u64 page_start = page_offset(page);
6492 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 6481 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
6493 6482
6494
6495 /* 6483 /*
6496 * we have the page locked, so new writeback can't start, 6484 * we have the page locked, so new writeback can't start,
6497 * and the dirty bit won't be cleared while we are here. 6485 * and the dirty bit won't be cleared while we are here.
@@ -6501,13 +6489,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6501 */ 6489 */
6502 wait_on_page_writeback(page); 6490 wait_on_page_writeback(page);
6503 6491
6504 tree = &BTRFS_I(page->mapping->host)->io_tree; 6492 tree = &BTRFS_I(inode)->io_tree;
6505 if (offset) { 6493 if (offset) {
6506 btrfs_releasepage(page, GFP_NOFS); 6494 btrfs_releasepage(page, GFP_NOFS);
6507 return; 6495 return;
6508 } 6496 }
6509 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 6497 lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
6510 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 6498 ordered = btrfs_lookup_ordered_extent(inode,
6511 page_offset(page)); 6499 page_offset(page));
6512 if (ordered) { 6500 if (ordered) {
6513 /* 6501 /*
@@ -6522,9 +6510,10 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6522 * whoever cleared the private bit is responsible 6510 * whoever cleared the private bit is responsible
6523 * for the finish_ordered_io 6511 * for the finish_ordered_io
6524 */ 6512 */
6525 if (TestClearPagePrivate2(page)) { 6513 if (TestClearPagePrivate2(page) &&
6526 btrfs_finish_ordered_io(page->mapping->host, 6514 btrfs_dec_test_ordered_pending(inode, &ordered, page_start,
6527 page_start, page_end); 6515 PAGE_CACHE_SIZE, 1)) {
6516 btrfs_finish_ordered_io(ordered);
6528 } 6517 }
6529 btrfs_put_ordered_extent(ordered); 6518 btrfs_put_ordered_extent(ordered);
6530 cached_state = NULL; 6519 cached_state = NULL;
@@ -6771,7 +6760,8 @@ static int btrfs_truncate(struct inode *inode)
6771 * using truncate to replace the contents of the file will 6760 * using truncate to replace the contents of the file will
6772 * end up with a zero length file after a crash. 6761 * end up with a zero length file after a crash.
6773 */ 6762 */
6774 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) 6763 if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
6764 &BTRFS_I(inode)->runtime_flags))
6775 btrfs_add_ordered_operation(trans, root, inode); 6765 btrfs_add_ordered_operation(trans, root, inode);
6776 6766
6777 while (1) { 6767 while (1) {
@@ -6894,7 +6884,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6894 ei->root = NULL; 6884 ei->root = NULL;
6895 ei->space_info = NULL; 6885 ei->space_info = NULL;
6896 ei->generation = 0; 6886 ei->generation = 0;
6897 ei->sequence = 0;
6898 ei->last_trans = 0; 6887 ei->last_trans = 0;
6899 ei->last_sub_trans = 0; 6888 ei->last_sub_trans = 0;
6900 ei->logged_trans = 0; 6889 ei->logged_trans = 0;
@@ -6909,11 +6898,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6909 ei->outstanding_extents = 0; 6898 ei->outstanding_extents = 0;
6910 ei->reserved_extents = 0; 6899 ei->reserved_extents = 0;
6911 6900
6912 ei->ordered_data_close = 0; 6901 ei->runtime_flags = 0;
6913 ei->orphan_meta_reserved = 0;
6914 ei->dummy_inode = 0;
6915 ei->in_defrag = 0;
6916 ei->delalloc_meta_reserved = 0;
6917 ei->force_compress = BTRFS_COMPRESS_NONE; 6902 ei->force_compress = BTRFS_COMPRESS_NONE;
6918 6903
6919 ei->delayed_node = NULL; 6904 ei->delayed_node = NULL;
@@ -6927,7 +6912,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6927 mutex_init(&ei->log_mutex); 6912 mutex_init(&ei->log_mutex);
6928 mutex_init(&ei->delalloc_mutex); 6913 mutex_init(&ei->delalloc_mutex);
6929 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6914 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
6930 INIT_LIST_HEAD(&ei->i_orphan);
6931 INIT_LIST_HEAD(&ei->delalloc_inodes); 6915 INIT_LIST_HEAD(&ei->delalloc_inodes);
6932 INIT_LIST_HEAD(&ei->ordered_operations); 6916 INIT_LIST_HEAD(&ei->ordered_operations);
6933 RB_CLEAR_NODE(&ei->rb_node); 6917 RB_CLEAR_NODE(&ei->rb_node);
@@ -6972,13 +6956,12 @@ void btrfs_destroy_inode(struct inode *inode)
6972 spin_unlock(&root->fs_info->ordered_extent_lock); 6956 spin_unlock(&root->fs_info->ordered_extent_lock);
6973 } 6957 }
6974 6958
6975 spin_lock(&root->orphan_lock); 6959 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
6976 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6960 &BTRFS_I(inode)->runtime_flags)) {
6977 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", 6961 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
6978 (unsigned long long)btrfs_ino(inode)); 6962 (unsigned long long)btrfs_ino(inode));
6979 list_del_init(&BTRFS_I(inode)->i_orphan); 6963 atomic_dec(&root->orphan_inodes);
6980 } 6964 }
6981 spin_unlock(&root->orphan_lock);
6982 6965
6983 while (1) { 6966 while (1) {
6984 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 6967 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -7193,6 +7176,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7193 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) 7176 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
7194 btrfs_add_ordered_operation(trans, root, old_inode); 7177 btrfs_add_ordered_operation(trans, root, old_inode);
7195 7178
7179 inode_inc_iversion(old_dir);
7180 inode_inc_iversion(new_dir);
7181 inode_inc_iversion(old_inode);
7196 old_dir->i_ctime = old_dir->i_mtime = ctime; 7182 old_dir->i_ctime = old_dir->i_mtime = ctime;
7197 new_dir->i_ctime = new_dir->i_mtime = ctime; 7183 new_dir->i_ctime = new_dir->i_mtime = ctime;
7198 old_inode->i_ctime = ctime; 7184 old_inode->i_ctime = ctime;
@@ -7219,6 +7205,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7219 } 7205 }
7220 7206
7221 if (new_inode) { 7207 if (new_inode) {
7208 inode_inc_iversion(new_inode);
7222 new_inode->i_ctime = CURRENT_TIME; 7209 new_inode->i_ctime = CURRENT_TIME;
7223 if (unlikely(btrfs_ino(new_inode) == 7210 if (unlikely(btrfs_ino(new_inode) ==
7224 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 7211 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@ -7490,6 +7477,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7490 cur_offset += ins.offset; 7477 cur_offset += ins.offset;
7491 *alloc_hint = ins.objectid + ins.offset; 7478 *alloc_hint = ins.objectid + ins.offset;
7492 7479
7480 inode_inc_iversion(inode);
7493 inode->i_ctime = CURRENT_TIME; 7481 inode->i_ctime = CURRENT_TIME;
7494 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 7482 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
7495 if (!(mode & FALLOC_FL_KEEP_SIZE) && 7483 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
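
For reference, the finish_ordered_fn queued above is only a trampoline into process context. A minimal sketch, assuming the struct btrfs_work member this series embeds in btrfs_ordered_extent (see the ordered-data.h hunk below), and not the verbatim patch text:

	static void finish_ordered_fn(struct btrfs_work *work)
	{
		struct btrfs_ordered_extent *ordered_extent;

		/* work is embedded in the ordered extent; recover the container */
		ordered_extent = container_of(work, struct btrfs_ordered_extent,
					      work);
		btrfs_finish_ordered_io(ordered_extent);
	}

This keeps btrfs_join_transaction() and friends out of the bio end_io path entirely.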
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 14f8e1faa46e..24b776c08d99 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -261,6 +261,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
261 } 261 }
262 262
263 btrfs_update_iflags(inode); 263 btrfs_update_iflags(inode);
264 inode_inc_iversion(inode);
264 inode->i_ctime = CURRENT_TIME; 265 inode->i_ctime = CURRENT_TIME;
265 ret = btrfs_update_inode(trans, root, inode); 266 ret = btrfs_update_inode(trans, root, inode);
266 267
@@ -367,7 +368,7 @@ static noinline int create_subvol(struct btrfs_root *root,
367 return PTR_ERR(trans); 368 return PTR_ERR(trans);
368 369
369 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 370 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
370 0, objectid, NULL, 0, 0, 0, 0); 371 0, objectid, NULL, 0, 0, 0);
371 if (IS_ERR(leaf)) { 372 if (IS_ERR(leaf)) {
372 ret = PTR_ERR(leaf); 373 ret = PTR_ERR(leaf);
373 goto fail; 374 goto fail;
@@ -2262,10 +2263,12 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2262 di_args->bytes_used = dev->bytes_used; 2263 di_args->bytes_used = dev->bytes_used;
2263 di_args->total_bytes = dev->total_bytes; 2264 di_args->total_bytes = dev->total_bytes;
2264 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); 2265 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
2265 if (dev->name) 2266 if (dev->name) {
2266 strncpy(di_args->path, dev->name, sizeof(di_args->path)); 2267 strncpy(di_args->path, dev->name, sizeof(di_args->path));
2267 else 2268 di_args->path[sizeof(di_args->path) - 1] = 0;
2269 } else {
2268 di_args->path[0] = '\0'; 2270 di_args->path[0] = '\0';
2271 }
2269 2272
2270out: 2273out:
2271 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) 2274 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
@@ -2622,6 +2625,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2622 btrfs_mark_buffer_dirty(leaf); 2625 btrfs_mark_buffer_dirty(leaf);
2623 btrfs_release_path(path); 2626 btrfs_release_path(path);
2624 2627
2628 inode_inc_iversion(inode);
2625 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2629 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2626 2630
2627 /* 2631 /*
@@ -2914,7 +2918,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2914 up_read(&info->groups_sem); 2918 up_read(&info->groups_sem);
2915 } 2919 }
2916 2920
2917 user_dest = (struct btrfs_ioctl_space_info *) 2921 user_dest = (struct btrfs_ioctl_space_info __user *)
2918 (arg + sizeof(struct btrfs_ioctl_space_args)); 2922 (arg + sizeof(struct btrfs_ioctl_space_args));
2919 2923
2920 if (copy_to_user(user_dest, dest_orig, alloc_size)) 2924 if (copy_to_user(user_dest, dest_orig, alloc_size))
@@ -3042,6 +3046,28 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
3042 return ret; 3046 return ret;
3043} 3047}
3044 3048
3049static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3050 void __user *arg, int reset_after_read)
3051{
3052 struct btrfs_ioctl_get_dev_stats *sa;
3053 int ret;
3054
3055 if (reset_after_read && !capable(CAP_SYS_ADMIN))
3056 return -EPERM;
3057
3058 sa = memdup_user(arg, sizeof(*sa));
3059 if (IS_ERR(sa))
3060 return PTR_ERR(sa);
3061
3062 ret = btrfs_get_dev_stats(root, sa, reset_after_read);
3063
3064 if (copy_to_user(arg, sa, sizeof(*sa)))
3065 ret = -EFAULT;
3066
3067 kfree(sa);
3068 return ret;
3069}
3070
3045static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 3071static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3046{ 3072{
3047 int ret = 0; 3073 int ret = 0;
@@ -3212,8 +3238,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
3212 } 3238 }
3213} 3239}
3214 3240
3215static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) 3241static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3216{ 3242{
3243 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3217 struct btrfs_fs_info *fs_info = root->fs_info; 3244 struct btrfs_fs_info *fs_info = root->fs_info;
3218 struct btrfs_ioctl_balance_args *bargs; 3245 struct btrfs_ioctl_balance_args *bargs;
3219 struct btrfs_balance_control *bctl; 3246 struct btrfs_balance_control *bctl;
@@ -3225,6 +3252,10 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
3225 if (fs_info->sb->s_flags & MS_RDONLY) 3252 if (fs_info->sb->s_flags & MS_RDONLY)
3226 return -EROFS; 3253 return -EROFS;
3227 3254
3255 ret = mnt_want_write(file->f_path.mnt);
3256 if (ret)
3257 return ret;
3258
3228 mutex_lock(&fs_info->volume_mutex); 3259 mutex_lock(&fs_info->volume_mutex);
3229 mutex_lock(&fs_info->balance_mutex); 3260 mutex_lock(&fs_info->balance_mutex);
3230 3261
@@ -3291,6 +3322,7 @@ out_bargs:
3291out: 3322out:
3292 mutex_unlock(&fs_info->balance_mutex); 3323 mutex_unlock(&fs_info->balance_mutex);
3293 mutex_unlock(&fs_info->volume_mutex); 3324 mutex_unlock(&fs_info->volume_mutex);
3325 mnt_drop_write(file->f_path.mnt);
3294 return ret; 3326 return ret;
3295} 3327}
3296 3328
@@ -3386,7 +3418,7 @@ long btrfs_ioctl(struct file *file, unsigned int
3386 case BTRFS_IOC_DEV_INFO: 3418 case BTRFS_IOC_DEV_INFO:
3387 return btrfs_ioctl_dev_info(root, argp); 3419 return btrfs_ioctl_dev_info(root, argp);
3388 case BTRFS_IOC_BALANCE: 3420 case BTRFS_IOC_BALANCE:
3389 return btrfs_ioctl_balance(root, NULL); 3421 return btrfs_ioctl_balance(file, NULL);
3390 case BTRFS_IOC_CLONE: 3422 case BTRFS_IOC_CLONE:
3391 return btrfs_ioctl_clone(file, arg, 0, 0, 0); 3423 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
3392 case BTRFS_IOC_CLONE_RANGE: 3424 case BTRFS_IOC_CLONE_RANGE:
@@ -3419,11 +3451,15 @@ long btrfs_ioctl(struct file *file, unsigned int
3419 case BTRFS_IOC_SCRUB_PROGRESS: 3451 case BTRFS_IOC_SCRUB_PROGRESS:
3420 return btrfs_ioctl_scrub_progress(root, argp); 3452 return btrfs_ioctl_scrub_progress(root, argp);
3421 case BTRFS_IOC_BALANCE_V2: 3453 case BTRFS_IOC_BALANCE_V2:
3422 return btrfs_ioctl_balance(root, argp); 3454 return btrfs_ioctl_balance(file, argp);
3423 case BTRFS_IOC_BALANCE_CTL: 3455 case BTRFS_IOC_BALANCE_CTL:
3424 return btrfs_ioctl_balance_ctl(root, arg); 3456 return btrfs_ioctl_balance_ctl(root, arg);
3425 case BTRFS_IOC_BALANCE_PROGRESS: 3457 case BTRFS_IOC_BALANCE_PROGRESS:
3426 return btrfs_ioctl_balance_progress(root, argp); 3458 return btrfs_ioctl_balance_progress(root, argp);
3459 case BTRFS_IOC_GET_DEV_STATS:
3460 return btrfs_ioctl_get_dev_stats(root, argp, 0);
3461 case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
3462 return btrfs_ioctl_get_dev_stats(root, argp, 1);
3427 } 3463 }
3428 3464
3429 return -ENOTTY; 3465 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 086e6bdae1c4..497c530724cf 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -266,6 +266,35 @@ struct btrfs_ioctl_logical_ino_args {
266 __u64 inodes; 266 __u64 inodes;
267}; 267};
268 268
269enum btrfs_dev_stat_values {
270 /* disk I/O failure stats */
271 BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */
272 BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */
273 BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */
274
275 /* stats for indirect indications for I/O failures */
276 BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or
277 * contents are illegal: this is an
278 * indication that the block was damaged
279 * during read or write, or written to
280 * wrong location or read from wrong
281 * location */
282 BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not
283 * been written */
284
285 BTRFS_DEV_STAT_VALUES_MAX
286};
287
288struct btrfs_ioctl_get_dev_stats {
289 __u64 devid; /* in */
290 __u64 nr_items; /* in/out */
291
292 /* out values: */
293 __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
294
295 __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
296};
297
269#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 298#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
270 struct btrfs_ioctl_vol_args) 299 struct btrfs_ioctl_vol_args)
271#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 300#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -330,5 +359,9 @@ struct btrfs_ioctl_logical_ino_args {
330 struct btrfs_ioctl_ino_path_args) 359 struct btrfs_ioctl_ino_path_args)
331#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ 360#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
332 struct btrfs_ioctl_ino_path_args) 361 struct btrfs_ioctl_ino_path_args)
362#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
363 struct btrfs_ioctl_get_dev_stats)
364#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
365 struct btrfs_ioctl_get_dev_stats)
333 366
334#endif 367#endif
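
A hypothetical userspace caller of the new ioctl is sketched below. The struct layout and ioctl number are copied from this header; BTRFS_IOCTL_MAGIC (assumed to be 0x94, as defined earlier in the header), the mount point, and the devid are illustration-only assumptions:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/types.h>

	#define BTRFS_IOCTL_MAGIC 0x94		/* assumed, from earlier in ioctl.h */
	#define BTRFS_DEV_STAT_VALUES_MAX 5

	struct btrfs_ioctl_get_dev_stats {
		__u64 devid;				/* in */
		__u64 nr_items;				/* in/out */
		__u64 values[BTRFS_DEV_STAT_VALUES_MAX];
		__u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX];
	};

	#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
						struct btrfs_ioctl_get_dev_stats)

	int main(void)
	{
		struct btrfs_ioctl_get_dev_stats sa;
		int fd, i;

		fd = open("/mnt", O_RDONLY);	/* any path on the filesystem */
		if (fd < 0)
			return 1;

		memset(&sa, 0, sizeof(sa));
		sa.devid = 1;			/* example device id */
		sa.nr_items = BTRFS_DEV_STAT_VALUES_MAX;

		if (ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &sa) == 0)
			for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
				printf("stat[%d] = %llu\n", i,
				       (unsigned long long)sa.values[i]);

		close(fd);
		return 0;
	}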
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index bbf6d0d9aebe..9e138cdc36c5 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -196,7 +196,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
196 entry->len = len; 196 entry->len = len;
197 entry->disk_len = disk_len; 197 entry->disk_len = disk_len;
198 entry->bytes_left = len; 198 entry->bytes_left = len;
199 entry->inode = inode; 199 entry->inode = igrab(inode);
200 entry->compress_type = compress_type; 200 entry->compress_type = compress_type;
201 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 201 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
202 set_bit(type, &entry->flags); 202 set_bit(type, &entry->flags);
@@ -212,12 +212,12 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
212 212
213 trace_btrfs_ordered_extent_add(inode, entry); 213 trace_btrfs_ordered_extent_add(inode, entry);
214 214
215 spin_lock(&tree->lock); 215 spin_lock_irq(&tree->lock);
216 node = tree_insert(&tree->tree, file_offset, 216 node = tree_insert(&tree->tree, file_offset,
217 &entry->rb_node); 217 &entry->rb_node);
218 if (node) 218 if (node)
219 ordered_data_tree_panic(inode, -EEXIST, file_offset); 219 ordered_data_tree_panic(inode, -EEXIST, file_offset);
220 spin_unlock(&tree->lock); 220 spin_unlock_irq(&tree->lock);
221 221
222 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 222 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
223 list_add_tail(&entry->root_extent_list, 223 list_add_tail(&entry->root_extent_list,
@@ -264,9 +264,9 @@ void btrfs_add_ordered_sum(struct inode *inode,
264 struct btrfs_ordered_inode_tree *tree; 264 struct btrfs_ordered_inode_tree *tree;
265 265
266 tree = &BTRFS_I(inode)->ordered_tree; 266 tree = &BTRFS_I(inode)->ordered_tree;
267 spin_lock(&tree->lock); 267 spin_lock_irq(&tree->lock);
268 list_add_tail(&sum->list, &entry->list); 268 list_add_tail(&sum->list, &entry->list);
269 spin_unlock(&tree->lock); 269 spin_unlock_irq(&tree->lock);
270} 270}
271 271
272/* 272/*
@@ -283,18 +283,19 @@ void btrfs_add_ordered_sum(struct inode *inode,
283 */ 283 */
284int btrfs_dec_test_first_ordered_pending(struct inode *inode, 284int btrfs_dec_test_first_ordered_pending(struct inode *inode,
285 struct btrfs_ordered_extent **cached, 285 struct btrfs_ordered_extent **cached,
286 u64 *file_offset, u64 io_size) 286 u64 *file_offset, u64 io_size, int uptodate)
287{ 287{
288 struct btrfs_ordered_inode_tree *tree; 288 struct btrfs_ordered_inode_tree *tree;
289 struct rb_node *node; 289 struct rb_node *node;
290 struct btrfs_ordered_extent *entry = NULL; 290 struct btrfs_ordered_extent *entry = NULL;
291 int ret; 291 int ret;
292 unsigned long flags;
292 u64 dec_end; 293 u64 dec_end;
293 u64 dec_start; 294 u64 dec_start;
294 u64 to_dec; 295 u64 to_dec;
295 296
296 tree = &BTRFS_I(inode)->ordered_tree; 297 tree = &BTRFS_I(inode)->ordered_tree;
297 spin_lock(&tree->lock); 298 spin_lock_irqsave(&tree->lock, flags);
298 node = tree_search(tree, *file_offset); 299 node = tree_search(tree, *file_offset);
299 if (!node) { 300 if (!node) {
300 ret = 1; 301 ret = 1;
@@ -323,6 +324,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
323 (unsigned long long)to_dec); 324 (unsigned long long)to_dec);
324 } 325 }
325 entry->bytes_left -= to_dec; 326 entry->bytes_left -= to_dec;
327 if (!uptodate)
328 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
329
326 if (entry->bytes_left == 0) 330 if (entry->bytes_left == 0)
327 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 331 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
328 else 332 else
@@ -332,7 +336,7 @@ out:
332 *cached = entry; 336 *cached = entry;
333 atomic_inc(&entry->refs); 337 atomic_inc(&entry->refs);
334 } 338 }
335 spin_unlock(&tree->lock); 339 spin_unlock_irqrestore(&tree->lock, flags);
336 return ret == 0; 340 return ret == 0;
337} 341}
338 342
@@ -347,15 +351,21 @@ out:
347 */ 351 */
348int btrfs_dec_test_ordered_pending(struct inode *inode, 352int btrfs_dec_test_ordered_pending(struct inode *inode,
349 struct btrfs_ordered_extent **cached, 353 struct btrfs_ordered_extent **cached,
350 u64 file_offset, u64 io_size) 354 u64 file_offset, u64 io_size, int uptodate)
351{ 355{
352 struct btrfs_ordered_inode_tree *tree; 356 struct btrfs_ordered_inode_tree *tree;
353 struct rb_node *node; 357 struct rb_node *node;
354 struct btrfs_ordered_extent *entry = NULL; 358 struct btrfs_ordered_extent *entry = NULL;
359 unsigned long flags;
355 int ret; 360 int ret;
356 361
357 tree = &BTRFS_I(inode)->ordered_tree; 362 tree = &BTRFS_I(inode)->ordered_tree;
358 spin_lock(&tree->lock); 363 spin_lock_irqsave(&tree->lock, flags);
364 if (cached && *cached) {
365 entry = *cached;
366 goto have_entry;
367 }
368
359 node = tree_search(tree, file_offset); 369 node = tree_search(tree, file_offset);
360 if (!node) { 370 if (!node) {
361 ret = 1; 371 ret = 1;
@@ -363,6 +373,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
363 } 373 }
364 374
365 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 375 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
376have_entry:
366 if (!offset_in_entry(entry, file_offset)) { 377 if (!offset_in_entry(entry, file_offset)) {
367 ret = 1; 378 ret = 1;
368 goto out; 379 goto out;
@@ -374,6 +385,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
374 (unsigned long long)io_size); 385 (unsigned long long)io_size);
375 } 386 }
376 entry->bytes_left -= io_size; 387 entry->bytes_left -= io_size;
388 if (!uptodate)
389 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
390
377 if (entry->bytes_left == 0) 391 if (entry->bytes_left == 0)
378 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 392 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
379 else 393 else
@@ -383,7 +397,7 @@ out:
383 *cached = entry; 397 *cached = entry;
384 atomic_inc(&entry->refs); 398 atomic_inc(&entry->refs);
385 } 399 }
386 spin_unlock(&tree->lock); 400 spin_unlock_irqrestore(&tree->lock, flags);
387 return ret == 0; 401 return ret == 0;
388} 402}
389 403
@@ -399,6 +413,8 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
399 trace_btrfs_ordered_extent_put(entry->inode, entry); 413 trace_btrfs_ordered_extent_put(entry->inode, entry);
400 414
401 if (atomic_dec_and_test(&entry->refs)) { 415 if (atomic_dec_and_test(&entry->refs)) {
416 if (entry->inode)
417 btrfs_add_delayed_iput(entry->inode);
402 while (!list_empty(&entry->list)) { 418 while (!list_empty(&entry->list)) {
403 cur = entry->list.next; 419 cur = entry->list.next;
404 sum = list_entry(cur, struct btrfs_ordered_sum, list); 420 sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -411,21 +427,22 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
411 427
412/* 428/*
413 * remove an ordered extent from the tree. No references are dropped 429 * remove an ordered extent from the tree. No references are dropped
414 * and you must wake_up entry->wait. You must hold the tree lock 430 * and waiters are woken up.
415 * while you call this function.
416 */ 431 */
417static void __btrfs_remove_ordered_extent(struct inode *inode, 432void btrfs_remove_ordered_extent(struct inode *inode,
418 struct btrfs_ordered_extent *entry) 433 struct btrfs_ordered_extent *entry)
419{ 434{
420 struct btrfs_ordered_inode_tree *tree; 435 struct btrfs_ordered_inode_tree *tree;
421 struct btrfs_root *root = BTRFS_I(inode)->root; 436 struct btrfs_root *root = BTRFS_I(inode)->root;
422 struct rb_node *node; 437 struct rb_node *node;
423 438
424 tree = &BTRFS_I(inode)->ordered_tree; 439 tree = &BTRFS_I(inode)->ordered_tree;
440 spin_lock_irq(&tree->lock);
425 node = &entry->rb_node; 441 node = &entry->rb_node;
426 rb_erase(node, &tree->tree); 442 rb_erase(node, &tree->tree);
427 tree->last = NULL; 443 tree->last = NULL;
428 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 444 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
445 spin_unlock_irq(&tree->lock);
429 446
430 spin_lock(&root->fs_info->ordered_extent_lock); 447 spin_lock(&root->fs_info->ordered_extent_lock);
431 list_del_init(&entry->root_extent_list); 448 list_del_init(&entry->root_extent_list);
@@ -442,21 +459,6 @@ static void __btrfs_remove_ordered_extent(struct inode *inode,
442 list_del_init(&BTRFS_I(inode)->ordered_operations); 459 list_del_init(&BTRFS_I(inode)->ordered_operations);
443 } 460 }
444 spin_unlock(&root->fs_info->ordered_extent_lock); 461 spin_unlock(&root->fs_info->ordered_extent_lock);
445}
446
447/*
448 * remove an ordered extent from the tree. No references are dropped
449 * but any waiters are woken.
450 */
451void btrfs_remove_ordered_extent(struct inode *inode,
452 struct btrfs_ordered_extent *entry)
453{
454 struct btrfs_ordered_inode_tree *tree;
455
456 tree = &BTRFS_I(inode)->ordered_tree;
457 spin_lock(&tree->lock);
458 __btrfs_remove_ordered_extent(inode, entry);
459 spin_unlock(&tree->lock);
460 wake_up(&entry->wait); 462 wake_up(&entry->wait);
461} 463}
462 464
@@ -621,19 +623,11 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
621 if (orig_end > INT_LIMIT(loff_t)) 623 if (orig_end > INT_LIMIT(loff_t))
622 orig_end = INT_LIMIT(loff_t); 624 orig_end = INT_LIMIT(loff_t);
623 } 625 }
624again: 626
625 /* start IO across the range first to instantiate any delalloc 627 /* start IO across the range first to instantiate any delalloc
626 * extents 628 * extents
627 */ 629 */
628 filemap_fdatawrite_range(inode->i_mapping, start, orig_end); 630 filemap_write_and_wait_range(inode->i_mapping, start, orig_end);
629
630 /* The compression code will leave pages locked but return from
631 * writepage without setting the page writeback. Starting again
632 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
633 */
634 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
635
636 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
637 631
638 end = orig_end; 632 end = orig_end;
639 found = 0; 633 found = 0;
@@ -657,11 +651,6 @@ again:
657 break; 651 break;
658 end--; 652 end--;
659 } 653 }
660 if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
661 EXTENT_DELALLOC, 0, NULL)) {
662 schedule_timeout(1);
663 goto again;
664 }
665} 654}
666 655
667/* 656/*
@@ -676,7 +665,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
676 struct btrfs_ordered_extent *entry = NULL; 665 struct btrfs_ordered_extent *entry = NULL;
677 666
678 tree = &BTRFS_I(inode)->ordered_tree; 667 tree = &BTRFS_I(inode)->ordered_tree;
679 spin_lock(&tree->lock); 668 spin_lock_irq(&tree->lock);
680 node = tree_search(tree, file_offset); 669 node = tree_search(tree, file_offset);
681 if (!node) 670 if (!node)
682 goto out; 671 goto out;
@@ -687,7 +676,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
687 if (entry) 676 if (entry)
688 atomic_inc(&entry->refs); 677 atomic_inc(&entry->refs);
689out: 678out:
690 spin_unlock(&tree->lock); 679 spin_unlock_irq(&tree->lock);
691 return entry; 680 return entry;
692} 681}
693 682
@@ -703,7 +692,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
703 struct btrfs_ordered_extent *entry = NULL; 692 struct btrfs_ordered_extent *entry = NULL;
704 693
705 tree = &BTRFS_I(inode)->ordered_tree; 694 tree = &BTRFS_I(inode)->ordered_tree;
706 spin_lock(&tree->lock); 695 spin_lock_irq(&tree->lock);
707 node = tree_search(tree, file_offset); 696 node = tree_search(tree, file_offset);
708 if (!node) { 697 if (!node) {
709 node = tree_search(tree, file_offset + len); 698 node = tree_search(tree, file_offset + len);
@@ -728,7 +717,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
728out: 717out:
729 if (entry) 718 if (entry)
730 atomic_inc(&entry->refs); 719 atomic_inc(&entry->refs);
731 spin_unlock(&tree->lock); 720 spin_unlock_irq(&tree->lock);
732 return entry; 721 return entry;
733} 722}
734 723
@@ -744,7 +733,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
744 struct btrfs_ordered_extent *entry = NULL; 733 struct btrfs_ordered_extent *entry = NULL;
745 734
746 tree = &BTRFS_I(inode)->ordered_tree; 735 tree = &BTRFS_I(inode)->ordered_tree;
747 spin_lock(&tree->lock); 736 spin_lock_irq(&tree->lock);
748 node = tree_search(tree, file_offset); 737 node = tree_search(tree, file_offset);
749 if (!node) 738 if (!node)
750 goto out; 739 goto out;
@@ -752,7 +741,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
752 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 741 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
753 atomic_inc(&entry->refs); 742 atomic_inc(&entry->refs);
754out: 743out:
755 spin_unlock(&tree->lock); 744 spin_unlock_irq(&tree->lock);
756 return entry; 745 return entry;
757} 746}
758 747
@@ -764,7 +753,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
764 struct btrfs_ordered_extent *ordered) 753 struct btrfs_ordered_extent *ordered)
765{ 754{
766 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 755 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
767 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
768 u64 disk_i_size; 756 u64 disk_i_size;
769 u64 new_i_size; 757 u64 new_i_size;
770 u64 i_size_test; 758 u64 i_size_test;
@@ -779,7 +767,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
779 else 767 else
780 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); 768 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
781 769
782 spin_lock(&tree->lock); 770 spin_lock_irq(&tree->lock);
783 disk_i_size = BTRFS_I(inode)->disk_i_size; 771 disk_i_size = BTRFS_I(inode)->disk_i_size;
784 772
785 /* truncate file */ 773 /* truncate file */
@@ -798,14 +786,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
798 } 786 }
799 787
800 /* 788 /*
801 * we can't update the disk_isize if there are delalloc bytes
802 * between disk_i_size and this ordered extent
803 */
804 if (test_range_bit(io_tree, disk_i_size, offset - 1,
805 EXTENT_DELALLOC, 0, NULL)) {
806 goto out;
807 }
808 /*
809 * walk backward from this ordered extent to disk_i_size. 789 * walk backward from this ordered extent to disk_i_size.
810 * if we find an ordered extent then we can't update disk i_size 790 * if we find an ordered extent then we can't update disk i_size
811 * yet 791 * yet
@@ -825,15 +805,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
825 } 805 }
826 node = prev; 806 node = prev;
827 } 807 }
828 while (node) { 808 for (; node; node = rb_prev(node)) {
829 test = rb_entry(node, struct btrfs_ordered_extent, rb_node); 809 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
810
811 /* We treat this entry as if it doesn't exist */
812 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
813 continue;
830 if (test->file_offset + test->len <= disk_i_size) 814 if (test->file_offset + test->len <= disk_i_size)
831 break; 815 break;
832 if (test->file_offset >= i_size) 816 if (test->file_offset >= i_size)
833 break; 817 break;
834 if (test->file_offset >= disk_i_size) 818 if (test->file_offset >= disk_i_size)
835 goto out; 819 goto out;
836 node = rb_prev(node);
837 } 820 }
838 new_i_size = min_t(u64, offset, i_size); 821 new_i_size = min_t(u64, offset, i_size);
839 822
@@ -851,43 +834,49 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
851 else 834 else
852 node = rb_first(&tree->tree); 835 node = rb_first(&tree->tree);
853 } 836 }
854 i_size_test = 0; 837
855 if (node) { 838 /*
856 /* 839 * We are looking for an area between our current extent and the next
857 * do we have an area where IO might have finished 840 * ordered extent to update the i_size to. There are 3 cases here
858 * between our ordered extent and the next one. 841 *
859 */ 842 * 1) We don't actually have anything and we can update to i_size.
843 * 2) We have stuff but they already did their i_size update so again we
844 * can just update to i_size.
845 * 3) We have an outstanding ordered extent so the most we can update
846 * our disk_i_size to is the start of the next offset.
847 */
848 i_size_test = i_size;
849 for (; node; node = rb_next(node)) {
860 test = rb_entry(node, struct btrfs_ordered_extent, rb_node); 850 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
861 if (test->file_offset > offset) 851
852 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
853 continue;
854 if (test->file_offset > offset) {
862 i_size_test = test->file_offset; 855 i_size_test = test->file_offset;
863 } else { 856 break;
864 i_size_test = i_size; 857 }
865 } 858 }
866 859
867 /* 860 /*
868 * i_size_test is the end of a region after this ordered 861 * i_size_test is the end of a region after this ordered
869 * extent where there are no ordered extents. As long as there 862 * extent where there are no ordered extents, we can safely set
870 * are no delalloc bytes in this area, it is safe to update 863 * disk_i_size to this.
871 * disk_i_size to the end of the region.
872 */ 864 */
873 if (i_size_test > offset && 865 if (i_size_test > offset)
874 !test_range_bit(io_tree, offset, i_size_test - 1,
875 EXTENT_DELALLOC, 0, NULL)) {
876 new_i_size = min_t(u64, i_size_test, i_size); 866 new_i_size = min_t(u64, i_size_test, i_size);
877 }
878 BTRFS_I(inode)->disk_i_size = new_i_size; 867 BTRFS_I(inode)->disk_i_size = new_i_size;
879 ret = 0; 868 ret = 0;
880out: 869out:
881 /* 870 /*
882 * we need to remove the ordered extent with the tree lock held 871 * We need to do this because we can't remove ordered extents until
883 * so that other people calling this function don't find our fully 872 * after the disk_i_size has been updated and then the inode has been
884 * processed ordered entry and skip updating the i_size 873 * updated to reflect the change, so we need to tell anybody who finds
874 * this ordered extent that we've already done all the real work; we
875 * just haven't completed all the other work.
885 */ 876 */
886 if (ordered) 877 if (ordered)
887 __btrfs_remove_ordered_extent(inode, ordered); 878 set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
888 spin_unlock(&tree->lock); 879 spin_unlock_irq(&tree->lock);
889 if (ordered)
890 wake_up(&ordered->wait);
891 return ret; 880 return ret;
892} 881}
893 882
@@ -912,7 +901,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
912 if (!ordered) 901 if (!ordered)
913 return 1; 902 return 1;
914 903
915 spin_lock(&tree->lock); 904 spin_lock_irq(&tree->lock);
916 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { 905 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
917 if (disk_bytenr >= ordered_sum->bytenr) { 906 if (disk_bytenr >= ordered_sum->bytenr) {
918 num_sectors = ordered_sum->len / sectorsize; 907 num_sectors = ordered_sum->len / sectorsize;
@@ -927,7 +916,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
927 } 916 }
928 } 917 }
929out: 918out:
930 spin_unlock(&tree->lock); 919 spin_unlock_irq(&tree->lock);
931 btrfs_put_ordered_extent(ordered); 920 btrfs_put_ordered_extent(ordered);
932 return ret; 921 return ret;
933} 922}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c355ad4dc1a6..e03c560d2997 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -74,6 +74,12 @@ struct btrfs_ordered_sum {
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ 75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76 76
77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
78
79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
80 * has done its due diligence in updating
81 * the isize. */
82
77struct btrfs_ordered_extent { 83struct btrfs_ordered_extent {
78 /* logical offset in the file */ 84 /* logical offset in the file */
79 u64 file_offset; 85 u64 file_offset;
@@ -113,6 +119,8 @@ struct btrfs_ordered_extent {
113 119
114 /* a per root list of all the pending ordered extents */ 120 /* a per root list of all the pending ordered extents */
115 struct list_head root_extent_list; 121 struct list_head root_extent_list;
122
123 struct btrfs_work work;
116}; 124};
117 125
118 126
@@ -143,10 +151,11 @@ void btrfs_remove_ordered_extent(struct inode *inode,
143 struct btrfs_ordered_extent *entry); 151 struct btrfs_ordered_extent *entry);
144int btrfs_dec_test_ordered_pending(struct inode *inode, 152int btrfs_dec_test_ordered_pending(struct inode *inode,
145 struct btrfs_ordered_extent **cached, 153 struct btrfs_ordered_extent **cached,
146 u64 file_offset, u64 io_size); 154 u64 file_offset, u64 io_size, int uptodate);
147int btrfs_dec_test_first_ordered_pending(struct inode *inode, 155int btrfs_dec_test_first_ordered_pending(struct inode *inode,
148 struct btrfs_ordered_extent **cached, 156 struct btrfs_ordered_extent **cached,
149 u64 *file_offset, u64 io_size); 157 u64 *file_offset, u64 io_size,
158 int uptodate);
150int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 159int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
151 u64 start, u64 len, u64 disk_len, int type); 160 u64 start, u64 len, u64 disk_len, int type);
152int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 161int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
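
With the new uptodate argument, write completion paths are expected to forward the bio status so that a failed write latches BTRFS_ORDERED_IOERR before the ordered extent is finished. A sketch of such a caller (assumed shape, for illustration; the real callers live in inode.c):

	static void example_write_end_io(struct inode *inode, u64 start,
					 u64 len, int uptodate)
	{
		struct btrfs_ordered_extent *ordered = NULL;

		if (btrfs_dec_test_ordered_pending(inode, &ordered, start,
						   len, uptodate))
			btrfs_finish_ordered_io(ordered);
	}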
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f38e452486b8..5e23684887eb 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -294,6 +294,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
294 btrfs_dev_extent_chunk_offset(l, dev_extent), 294 btrfs_dev_extent_chunk_offset(l, dev_extent),
295 (unsigned long long) 295 (unsigned long long)
296 btrfs_dev_extent_length(l, dev_extent)); 296 btrfs_dev_extent_length(l, dev_extent));
297 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n");
299 break;
297 }; 300 };
298 } 301 }
299} 302}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index ac5d01085884..48a4882d8ad5 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -718,13 +718,18 @@ static void reada_start_machine_worker(struct btrfs_work *work)
718{ 718{
719 struct reada_machine_work *rmw; 719 struct reada_machine_work *rmw;
720 struct btrfs_fs_info *fs_info; 720 struct btrfs_fs_info *fs_info;
721 int old_ioprio;
721 722
722 rmw = container_of(work, struct reada_machine_work, work); 723 rmw = container_of(work, struct reada_machine_work, work);
723 fs_info = rmw->fs_info; 724 fs_info = rmw->fs_info;
724 725
725 kfree(rmw); 726 kfree(rmw);
726 727
728 old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
729 task_nice_ioprio(current));
730 set_task_ioprio(current, BTRFS_IOPRIO_READA);
727 __reada_start_machine(fs_info); 731 __reada_start_machine(fs_info);
732 set_task_ioprio(current, old_ioprio);
728} 733}
729 734
730static void __reada_start_machine(struct btrfs_fs_info *fs_info) 735static void __reada_start_machine(struct btrfs_fs_info *fs_info)
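
BTRFS_IOPRIO_READA is not defined in this hunk; presumably the series defines it elsewhere (ctree.h) as an idle-class priority so readahead work never competes with foreground I/O, along the lines of:

	/* assumed definition, shown for context only */
	#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))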
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2f3d6f917fb3..a38cfa4f251e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -50,7 +50,7 @@ struct scrub_dev;
50struct scrub_page { 50struct scrub_page {
51 struct scrub_block *sblock; 51 struct scrub_block *sblock;
52 struct page *page; 52 struct page *page;
53 struct block_device *bdev; 53 struct btrfs_device *dev;
54 u64 flags; /* extent flags */ 54 u64 flags; /* extent flags */
55 u64 generation; 55 u64 generation;
56 u64 logical; 56 u64 logical;
@@ -86,6 +86,7 @@ struct scrub_block {
86 unsigned int header_error:1; 86 unsigned int header_error:1;
87 unsigned int checksum_error:1; 87 unsigned int checksum_error:1;
88 unsigned int no_io_error_seen:1; 88 unsigned int no_io_error_seen:1;
89 unsigned int generation_error:1; /* also sets header_error */
89 }; 90 };
90}; 91};
91 92
@@ -675,6 +676,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
675 sdev->stat.read_errors++; 676 sdev->stat.read_errors++;
676 sdev->stat.uncorrectable_errors++; 677 sdev->stat.uncorrectable_errors++;
677 spin_unlock(&sdev->stat_lock); 678 spin_unlock(&sdev->stat_lock);
679 btrfs_dev_stat_inc_and_print(sdev->dev,
680 BTRFS_DEV_STAT_READ_ERRS);
678 goto out; 681 goto out;
679 } 682 }
680 683
@@ -686,6 +689,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
686 sdev->stat.read_errors++; 689 sdev->stat.read_errors++;
687 sdev->stat.uncorrectable_errors++; 690 sdev->stat.uncorrectable_errors++;
688 spin_unlock(&sdev->stat_lock); 691 spin_unlock(&sdev->stat_lock);
692 btrfs_dev_stat_inc_and_print(sdev->dev,
693 BTRFS_DEV_STAT_READ_ERRS);
689 goto out; 694 goto out;
690 } 695 }
691 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); 696 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
@@ -699,6 +704,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
699 sdev->stat.read_errors++; 704 sdev->stat.read_errors++;
700 sdev->stat.uncorrectable_errors++; 705 sdev->stat.uncorrectable_errors++;
701 spin_unlock(&sdev->stat_lock); 706 spin_unlock(&sdev->stat_lock);
707 btrfs_dev_stat_inc_and_print(sdev->dev,
708 BTRFS_DEV_STAT_READ_ERRS);
702 goto out; 709 goto out;
703 } 710 }
704 711
@@ -725,12 +732,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
725 spin_unlock(&sdev->stat_lock); 732 spin_unlock(&sdev->stat_lock);
726 if (__ratelimit(&_rs)) 733 if (__ratelimit(&_rs))
727 scrub_print_warning("i/o error", sblock_to_check); 734 scrub_print_warning("i/o error", sblock_to_check);
735 btrfs_dev_stat_inc_and_print(sdev->dev,
736 BTRFS_DEV_STAT_READ_ERRS);
728 } else if (sblock_bad->checksum_error) { 737 } else if (sblock_bad->checksum_error) {
729 spin_lock(&sdev->stat_lock); 738 spin_lock(&sdev->stat_lock);
730 sdev->stat.csum_errors++; 739 sdev->stat.csum_errors++;
731 spin_unlock(&sdev->stat_lock); 740 spin_unlock(&sdev->stat_lock);
732 if (__ratelimit(&_rs)) 741 if (__ratelimit(&_rs))
733 scrub_print_warning("checksum error", sblock_to_check); 742 scrub_print_warning("checksum error", sblock_to_check);
743 btrfs_dev_stat_inc_and_print(sdev->dev,
744 BTRFS_DEV_STAT_CORRUPTION_ERRS);
734 } else if (sblock_bad->header_error) { 745 } else if (sblock_bad->header_error) {
735 spin_lock(&sdev->stat_lock); 746 spin_lock(&sdev->stat_lock);
736 sdev->stat.verify_errors++; 747 sdev->stat.verify_errors++;
@@ -738,6 +749,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
738 if (__ratelimit(&_rs)) 749 if (__ratelimit(&_rs))
739 scrub_print_warning("checksum/header error", 750 scrub_print_warning("checksum/header error",
740 sblock_to_check); 751 sblock_to_check);
752 if (sblock_bad->generation_error)
753 btrfs_dev_stat_inc_and_print(sdev->dev,
754 BTRFS_DEV_STAT_GENERATION_ERRS);
755 else
756 btrfs_dev_stat_inc_and_print(sdev->dev,
757 BTRFS_DEV_STAT_CORRUPTION_ERRS);
741 } 758 }
742 759
743 if (sdev->readonly) 760 if (sdev->readonly)
@@ -998,8 +1015,8 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
998 page = sblock->pagev + page_index; 1015 page = sblock->pagev + page_index;
999 page->logical = logical; 1016 page->logical = logical;
1000 page->physical = bbio->stripes[mirror_index].physical; 1017 page->physical = bbio->stripes[mirror_index].physical;
1001 /* for missing devices, bdev is NULL */ 1018 /* for missing devices, dev->bdev is NULL */
1002 page->bdev = bbio->stripes[mirror_index].dev->bdev; 1019 page->dev = bbio->stripes[mirror_index].dev;
1003 page->mirror_num = mirror_index + 1; 1020 page->mirror_num = mirror_index + 1;
1004 page->page = alloc_page(GFP_NOFS); 1021 page->page = alloc_page(GFP_NOFS);
1005 if (!page->page) { 1022 if (!page->page) {
@@ -1043,7 +1060,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1043 struct scrub_page *page = sblock->pagev + page_num; 1060 struct scrub_page *page = sblock->pagev + page_num;
1044 DECLARE_COMPLETION_ONSTACK(complete); 1061 DECLARE_COMPLETION_ONSTACK(complete);
1045 1062
1046 if (page->bdev == NULL) { 1063 if (page->dev->bdev == NULL) {
1047 page->io_error = 1; 1064 page->io_error = 1;
1048 sblock->no_io_error_seen = 0; 1065 sblock->no_io_error_seen = 0;
1049 continue; 1066 continue;
@@ -1053,7 +1070,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1053 bio = bio_alloc(GFP_NOFS, 1); 1070 bio = bio_alloc(GFP_NOFS, 1);
1054 if (!bio) 1071 if (!bio)
1055 return -EIO; 1072 return -EIO;
1056 bio->bi_bdev = page->bdev; 1073 bio->bi_bdev = page->dev->bdev;
1057 bio->bi_sector = page->physical >> 9; 1074 bio->bi_sector = page->physical >> 9;
1058 bio->bi_end_io = scrub_complete_bio_end_io; 1075 bio->bi_end_io = scrub_complete_bio_end_io;
1059 bio->bi_private = &complete; 1076 bio->bi_private = &complete;
@@ -1102,11 +1119,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1102 h = (struct btrfs_header *)mapped_buffer; 1119 h = (struct btrfs_header *)mapped_buffer;
1103 1120
1104 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1121 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
1105 generation != le64_to_cpu(h->generation) ||
1106 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1122 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1107 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1123 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1108 BTRFS_UUID_SIZE)) 1124 BTRFS_UUID_SIZE)) {
1109 sblock->header_error = 1; 1125 sblock->header_error = 1;
1126 } else if (generation != le64_to_cpu(h->generation)) {
1127 sblock->header_error = 1;
1128 sblock->generation_error = 1;
1129 }
1110 csum = h->csum; 1130 csum = h->csum;
1111 } else { 1131 } else {
1112 if (!have_csum) 1132 if (!have_csum)
@@ -1182,7 +1202,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1182 bio = bio_alloc(GFP_NOFS, 1); 1202 bio = bio_alloc(GFP_NOFS, 1);
1183 if (!bio) 1203 if (!bio)
1184 return -EIO; 1204 return -EIO;
1185 bio->bi_bdev = page_bad->bdev; 1205 bio->bi_bdev = page_bad->dev->bdev;
1186 bio->bi_sector = page_bad->physical >> 9; 1206 bio->bi_sector = page_bad->physical >> 9;
1187 bio->bi_end_io = scrub_complete_bio_end_io; 1207 bio->bi_end_io = scrub_complete_bio_end_io;
1188 bio->bi_private = &complete; 1208 bio->bi_private = &complete;
@@ -1196,6 +1216,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1196 1216
1197 /* this will also unplug the queue */ 1217 /* this will also unplug the queue */
1198 wait_for_completion(&complete); 1218 wait_for_completion(&complete);
1219 if (!bio_flagged(bio, BIO_UPTODATE)) {
1220 btrfs_dev_stat_inc_and_print(page_bad->dev,
1221 BTRFS_DEV_STAT_WRITE_ERRS);
1222 bio_put(bio);
1223 return -EIO;
1224 }
1199 bio_put(bio); 1225 bio_put(bio);
1200 } 1226 }
1201 1227
@@ -1352,7 +1378,8 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1352 u64 mapped_size; 1378 u64 mapped_size;
1353 void *p; 1379 void *p;
1354 u32 crc = ~(u32)0; 1380 u32 crc = ~(u32)0;
1355 int fail = 0; 1381 int fail_gen = 0;
1382 int fail_cor = 0;
1356 u64 len; 1383 u64 len;
1357 int index; 1384 int index;
1358 1385
@@ -1363,13 +1390,13 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1363 memcpy(on_disk_csum, s->csum, sdev->csum_size); 1390 memcpy(on_disk_csum, s->csum, sdev->csum_size);
1364 1391
1365 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1392 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
1366 ++fail; 1393 ++fail_cor;
1367 1394
1368 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1395 if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
1369 ++fail; 1396 ++fail_gen;
1370 1397
1371 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1398 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1372 ++fail; 1399 ++fail_cor;
1373 1400
1374 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; 1401 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1375 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1402 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1394,9 +1421,9 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1394 1421
1395 btrfs_csum_final(crc, calculated_csum); 1422 btrfs_csum_final(crc, calculated_csum);
1396 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1423 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
1397 ++fail; 1424 ++fail_cor;
1398 1425
1399 if (fail) { 1426 if (fail_cor + fail_gen) {
1400 /* 1427 /*
1401 * if we find an error in a super block, we just report it. 1428 * if we find an error in a super block, we just report it.
1402 * They will get written with the next transaction commit 1429 * They will get written with the next transaction commit
@@ -1405,9 +1432,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1405 spin_lock(&sdev->stat_lock); 1432 spin_lock(&sdev->stat_lock);
1406 ++sdev->stat.super_errors; 1433 ++sdev->stat.super_errors;
1407 spin_unlock(&sdev->stat_lock); 1434 spin_unlock(&sdev->stat_lock);
1435 if (fail_cor)
1436 btrfs_dev_stat_inc_and_print(sdev->dev,
1437 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1438 else
1439 btrfs_dev_stat_inc_and_print(sdev->dev,
1440 BTRFS_DEV_STAT_GENERATION_ERRS);
1408 } 1441 }
1409 1442
1410 return fail; 1443 return fail_cor + fail_gen;
1411} 1444}
1412 1445
1413static void scrub_block_get(struct scrub_block *sblock) 1446static void scrub_block_get(struct scrub_block *sblock)
@@ -1551,7 +1584,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1551 return -ENOMEM; 1584 return -ENOMEM;
1552 } 1585 }
1553 spage->sblock = sblock; 1586 spage->sblock = sblock;
1554 spage->bdev = sdev->dev->bdev; 1587 spage->dev = sdev->dev;
1555 spage->flags = flags; 1588 spage->flags = flags;
1556 spage->generation = gen; 1589 spage->generation = gen;
1557 spage->logical = logical; 1590 spage->logical = logical;
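
btrfs_dev_stat_inc_and_print() itself is not part of this file; per the diffstat it lands in volumes.c. A plausible sketch of its shape, with assumed field names, purely for illustration:

	void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
	{
		/* dev_stat_values is an assumed per-device atomic counter array */
		atomic_inc(&dev->dev_stat_values[index]);
		printk_ratelimited(KERN_ERR
			"btrfs: bdev %s errs: stat index %d incremented\n",
			dev->name, index);
	}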
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c5f8fca4195f..96eb9fef7bd2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -188,7 +188,8 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
188 va_start(args, fmt); 188 va_start(args, fmt);
189 189
190 if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') { 190 if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') {
191 strncpy(lvl, fmt, 3); 191 memcpy(lvl, fmt, 3);
192 lvl[3] = '\0';
192 fmt += 3; 193 fmt += 3;
193 type = logtypes[fmt[1] - '0']; 194 type = logtypes[fmt[1] - '0'];
194 } else 195 } else
@@ -435,11 +436,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
435 case Opt_thread_pool: 436 case Opt_thread_pool:
436 intarg = 0; 437 intarg = 0;
437 match_int(&args[0], &intarg); 438 match_int(&args[0], &intarg);
438 if (intarg) { 439 if (intarg)
439 info->thread_pool_size = intarg; 440 info->thread_pool_size = intarg;
440 printk(KERN_INFO "btrfs: thread pool %d\n",
441 info->thread_pool_size);
442 }
443 break; 441 break;
444 case Opt_max_inline: 442 case Opt_max_inline:
445 num = match_strdup(&args[0]); 443 num = match_strdup(&args[0]);
@@ -769,7 +767,7 @@ static int btrfs_fill_super(struct super_block *sb,
769#ifdef CONFIG_BTRFS_FS_POSIX_ACL 767#ifdef CONFIG_BTRFS_FS_POSIX_ACL
770 sb->s_flags |= MS_POSIXACL; 768 sb->s_flags |= MS_POSIXACL;
771#endif 769#endif
772 770 sb->s_flags |= MS_I_VERSION;
773 err = open_ctree(sb, fs_devices, (char *)data); 771 err = open_ctree(sb, fs_devices, (char *)data);
774 if (err) { 772 if (err) {
775 printk("btrfs: open_ctree failed\n"); 773 printk("btrfs: open_ctree failed\n");
@@ -925,63 +923,48 @@ static inline int is_subvolume_inode(struct inode *inode)
925 */ 923 */
926static char *setup_root_args(char *args) 924static char *setup_root_args(char *args)
927{ 925{
928 unsigned copied = 0; 926 unsigned len = strlen(args) + 2 + 1;
929 unsigned len = strlen(args) + 2; 927 char *src, *dst, *buf;
930 char *pos;
931 char *ret;
932 928
933 /* 929 /*
934 * We need the same args as before, but minus 930 * We need the same args as before, but with this substitution:
935 * 931 * s!subvol=[^,]+!subvolid=0!
936 * subvol=a
937 *
938 * and add
939 *
940 * subvolid=0
941 * 932 *
942 * which is a difference of 2 characters, so we allocate strlen(args) + 933 * Since the replacement string is up to 2 bytes longer than the
943 * 2 characters. 934 * original, allocate strlen(args) + 2 + 1 bytes.
944 */ 935 */
945 ret = kzalloc(len * sizeof(char), GFP_NOFS);
946 if (!ret)
947 return NULL;
948 pos = strstr(args, "subvol=");
949 936
937 src = strstr(args, "subvol=");
950 /* This shouldn't happen, but just in case.. */ 938 /* This shouldn't happen, but just in case.. */
951 if (!pos) { 939 if (!src)
952 kfree(ret); 940 return NULL;
941
942 buf = dst = kmalloc(len, GFP_NOFS);
943 if (!buf)
953 return NULL; 944 return NULL;
954 }
955 945
956 /* 946 /*
957 * The subvol=<> arg is not at the front of the string, copy everybody 947 * If the subvol= arg is not at the start of the string,
958 * up to that into ret. 948 * copy whatever precedes it into buf.
959 */ 949 */
960 if (pos != args) { 950 if (src != args) {
961 *pos = '\0'; 951 *src++ = '\0';
962 strcpy(ret, args); 952 strcpy(buf, args);
963 copied += strlen(args); 953 dst += strlen(args);
964 pos++;
965 } 954 }
966 955
967 strncpy(ret + copied, "subvolid=0", len - copied); 956 strcpy(dst, "subvolid=0");
968 957 dst += strlen("subvolid=0");
969 /* Length of subvolid=0 */
970 copied += 10;
971 958
972 /* 959 /*
973 * If there is no , after the subvol= option then we know there's no 960 * If there is a "," after the original subvol=... string,
974 * other options and we can just return. 961 * copy that suffix into our buffer. Otherwise, we're done.
975 */ 962 */
976 pos = strchr(pos, ','); 963 src = strchr(src, ',');
977 if (!pos) 964 if (src)
978 return ret; 965 strcpy(dst, src);
979 966
980 /* Copy the rest of the arguments into our buffer */ 967 return buf;
981 strncpy(ret + copied, pos, len - copied);
982 copied += strlen(pos);
983
984 return ret;
985} 968}
986 969
987static struct dentry *mount_subvol(const char *subvol_name, int flags, 970static struct dentry *mount_subvol(const char *subvol_name, int flags,
@@ -1118,6 +1101,40 @@ error_fs_info:
1118 return ERR_PTR(error); 1101 return ERR_PTR(error);
1119} 1102}
1120 1103
1104static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
1105{
1106 spin_lock_irq(&workers->lock);
1107 workers->max_workers = new_limit;
1108 spin_unlock_irq(&workers->lock);
1109}
1110
1111static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1112 int new_pool_size, int old_pool_size)
1113{
1114 if (new_pool_size == old_pool_size)
1115 return;
1116
1117 fs_info->thread_pool_size = new_pool_size;
1118
1119 printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n",
1120 old_pool_size, new_pool_size);
1121
1122 btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
1123 btrfs_set_max_workers(&fs_info->workers, new_pool_size);
1124 btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size);
1125 btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size);
1126 btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size);
1127 btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size);
1128 btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size);
1129 btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size);
1130 btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size);
1131 btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size);
1132 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
1133 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
1134 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
1135 btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size);
1136}
1137
1121static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1138static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1122{ 1139{
1123 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 1140 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1137,6 +1154,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1137 goto restore; 1154 goto restore;
1138 } 1155 }
1139 1156
1157 btrfs_resize_thread_pool(fs_info,
1158 fs_info->thread_pool_size, old_thread_pool_size);
1159
1140 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1160 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
1141 return 0; 1161 return 0;
1142 1162
@@ -1180,7 +1200,8 @@ restore:
1180 fs_info->compress_type = old_compress_type; 1200 fs_info->compress_type = old_compress_type;
1181 fs_info->max_inline = old_max_inline; 1201 fs_info->max_inline = old_max_inline;
1182 fs_info->alloc_start = old_alloc_start; 1202 fs_info->alloc_start = old_alloc_start;
1183 fs_info->thread_pool_size = old_thread_pool_size; 1203 btrfs_resize_thread_pool(fs_info,
1204 old_thread_pool_size, fs_info->thread_pool_size);
1184 fs_info->metadata_ratio = old_metadata_ratio; 1205 fs_info->metadata_ratio = old_metadata_ratio;
1185 return ret; 1206 return ret;
1186} 1207}
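btrfs_remount() now funnels every worker pool through one helper that takes the pool's spinlock before publishing the new limit, and the restore: path undoes a failed remount by calling the same helper with the old and new sizes swapped. A pthread-based sketch of that shape; struct worker_pool and the function names are stand-ins, not the kernel API:

    #include <pthread.h>
    #include <stdio.h>

    /* Stand-in for struct btrfs_workers: worker threads read the
     * limit, so updates are published under the pool's lock. */
    struct worker_pool {
        pthread_mutex_t lock;
        int max_workers;
    };

    static void pool_set_max_workers(struct worker_pool *pool, int new_limit)
    {
        pthread_mutex_lock(&pool->lock);
        pool->max_workers = new_limit;
        pthread_mutex_unlock(&pool->lock);
    }

    /* Same shape as btrfs_resize_thread_pool(): a no-op when the
     * size is unchanged, and callable with the sizes swapped to
     * undo itself on the error path. */
    static void resize_thread_pools(struct worker_pool *pools, int npools,
                                    int new_size, int old_size)
    {
        int i;

        if (new_size == old_size)
            return;

        printf("resize thread pool %d -> %d\n", old_size, new_size);
        for (i = 0; i < npools; i++)
            pool_set_max_workers(&pools[i], new_size);
    }

    int main(void)
    {
        struct worker_pool pools[2] = {
            { PTHREAD_MUTEX_INITIALIZER, 4 },
            { PTHREAD_MUTEX_INITIALIZER, 4 },
        };

        resize_thread_pools(pools, 2, 8, 4); /* remount applies */
        resize_thread_pools(pools, 2, 4, 8); /* restore: undoes  */
        return 0;
    }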
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 36422254ef67..1791c6e3d834 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -28,6 +28,7 @@
28#include "locking.h" 28#include "locking.h"
29#include "tree-log.h" 29#include "tree-log.h"
30#include "inode-map.h" 30#include "inode-map.h"
31#include "volumes.h"
31 32
32#define BTRFS_ROOT_TRANS_TAG 0 33#define BTRFS_ROOT_TRANS_TAG 0
33 34
@@ -55,48 +56,49 @@ static noinline void switch_commit_root(struct btrfs_root *root)
55static noinline int join_transaction(struct btrfs_root *root, int nofail) 56static noinline int join_transaction(struct btrfs_root *root, int nofail)
56{ 57{
57 struct btrfs_transaction *cur_trans; 58 struct btrfs_transaction *cur_trans;
59 struct btrfs_fs_info *fs_info = root->fs_info;
58 60
59 spin_lock(&root->fs_info->trans_lock); 61 spin_lock(&fs_info->trans_lock);
60loop: 62loop:
61 /* The file system has been taken offline. No new transactions. */ 63 /* The file system has been taken offline. No new transactions. */
62 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 64 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
63 spin_unlock(&root->fs_info->trans_lock); 65 spin_unlock(&fs_info->trans_lock);
64 return -EROFS; 66 return -EROFS;
65 } 67 }
66 68
67 if (root->fs_info->trans_no_join) { 69 if (fs_info->trans_no_join) {
68 if (!nofail) { 70 if (!nofail) {
69 spin_unlock(&root->fs_info->trans_lock); 71 spin_unlock(&fs_info->trans_lock);
70 return -EBUSY; 72 return -EBUSY;
71 } 73 }
72 } 74 }
73 75
74 cur_trans = root->fs_info->running_transaction; 76 cur_trans = fs_info->running_transaction;
75 if (cur_trans) { 77 if (cur_trans) {
76 if (cur_trans->aborted) { 78 if (cur_trans->aborted) {
77 spin_unlock(&root->fs_info->trans_lock); 79 spin_unlock(&fs_info->trans_lock);
78 return cur_trans->aborted; 80 return cur_trans->aborted;
79 } 81 }
80 atomic_inc(&cur_trans->use_count); 82 atomic_inc(&cur_trans->use_count);
81 atomic_inc(&cur_trans->num_writers); 83 atomic_inc(&cur_trans->num_writers);
82 cur_trans->num_joined++; 84 cur_trans->num_joined++;
83 spin_unlock(&root->fs_info->trans_lock); 85 spin_unlock(&fs_info->trans_lock);
84 return 0; 86 return 0;
85 } 87 }
86 spin_unlock(&root->fs_info->trans_lock); 88 spin_unlock(&fs_info->trans_lock);
87 89
88 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 90 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
89 if (!cur_trans) 91 if (!cur_trans)
90 return -ENOMEM; 92 return -ENOMEM;
91 93
92 spin_lock(&root->fs_info->trans_lock); 94 spin_lock(&fs_info->trans_lock);
93 if (root->fs_info->running_transaction) { 95 if (fs_info->running_transaction) {
94 /* 96 /*
95 * someone started a transaction after we unlocked. Make sure 97 * someone started a transaction after we unlocked. Make sure
96 * to redo the trans_no_join checks above 98 * to redo the trans_no_join checks above
97 */ 99 */
98 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 100 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
99 cur_trans = root->fs_info->running_transaction; 101 cur_trans = fs_info->running_transaction;
100 goto loop; 102 goto loop;
101 } 103 }
102 104
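Beyond caching root->fs_info in a local, the hunk above preserves join_transaction()'s optimistic allocation: kmem_cache_alloc(GFP_NOFS) may sleep, so the spinlock is dropped for the allocation and running_transaction is rechecked after retaking it; the loser of the race frees its transaction and joins the winner's. A pthread sketch of that lock/unlock/alloc/recheck loop, simplified to a single refcount and no error states:

    #include <pthread.h>
    #include <stdlib.h>

    struct transaction {
        int transid;
        int use_count;
    };

    static pthread_mutex_t trans_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct transaction *running_transaction;
    static int generation;

    /* Allocate with the lock dropped (the allocation may sleep),
     * then recheck: if another thread installed a transaction in
     * the meantime, free ours and join theirs instead. */
    static struct transaction *join_transaction(void)
    {
        struct transaction *cur;

        pthread_mutex_lock(&trans_lock);
    loop:
        cur = running_transaction;
        if (cur) {
            cur->use_count++;
            pthread_mutex_unlock(&trans_lock);
            return cur;
        }
        pthread_mutex_unlock(&trans_lock);

        cur = malloc(sizeof(*cur));
        if (!cur)
            return NULL;

        pthread_mutex_lock(&trans_lock);
        if (running_transaction) {
            /* lost the race: discard our copy and retry */
            free(cur);
            goto loop;
        }
        cur->transid = ++generation;
        cur->use_count = 1;
        running_transaction = cur;
        pthread_mutex_unlock(&trans_lock);
        return cur;
    }

    int main(void)
    {
        struct transaction *t = join_transaction();

        if (!t)
            return 1;
        free(t);
        return 0;
    }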
@@ -121,20 +123,38 @@ loop:
121 cur_trans->delayed_refs.flushing = 0; 123 cur_trans->delayed_refs.flushing = 0;
122 cur_trans->delayed_refs.run_delayed_start = 0; 124 cur_trans->delayed_refs.run_delayed_start = 0;
123 cur_trans->delayed_refs.seq = 1; 125 cur_trans->delayed_refs.seq = 1;
126
127 /*
128 * although the tree mod log is per file system and not per transaction,
129 * the log must never go across transaction boundaries.
130 */
131 smp_mb();
132 if (!list_empty(&fs_info->tree_mod_seq_list)) {
133 printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
134 "creating a fresh transaction\n");
135 WARN_ON(1);
136 }
137 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
138 printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
139 "creating a fresh transaction\n");
140 WARN_ON(1);
141 }
142 atomic_set(&fs_info->tree_mod_seq, 0);
143
124 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); 144 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
125 spin_lock_init(&cur_trans->commit_lock); 145 spin_lock_init(&cur_trans->commit_lock);
126 spin_lock_init(&cur_trans->delayed_refs.lock); 146 spin_lock_init(&cur_trans->delayed_refs.lock);
127 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); 147 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
128 148
129 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 149 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
130 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 150 list_add_tail(&cur_trans->list, &fs_info->trans_list);
131 extent_io_tree_init(&cur_trans->dirty_pages, 151 extent_io_tree_init(&cur_trans->dirty_pages,
132 root->fs_info->btree_inode->i_mapping); 152 fs_info->btree_inode->i_mapping);
133 root->fs_info->generation++; 153 fs_info->generation++;
134 cur_trans->transid = root->fs_info->generation; 154 cur_trans->transid = fs_info->generation;
135 root->fs_info->running_transaction = cur_trans; 155 fs_info->running_transaction = cur_trans;
136 cur_trans->aborted = 0; 156 cur_trans->aborted = 0;
137 spin_unlock(&root->fs_info->trans_lock); 157 spin_unlock(&fs_info->trans_lock);
138 158
139 return 0; 159 return 0;
140} 160}
@@ -758,6 +778,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
758 if (ret) 778 if (ret)
759 return ret; 779 return ret;
760 780
781 ret = btrfs_run_dev_stats(trans, root->fs_info);
782 BUG_ON(ret);
783
761 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 784 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
762 next = fs_info->dirty_cowonly_roots.next; 785 next = fs_info->dirty_cowonly_roots.next;
763 list_del_init(next); 786 list_del_init(next);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index eb1ae908582c..2017d0ff511c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1628,7 +1628,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1628 int i; 1628 int i;
1629 int ret; 1629 int ret;
1630 1630
1631 btrfs_read_buffer(eb, gen); 1631 ret = btrfs_read_buffer(eb, gen);
1632 if (ret)
1633 return ret;
1632 1634
1633 level = btrfs_header_level(eb); 1635 level = btrfs_header_level(eb);
1634 1636
@@ -1749,7 +1751,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1749 1751
1750 path->slots[*level]++; 1752 path->slots[*level]++;
1751 if (wc->free) { 1753 if (wc->free) {
1752 btrfs_read_buffer(next, ptr_gen); 1754 ret = btrfs_read_buffer(next, ptr_gen);
1755 if (ret) {
1756 free_extent_buffer(next);
1757 return ret;
1758 }
1753 1759
1754 btrfs_tree_lock(next); 1760 btrfs_tree_lock(next);
1755 btrfs_set_lock_blocking(next); 1761 btrfs_set_lock_blocking(next);
@@ -1766,7 +1772,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1766 free_extent_buffer(next); 1772 free_extent_buffer(next);
1767 continue; 1773 continue;
1768 } 1774 }
1769 btrfs_read_buffer(next, ptr_gen); 1775 ret = btrfs_read_buffer(next, ptr_gen);
1776 if (ret) {
1777 free_extent_buffer(next);
1778 return ret;
1779 }
1770 1780
1771 WARN_ON(*level <= 0); 1781 WARN_ON(*level <= 0);
1772 if (path->nodes[*level-1]) 1782 if (path->nodes[*level-1])
@@ -2657,6 +2667,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2657 btrfs_release_path(path); 2667 btrfs_release_path(path);
2658 } 2668 }
2659 btrfs_release_path(path); 2669 btrfs_release_path(path);
2670 if (ret > 0)
2671 ret = 0;
2660 return ret; 2672 return ret;
2661} 2673}
2662 2674
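The two-line fix in drop_objectid_items() reflects the btrfs_search_slot() convention: a negative return is an error, zero means the key was found, and a positive return means it was not found (the path then points at where it would be inserted). A helper that deletes items if present must translate "not found" into success before returning. A toy illustration of the convention, with made-up names:

    #include <stdio.h>

    /* Toy search: < 0 error, 0 key found, > 0 key not found. */
    static int search(const int *keys, int n, int key)
    {
        int i;

        for (i = 0; i < n; i++)
            if (keys[i] == key)
                return 0;
        return 1;
    }

    /* Deleting something that is already gone is success, so a
     * positive "not found" return is normalized to zero. */
    static int drop_item(const int *keys, int n, int key)
    {
        int ret = search(keys, n, key);

        if (ret > 0)
            ret = 0;
        return ret;
    }

    int main(void)
    {
        int keys[] = { 1, 2, 3 };

        printf("%d\n", drop_item(keys, 3, 9)); /* 0, not an error */
        return 0;
    }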
@@ -3028,21 +3040,6 @@ out:
3028 return ret; 3040 return ret;
3029} 3041}
3030 3042
3031static int inode_in_log(struct btrfs_trans_handle *trans,
3032 struct inode *inode)
3033{
3034 struct btrfs_root *root = BTRFS_I(inode)->root;
3035 int ret = 0;
3036
3037 mutex_lock(&root->log_mutex);
3038 if (BTRFS_I(inode)->logged_trans == trans->transid &&
3039 BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
3040 ret = 1;
3041 mutex_unlock(&root->log_mutex);
3042 return ret;
3043}
3044
3045
3046/* 3043/*
3047 * helper function around btrfs_log_inode to make sure newly created 3044 * helper function around btrfs_log_inode to make sure newly created
3048 * parent directories also end up in the log. A minimal inode and backref 3045 * parent directories also end up in the log. A minimal inode and backref
@@ -3083,7 +3080,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3083 if (ret) 3080 if (ret)
3084 goto end_no_trans; 3081 goto end_no_trans;
3085 3082
3086 if (inode_in_log(trans, inode)) { 3083 if (btrfs_inode_in_log(inode, trans->transid)) {
3087 ret = BTRFS_NO_LOG_SYNC; 3084 ret = BTRFS_NO_LOG_SYNC;
3088 goto end_no_trans; 3085 goto end_no_trans;
3089 } 3086 }
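The tree-log changes stop discarding btrfs_read_buffer() return values: a failed read of a log block now aborts the replay, and in walk_down_log_tree() the caller's reference to the child buffer is dropped before the error is propagated, so the buffer is not leaked. A refcount sketch of that release-before-return shape; all names here are illustrative:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct buffer {
        int refs;
        char *data;
    };

    static void free_buffer(struct buffer *b)
    {
        if (--b->refs == 0) {
            free(b->data);
            free(b);
        }
    }

    /* May fail, e.g. on a checksum mismatch. */
    static int read_buffer(struct buffer *b)
    {
        return b->data ? 0 : -EIO;
    }

    static int walk_child(struct buffer *next)
    {
        int ret;

        ret = read_buffer(next);
        if (ret) {
            /* drop our reference before unwinding, as the
             * walk_down_log_tree() hunks now do */
            free_buffer(next);
            return ret;
        }
        /* ... descend into next ... */
        free_buffer(next);
        return 0;
    }

    int main(void)
    {
        struct buffer *next = calloc(1, sizeof(*next));

        if (!next)
            return 1;
        next->refs = 1;    /* our reference */
        next->data = NULL; /* simulate a failed read */
        printf("walk_child: %d\n", walk_child(next));
        return 0;
    }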
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 12f5147bd2b1..ab942f46b3dd 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -23,9 +23,9 @@
23 * 23 *
24 * ulist = ulist_alloc(); 24 * ulist = ulist_alloc();
25 * ulist_add(ulist, root); 25 * ulist_add(ulist, root);
26 * elem = NULL; 26 * ULIST_ITER_INIT(&uiter);
27 * 27 *
28 * while ((elem = ulist_next(ulist, elem))) { 28 * while ((elem = ulist_next(ulist, &uiter))) {
29 * for (all child nodes n in elem) 29 * for (all child nodes n in elem)
30 * ulist_add(ulist, n); 30 * ulist_add(ulist, n);
31 * do something useful with the node; 31 * do something useful with the node;
@@ -95,7 +95,7 @@ EXPORT_SYMBOL(ulist_reinit);
95 * 95 *
96 * The allocated ulist will be returned in an initialized state. 96 * The allocated ulist will be returned in an initialized state.
97 */ 97 */
98struct ulist *ulist_alloc(unsigned long gfp_mask) 98struct ulist *ulist_alloc(gfp_t gfp_mask)
99{ 99{
100 struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); 100 struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
101 101
@@ -144,13 +144,22 @@ EXPORT_SYMBOL(ulist_free);
144 * unaltered. 144 * unaltered.
145 */ 145 */
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
147 unsigned long gfp_mask) 147 gfp_t gfp_mask)
148{
149 return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
150}
151
152int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
153 unsigned long *old_aux, gfp_t gfp_mask)
148{ 154{
149 int i; 155 int i;
150 156
151 for (i = 0; i < ulist->nnodes; ++i) { 157 for (i = 0; i < ulist->nnodes; ++i) {
152 if (ulist->nodes[i].val == val) 158 if (ulist->nodes[i].val == val) {
159 if (old_aux)
160 *old_aux = ulist->nodes[i].aux;
153 return 0; 161 return 0;
162 }
154 } 163 }
155 164
156 if (ulist->nnodes >= ulist->nodes_alloced) { 165 if (ulist->nnodes >= ulist->nodes_alloced) {
@@ -188,33 +197,26 @@ EXPORT_SYMBOL(ulist_add);
188/** 197/**
189 * ulist_next - iterate ulist 198 * ulist_next - iterate ulist
190 * @ulist: ulist to iterate 199 * @ulist: ulist to iterate
191 * @prev: previously returned element or %NULL to start iteration 200 * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator)
192 * 201 *
193 * Note: locking must be provided by the caller. In case of rwlocks only read 202 * Note: locking must be provided by the caller. In case of rwlocks only read
194 * locking is needed 203 * locking is needed
195 * 204 *
196 * This function is used to iterate a ulist. The iteration is started with 205 * This function is used to iterate a ulist.
197 * @prev = %NULL. It returns the next element from the ulist or %NULL when the 206 * It returns the next element from the ulist or %NULL when the
198 * end is reached. No guarantee is made with respect to the order in which 207 * end is reached. No guarantee is made with respect to the order in which
199 * the elements are returned. They come back neither in order of 208 * the elements are returned. They come back neither in order of
200 * addition nor in ascending order. 209 * addition nor in ascending order.
201 * It is allowed to call ulist_add during an enumeration. Newly added items 210 * It is allowed to call ulist_add during an enumeration. Newly added items
202 * are guaranteed to show up in the running enumeration. 211 * are guaranteed to show up in the running enumeration.
203 */ 212 */
204struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) 213struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
205{ 214{
206 int next;
207
208 if (ulist->nnodes == 0) 215 if (ulist->nnodes == 0)
209 return NULL; 216 return NULL;
210 217 if (uiter->i < 0 || uiter->i >= ulist->nnodes)
211 if (!prev)
212 return &ulist->nodes[0];
213
214 next = (prev - ulist->nodes) + 1;
215 if (next < 0 || next >= ulist->nnodes)
216 return NULL; 218 return NULL;
217 219
218 return &ulist->nodes[next]; 220 return &ulist->nodes[uiter->i++];
219} 221}
220EXPORT_SYMBOL(ulist_next); 222EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 2e25dec58ec0..21bdc8ec8130 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -24,6 +24,10 @@
24 */ 24 */
25#define ULIST_SIZE 16 25#define ULIST_SIZE 16
26 26
27struct ulist_iterator {
28 int i;
29};
30
27/* 31/*
28 * element of the list 32 * element of the list
29 */ 33 */
@@ -59,10 +63,15 @@ struct ulist {
59void ulist_init(struct ulist *ulist); 63void ulist_init(struct ulist *ulist);
60void ulist_fini(struct ulist *ulist); 64void ulist_fini(struct ulist *ulist);
61void ulist_reinit(struct ulist *ulist); 65void ulist_reinit(struct ulist *ulist);
62struct ulist *ulist_alloc(unsigned long gfp_mask); 66struct ulist *ulist_alloc(gfp_t gfp_mask);
63void ulist_free(struct ulist *ulist); 67void ulist_free(struct ulist *ulist);
64int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 68int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
65 unsigned long gfp_mask); 69 gfp_t gfp_mask);
66struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); 70int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
71 unsigned long *old_aux, gfp_t gfp_mask);
72struct ulist_node *ulist_next(struct ulist *ulist,
73 struct ulist_iterator *uiter);
74
75#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0)
67 76
68#endif 77#endif
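Two things change in the ulist API: gfp_mask gets its proper gfp_t type, and iteration moves from a previous-element pointer to an index-based iterator. The index is what makes it safe to call ulist_add() during an enumeration; ulist_add() may reallocate the node array, which would leave a stale element pointer dangling, while an index stays valid across the move. A user-space sketch, leaving out the inline ULIST_SIZE array and the aux-merging variant:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct unode {
        uint64_t val;
        unsigned long aux;
    };

    struct ulist {
        int nnodes;
        int alloced;
        struct unode *nodes;
    };

    struct ulist_iterator { int i; };
    #define ULIST_ITER_INIT(uiter) ((uiter)->i = 0)

    /* 1 if added, 0 if val was already present, -1 on OOM */
    static int ulist_add(struct ulist *ul, uint64_t val, unsigned long aux)
    {
        int i;

        for (i = 0; i < ul->nnodes; i++)
            if (ul->nodes[i].val == val)
                return 0;
        if (ul->nnodes >= ul->alloced) {
            int cap = ul->alloced ? 2 * ul->alloced : 16;
            struct unode *tmp = realloc(ul->nodes, cap * sizeof(*tmp));

            if (!tmp)
                return -1;
            ul->nodes = tmp;
            ul->alloced = cap;
        }
        ul->nodes[ul->nnodes].val = val;
        ul->nodes[ul->nnodes].aux = aux;
        ul->nnodes++;
        return 1;
    }

    /* An index survives realloc(); the old 'prev' pointer did not. */
    static struct unode *ulist_next(struct ulist *ul,
                                    struct ulist_iterator *uiter)
    {
        if (uiter->i < 0 || uiter->i >= ul->nnodes)
            return NULL;
        return &ul->nodes[uiter->i++];
    }

    int main(void)
    {
        struct ulist ul = { 0, 0, NULL };
        struct ulist_iterator uiter;
        struct unode *n;

        ulist_add(&ul, 1, 0);
        ULIST_ITER_INIT(&uiter);
        while ((n = ulist_next(&ul, &uiter))) {
            printf("%llu\n", (unsigned long long)n->val);
            if (n->val < 3) /* new items show up in this walk */
                ulist_add(&ul, n->val + 1, 0);
        }
        free(ul.nodes);
        return 0;
    }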
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1411b99555a4..7782020996fe 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h>
26#include <linux/kthread.h> 27#include <linux/kthread.h>
27#include <asm/div64.h> 28#include <asm/div64.h>
28#include "compat.h" 29#include "compat.h"
@@ -39,6 +40,8 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
39 struct btrfs_root *root, 40 struct btrfs_root *root,
40 struct btrfs_device *device); 41 struct btrfs_device *device);
41static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 42static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
43static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
44static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
42 45
43static DEFINE_MUTEX(uuid_mutex); 46static DEFINE_MUTEX(uuid_mutex);
44static LIST_HEAD(fs_uuids); 47static LIST_HEAD(fs_uuids);
@@ -361,6 +364,7 @@ static noinline int device_list_add(const char *path,
361 return -ENOMEM; 364 return -ENOMEM;
362 } 365 }
363 device->devid = devid; 366 device->devid = devid;
367 device->dev_stats_valid = 0;
364 device->work.func = pending_bios_fn; 368 device->work.func = pending_bios_fn;
365 memcpy(device->uuid, disk_super->dev_item.uuid, 369 memcpy(device->uuid, disk_super->dev_item.uuid,
366 BTRFS_UUID_SIZE); 370 BTRFS_UUID_SIZE);
@@ -1633,7 +1637,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1633 int ret = 0; 1637 int ret = 0;
1634 1638
1635 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1639 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1636 return -EINVAL; 1640 return -EROFS;
1637 1641
1638 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 1642 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1639 root->fs_info->bdev_holder); 1643 root->fs_info->bdev_holder);
@@ -4001,13 +4005,58 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4001 return 0; 4005 return 0;
4002} 4006}
4003 4007
4008static void *merge_stripe_index_into_bio_private(void *bi_private,
4009 unsigned int stripe_index)
4010{
4011 /*
4012 * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
4013 * at most 1.
4014 * The alternative solution (instead of stealing bits from the
4015 * pointer) would be to allocate an intermediate structure
4016 * that contains the old private pointer plus the stripe_index.
4017 */
4018 BUG_ON((((uintptr_t)bi_private) & 3) != 0);
4019 BUG_ON(stripe_index > 3);
4020 return (void *)(((uintptr_t)bi_private) | stripe_index);
4021}
4022
4023static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
4024{
4025 return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
4026}
4027
4028static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
4029{
4030 return (unsigned int)((uintptr_t)bi_private) & 3;
4031}
4032
4004static void btrfs_end_bio(struct bio *bio, int err) 4033static void btrfs_end_bio(struct bio *bio, int err)
4005{ 4034{
4006 struct btrfs_bio *bbio = bio->bi_private; 4035 struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
4007 int is_orig_bio = 0; 4036 int is_orig_bio = 0;
4008 4037
4009 if (err) 4038 if (err) {
4010 atomic_inc(&bbio->error); 4039 atomic_inc(&bbio->error);
4040 if (err == -EIO || err == -EREMOTEIO) {
4041 unsigned int stripe_index =
4042 extract_stripe_index_from_bio_private(
4043 bio->bi_private);
4044 struct btrfs_device *dev;
4045
4046 BUG_ON(stripe_index >= bbio->num_stripes);
4047 dev = bbio->stripes[stripe_index].dev;
4048 if (bio->bi_rw & WRITE)
4049 btrfs_dev_stat_inc(dev,
4050 BTRFS_DEV_STAT_WRITE_ERRS);
4051 else
4052 btrfs_dev_stat_inc(dev,
4053 BTRFS_DEV_STAT_READ_ERRS);
4054 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
4055 btrfs_dev_stat_inc(dev,
4056 BTRFS_DEV_STAT_FLUSH_ERRS);
4057 btrfs_dev_stat_print_on_error(dev);
4058 }
4059 }
4011 4060
4012 if (bio == bbio->orig_bio) 4061 if (bio == bbio->orig_bio)
4013 is_orig_bio = 1; 4062 is_orig_bio = 1;
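To know which device a failed bio belongs to, merge_stripe_index_into_bio_private() smuggles the stripe index into the low two bits of bi_private; kmalloc()ed pointers are at least 4-byte aligned, so those bits are always zero, and btrfs_end_bio() can recover both the btrfs_bio and the index. A standalone sketch of the tagged-pointer trick; the pack/unpack names are invented:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* malloc(), like kmalloc(), returns pointers aligned to at
     * least 4 bytes, so the low two bits can carry an index 0..3. */
    static void *pack_index(void *ptr, unsigned int index)
    {
        assert(((uintptr_t)ptr & 3) == 0);
        assert(index <= 3);
        return (void *)((uintptr_t)ptr | index);
    }

    static void *unpack_ptr(void *tagged)
    {
        return (void *)((uintptr_t)tagged & ~(uintptr_t)3);
    }

    static unsigned int unpack_index(void *tagged)
    {
        return (unsigned int)((uintptr_t)tagged & 3);
    }

    int main(void)
    {
        int *data = malloc(sizeof(*data));
        void *tagged;

        if (!data)
            return 1;
        *data = 42;
        tagged = pack_index(data, 2);
        printf("value %d, stripe %u\n",
               *(int *)unpack_ptr(tagged), unpack_index(tagged));
        free(data);
        return 0;
    }

As the comment in the hunk says, the alternative would be an intermediate structure carrying the original private pointer plus the index; the tag avoids one allocation per bio at the price of the alignment assumption that the BUG_ON() checks.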
@@ -4149,6 +4198,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4149 bio = first_bio; 4198 bio = first_bio;
4150 } 4199 }
4151 bio->bi_private = bbio; 4200 bio->bi_private = bbio;
4201 bio->bi_private = merge_stripe_index_into_bio_private(
4202 bio->bi_private, (unsigned int)dev_nr);
4152 bio->bi_end_io = btrfs_end_bio; 4203 bio->bi_end_io = btrfs_end_bio;
4153 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; 4204 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4154 dev = bbio->stripes[dev_nr].dev; 4205 dev = bbio->stripes[dev_nr].dev;
@@ -4509,6 +4560,28 @@ int btrfs_read_sys_array(struct btrfs_root *root)
4509 return ret; 4560 return ret;
4510} 4561}
4511 4562
4563struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
4564 u64 logical, int mirror_num)
4565{
4566 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4567 int ret;
4568 u64 map_length = 0;
4569 struct btrfs_bio *bbio = NULL;
4570 struct btrfs_device *device;
4571
4572 BUG_ON(mirror_num == 0);
4573 ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
4574 mirror_num);
4575 if (ret) {
4576 BUG_ON(bbio != NULL);
4577 return NULL;
4578 }
4579 BUG_ON(mirror_num != bbio->mirror_num);
4580 device = bbio->stripes[mirror_num - 1].dev;
4581 kfree(bbio);
4582 return device;
4583}
4584
4512int btrfs_read_chunk_tree(struct btrfs_root *root) 4585int btrfs_read_chunk_tree(struct btrfs_root *root)
4513{ 4586{
4514 struct btrfs_path *path; 4587 struct btrfs_path *path;
@@ -4583,3 +4656,230 @@ error:
4583 btrfs_free_path(path); 4656 btrfs_free_path(path);
4584 return ret; 4657 return ret;
4585} 4658}
4659
4660static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
4661{
4662 int i;
4663
4664 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4665 btrfs_dev_stat_reset(dev, i);
4666}
4667
4668int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
4669{
4670 struct btrfs_key key;
4671 struct btrfs_key found_key;
4672 struct btrfs_root *dev_root = fs_info->dev_root;
4673 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
4674 struct extent_buffer *eb;
4675 int slot;
4676 int ret = 0;
4677 struct btrfs_device *device;
4678 struct btrfs_path *path = NULL;
4679 int i;
4680
4681 path = btrfs_alloc_path();
4682 if (!path) {
4683 ret = -ENOMEM;
4684 goto out;
4685 }
4686
4687 mutex_lock(&fs_devices->device_list_mutex);
4688 list_for_each_entry(device, &fs_devices->devices, dev_list) {
4689 int item_size;
4690 struct btrfs_dev_stats_item *ptr;
4691
4692 key.objectid = 0;
4693 key.type = BTRFS_DEV_STATS_KEY;
4694 key.offset = device->devid;
4695 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
4696 if (ret) {
4697 printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
4698 device->name, (unsigned long long)device->devid);
4699 __btrfs_reset_dev_stats(device);
4700 device->dev_stats_valid = 1;
4701 btrfs_release_path(path);
4702 continue;
4703 }
4704 slot = path->slots[0];
4705 eb = path->nodes[0];
4706 btrfs_item_key_to_cpu(eb, &found_key, slot);
4707 item_size = btrfs_item_size_nr(eb, slot);
4708
4709 ptr = btrfs_item_ptr(eb, slot,
4710 struct btrfs_dev_stats_item);
4711
4712 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4713 if (item_size >= (1 + i) * sizeof(__le64))
4714 btrfs_dev_stat_set(device, i,
4715 btrfs_dev_stats_value(eb, ptr, i));
4716 else
4717 btrfs_dev_stat_reset(device, i);
4718 }
4719
4720 device->dev_stats_valid = 1;
4721 btrfs_dev_stat_print_on_load(device);
4722 btrfs_release_path(path);
4723 }
4724 mutex_unlock(&fs_devices->device_list_mutex);
4725
4726out:
4727 btrfs_free_path(path);
4728 return ret < 0 ? ret : 0;
4729}
4730
4731static int update_dev_stat_item(struct btrfs_trans_handle *trans,
4732 struct btrfs_root *dev_root,
4733 struct btrfs_device *device)
4734{
4735 struct btrfs_path *path;
4736 struct btrfs_key key;
4737 struct extent_buffer *eb;
4738 struct btrfs_dev_stats_item *ptr;
4739 int ret;
4740 int i;
4741
4742 key.objectid = 0;
4743 key.type = BTRFS_DEV_STATS_KEY;
4744 key.offset = device->devid;
4745
4746 path = btrfs_alloc_path();
4747 BUG_ON(!path);
4748 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
4749 if (ret < 0) {
4750 printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
4751 ret, device->name);
4752 goto out;
4753 }
4754
4755 if (ret == 0 &&
4756 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
4757 /* need to delete old one and insert a new one */
4758 ret = btrfs_del_item(trans, dev_root, path);
4759 if (ret != 0) {
4760 printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
4761 device->name, ret);
4762 goto out;
4763 }
4764 ret = 1;
4765 }
4766
4767 if (ret == 1) {
4768 /* need to insert a new item */
4769 btrfs_release_path(path);
4770 ret = btrfs_insert_empty_item(trans, dev_root, path,
4771 &key, sizeof(*ptr));
4772 if (ret < 0) {
4773 printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
4774 device->name, ret);
4775 goto out;
4776 }
4777 }
4778
4779 eb = path->nodes[0];
4780 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
4781 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4782 btrfs_set_dev_stats_value(eb, ptr, i,
4783 btrfs_dev_stat_read(device, i));
4784 btrfs_mark_buffer_dirty(eb);
4785
4786out:
4787 btrfs_free_path(path);
4788 return ret;
4789}
4790
4791/*
4792 * called from commit_transaction. Writes all changed device stats to disk.
4793 */
4794int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
4795 struct btrfs_fs_info *fs_info)
4796{
4797 struct btrfs_root *dev_root = fs_info->dev_root;
4798 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
4799 struct btrfs_device *device;
4800 int ret = 0;
4801
4802 mutex_lock(&fs_devices->device_list_mutex);
4803 list_for_each_entry(device, &fs_devices->devices, dev_list) {
4804 if (!device->dev_stats_valid || !device->dev_stats_dirty)
4805 continue;
4806
4807 ret = update_dev_stat_item(trans, dev_root, device);
4808 if (!ret)
4809 device->dev_stats_dirty = 0;
4810 }
4811 mutex_unlock(&fs_devices->device_list_mutex);
4812
4813 return ret;
4814}
4815
4816void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
4817{
4818 btrfs_dev_stat_inc(dev, index);
4819 btrfs_dev_stat_print_on_error(dev);
4820}
4821
4822void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
4823{
4824 if (!dev->dev_stats_valid)
4825 return;
4826 printk_ratelimited(KERN_ERR
4827 "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
4828 dev->name,
4829 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
4830 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
4831 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
4832 btrfs_dev_stat_read(dev,
4833 BTRFS_DEV_STAT_CORRUPTION_ERRS),
4834 btrfs_dev_stat_read(dev,
4835 BTRFS_DEV_STAT_GENERATION_ERRS));
4836}
4837
4838static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
4839{
4840 printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
4841 dev->name,
4842 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
4843 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
4844 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
4845 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
4846 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
4847}
4848
4849int btrfs_get_dev_stats(struct btrfs_root *root,
4850 struct btrfs_ioctl_get_dev_stats *stats,
4851 int reset_after_read)
4852{
4853 struct btrfs_device *dev;
4854 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4855 int i;
4856
4857 mutex_lock(&fs_devices->device_list_mutex);
4858 dev = btrfs_find_device(root, stats->devid, NULL, NULL);
4859 mutex_unlock(&fs_devices->device_list_mutex);
4860
4861 if (!dev) {
4862 printk(KERN_WARNING
4863 "btrfs: get dev_stats failed, device not found\n");
4864 return -ENODEV;
4865 } else if (!dev->dev_stats_valid) {
4866 printk(KERN_WARNING
4867 "btrfs: get dev_stats failed, not yet valid\n");
4868 return -ENODEV;
4869 } else if (reset_after_read) {
4870 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4871 if (stats->nr_items > i)
4872 stats->values[i] =
4873 btrfs_dev_stat_read_and_reset(dev, i);
4874 else
4875 btrfs_dev_stat_reset(dev, i);
4876 }
4877 } else {
4878 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4879 if (stats->nr_items > i)
4880 stats->values[i] = btrfs_dev_stat_read(dev, i);
4881 }
4882 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
4883 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
4884 return 0;
4885}
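Persistence of the counters is deliberately tolerant of size changes: btrfs_init_dev_stats() zeroes any counter the on-disk item is too short to hold, so items written before a counter existed still load, and update_dev_stat_item() handles the other direction by deleting a too-small item and inserting a full-size one. A sketch of the size-gated load, with invented names:

    #include <stdint.h>
    #include <stdio.h>

    #define NSTATS 5 /* current number of counters */

    /* Load counters from a serialized item that may have been
     * written by a version with fewer counters: missing slots
     * read as zero, mirroring the item_size check above. */
    static void load_dev_stats(uint64_t *stats,
                               const uint64_t *item, size_t item_size)
    {
        int i;

        for (i = 0; i < NSTATS; i++) {
            if (item_size >= (size_t)(i + 1) * sizeof(uint64_t))
                stats[i] = item[i];
            else
                stats[i] = 0;
        }
    }

    int main(void)
    {
        uint64_t old_item[3] = { 7, 1, 0 }; /* item with 3 counters */
        uint64_t stats[NSTATS];
        int i;

        load_dev_stats(stats, old_item, sizeof(old_item));
        for (i = 0; i < NSTATS; i++)
            printf("stat[%d] = %llu\n", i,
                   (unsigned long long)stats[i]);
        return 0;
    }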
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bb6b03f97aaa..3406a88ca83e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -22,6 +22,7 @@
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h> 23#include <linux/sort.h>
24#include "async-thread.h" 24#include "async-thread.h"
25#include "ioctl.h"
25 26
26#define BTRFS_STRIPE_LEN (64 * 1024) 27#define BTRFS_STRIPE_LEN (64 * 1024)
27 28
@@ -106,6 +107,11 @@ struct btrfs_device {
106 struct completion flush_wait; 107 struct completion flush_wait;
107 int nobarriers; 108 int nobarriers;
108 109
110 /* disk I/O failure stats. For detailed description refer to
111 * enum btrfs_dev_stat_values in ioctl.h */
112 int dev_stats_valid;
113 int dev_stats_dirty; /* counters need to be written to disk */
114 atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
109}; 115};
110 116
111struct btrfs_fs_devices { 117struct btrfs_fs_devices {
@@ -281,4 +287,50 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
281int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 287int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
282int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 288int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
283 u64 *start, u64 *max_avail); 289 u64 *start, u64 *max_avail);
290struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
291 u64 logical, int mirror_num);
292void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
293void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
294int btrfs_get_dev_stats(struct btrfs_root *root,
295 struct btrfs_ioctl_get_dev_stats *stats,
296 int reset_after_read);
297int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
298int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
299 struct btrfs_fs_info *fs_info);
300
301static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
302 int index)
303{
304 atomic_inc(dev->dev_stat_values + index);
305 dev->dev_stats_dirty = 1;
306}
307
308static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
309 int index)
310{
311 return atomic_read(dev->dev_stat_values + index);
312}
313
314static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
315 int index)
316{
317 int ret;
318
319 ret = atomic_xchg(dev->dev_stat_values + index, 0);
320 dev->dev_stats_dirty = 1;
321 return ret;
322}
323
324static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
325 int index, unsigned long val)
326{
327 atomic_set(dev->dev_stat_values + index, val);
328 dev->dev_stats_dirty = 1;
329}
330
331static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
332 int index)
333{
334 btrfs_dev_stat_set(dev, index, 0);
335}
284#endif 336#endif
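The inline helpers keep the per-device counters as atomics, so the bio completion path can bump them without taking a lock, and btrfs_dev_stat_read_and_reset() uses atomic_xchg() so that an increment racing with the reset lands either in the returned snapshot or in the zeroed counter, never in a window between a separate read and write. A C11 stdatomic sketch of the same pattern; the struct and function names are invented:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define NSTATS 5

    struct device_stats {
        atomic_int values[NSTATS];
        atomic_bool dirty; /* counters need to be written back */
    };

    static void stat_inc(struct device_stats *s, int index)
    {
        atomic_fetch_add(&s->values[index], 1);
        atomic_store(&s->dirty, true);
    }

    /* One atomic exchange: a concurrent increment is either part
     * of the returned value or lands in the fresh counter. */
    static int stat_read_and_reset(struct device_stats *s, int index)
    {
        atomic_store(&s->dirty, true);
        return atomic_exchange(&s->values[index], 0);
    }

    int main(void)
    {
        struct device_stats s = { 0 };

        stat_inc(&s, 0);
        stat_inc(&s, 0);
        printf("%d\n", stat_read_and_reset(&s, 0)); /* 2 */
        printf("%d\n", atomic_load(&s.values[0]));  /* 0 */
        return 0;
    }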
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index e7a5659087e6..3f4e2d69e83a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -196,6 +196,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
196 if (ret) 196 if (ret)
197 goto out; 197 goto out;
198 198
199 inode_inc_iversion(inode);
199 inode->i_ctime = CURRENT_TIME; 200 inode->i_ctime = CURRENT_TIME;
200 ret = btrfs_update_inode(trans, root, inode); 201 ret = btrfs_update_inode(trans, root, inode);
201 BUG_ON(ret); 202 BUG_ON(ret);