author		Linus Torvalds <torvalds@linux-foundation.org>	2012-06-01 11:37:31 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-06-01 11:37:31 -0400
commit		51eab603f5c86dd1eae4c525df3e7f7eeab401d6 (patch)
tree		e7a8c6214b072db126cca62d39008b3620134798 /fs/btrfs
parent		419f4319495043a9507ac3e616be9ca60af09744 (diff)
parent		1e20932a23578bb1ec59107843574e259b96193f (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason:
 "This includes a fairly large change from Josef around data writeback
  completion.  Before, the writeback wasn't completed until the metadata
  insertions for the extent were done, and this made for fairly large
  latency spikes on the last page of each ordered extent.

  We already had a separate mechanism for tracking pending metadata
  insertions, so Josef just needed to tweak things a little to end
  writeback earlier on the page.  Overall it makes us much friendlier to
  memory reclaim and lowers latencies quite a lot for synchronous IO.

  Jan Schmidt has finished some background work required to track btree
  blocks as they go through changes in ownership.  It's the missing
  piece he needed for both btrfs send/receive and subvolume quotas.
  Neither of those is ready yet, but the new tracking code is included
  here.  Most of the time the new code is off; it is only used by scrub
  and other backref walkers.

  Stefan Behrens has added IO failure tracking.  This includes counters
  for which drives are causing the most trouble so the admin (or an
  automated tool) can choose to kick them out.  We're tracking IO
  errors, crc errors, and generation checks we do on each metadata
  block.

  RAID5/6 missed the cut this time because I'm having trouble with
  corruptions.  I'll nail it down next week and post it for beta
  testing before 3.6."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (58 commits)
  Btrfs: fix tree mod log rewinded level and rewinding of moved keys
  Btrfs: fix tree mod log del_ptr
  Btrfs: add tree_mod_dont_log helper
  Btrfs: add missing spin_lock for insertion into tree mod log
  Btrfs: add inodes before dropping the extent lock in find_all_leafs
  Btrfs: use delayed ref sequence numbers for all fs-tree updates
  Btrfs: fix false positive in check-integrity on unmount
  Btrfs: fix runtime warning in check-integrity check data mode
  Btrfs: set ioprio of scrub readahead to idle
  Btrfs: fix return code in drop_objectid_items
  Btrfs: check to see if the inode is in the log before fsyncing
  Btrfs: return value of btrfs_read_buffer is checked correctly
  Btrfs: read device stats on mount, write modified ones during commit
  Btrfs: add ioctl to get and reset the device stats
  Btrfs: add device counters for detected IO and checksum errors
  btrfs: Drop unused function btrfs_abort_devices()
  Btrfs: fix the same inode id problem when doing auto defragment
  Btrfs: fall back to non-inline if we don't have enough space
  Btrfs: fix how we deal with the orphan block rsv
  Btrfs: convert the inode bit field to use the actual bit operations
  ...
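As a rough userspace illustration of the new device-stats interface mentioned above (see the ioctl.c/ioctl.h entries in the diffstat below): a caller opens the mounted filesystem and issues the new ioctl to read the per-device error counters. This is a hedged sketch; BTRFS_IOC_GET_DEV_STATS and struct btrfs_ioctl_get_dev_stats are added by this series, but the exact field set and header location shown here are an assumption and should be checked against ioctl.h in this tree.

/* Hedged sketch: read the per-device error counters added by this
 * series. Field names and the header providing the definitions are
 * assumptions based on the patches in this pull, not verbatim API.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/btrfs.h>	/* assumed to carry the ioctl definitions */

int print_dev_stats(const char *mnt, __u64 devid)
{
	struct btrfs_ioctl_get_dev_stats stats;
	int fd = open(mnt, O_RDONLY);

	if (fd < 0)
		return -1;
	memset(&stats, 0, sizeof(stats));
	stats.devid = devid;
	stats.nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	if (ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &stats) < 0) {
		close(fd);
		return -1;
	}
	/* values[] holds IO errors, crc errors, generation errors, ... */
	printf("write_io_errs %llu\n",
	       (unsigned long long)stats.values[BTRFS_DEV_STAT_WRITE_ERRS]);
	close(fd);
	return 0;
}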
Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/acl.c               |    4
-rw-r--r--  fs/btrfs/backref.c           |  495
-rw-r--r--  fs/btrfs/backref.h           |    3
-rw-r--r--  fs/btrfs/btrfs_inode.h       |   50
-rw-r--r--  fs/btrfs/check-integrity.c   |  584
-rw-r--r--  fs/btrfs/ctree.c             |  861
-rw-r--r--  fs/btrfs/ctree.h             |   77
-rw-r--r--  fs/btrfs/delayed-inode.c     |    8
-rw-r--r--  fs/btrfs/delayed-ref.c       |   10
-rw-r--r--  fs/btrfs/delayed-ref.h       |   24
-rw-r--r--  fs/btrfs/disk-io.c           |   57
-rw-r--r--  fs/btrfs/disk-io.h           |    1
-rw-r--r--  fs/btrfs/extent-tree.c       |   23
-rw-r--r--  fs/btrfs/extent_io.c         |  168
-rw-r--r--  fs/btrfs/extent_io.h         |    8
-rw-r--r--  fs/btrfs/file.c              |   76
-rw-r--r--  fs/btrfs/free-space-cache.c  |   45
-rw-r--r--  fs/btrfs/inode.c             |  264
-rw-r--r--  fs/btrfs/ioctl.c             |   50
-rw-r--r--  fs/btrfs/ioctl.h             |   33
-rw-r--r--  fs/btrfs/ordered-data.c      |  165
-rw-r--r--  fs/btrfs/ordered-data.h      |   13
-rw-r--r--  fs/btrfs/print-tree.c        |    3
-rw-r--r--  fs/btrfs/reada.c             |    5
-rw-r--r--  fs/btrfs/scrub.c             |   65
-rw-r--r--  fs/btrfs/super.c             |  117
-rw-r--r--  fs/btrfs/transaction.c       |   59
-rw-r--r--  fs/btrfs/tree-log.c          |   35
-rw-r--r--  fs/btrfs/ulist.c             |   38
-rw-r--r--  fs/btrfs/ulist.h             |   15
-rw-r--r--  fs/btrfs/volumes.c           |  306
-rw-r--r--  fs/btrfs/volumes.h           |   52
-rw-r--r--  fs/btrfs/xattr.c             |    1
33 files changed, 2849 insertions(+), 866 deletions(-)
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 89b156d85d63..761e2cd8fed1 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
 		if (ret > 0) {
 			/* we need an acl */
 			ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
+		} else {
+			cache_no_acl(inode);
 		}
+	} else {
+		cache_no_acl(inode);
 	}
 failed:
 	posix_acl_release(acl);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index bcec06750232..3f75895c919b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -24,22 +24,135 @@
 #include "delayed-ref.h"
 #include "locking.h"
 
+struct extent_inode_elem {
+	u64 inum;
+	u64 offset;
+	struct extent_inode_elem *next;
+};
+
+static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
+				struct btrfs_file_extent_item *fi,
+				u64 extent_item_pos,
+				struct extent_inode_elem **eie)
+{
+	u64 data_offset;
+	u64 data_len;
+	struct extent_inode_elem *e;
+
+	data_offset = btrfs_file_extent_offset(eb, fi);
+	data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+	if (extent_item_pos < data_offset ||
+	    extent_item_pos >= data_offset + data_len)
+		return 1;
+
+	e = kmalloc(sizeof(*e), GFP_NOFS);
+	if (!e)
+		return -ENOMEM;
+
+	e->next = *eie;
+	e->inum = key->objectid;
+	e->offset = key->offset + (extent_item_pos - data_offset);
+	*eie = e;
+
+	return 0;
+}
+
+static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
+				u64 extent_item_pos,
+				struct extent_inode_elem **eie)
+{
+	u64 disk_byte;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	int slot;
+	int nritems;
+	int extent_type;
+	int ret;
+
+	/*
+	 * from the shared data ref, we only have the leaf but we need
+	 * the key. thus, we must look into all items and see that we
+	 * find one (some) with a reference to our extent item.
+	 */
+	nritems = btrfs_header_nritems(eb);
+	for (slot = 0; slot < nritems; ++slot) {
+		btrfs_item_key_to_cpu(eb, &key, slot);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(eb, fi);
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
+		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+		if (disk_byte != wanted_disk_byte)
+			continue;
+
+		ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
 /*
  * this structure records all encountered refs on the way up to the root
  */
 struct __prelim_ref {
 	struct list_head list;
 	u64 root_id;
-	struct btrfs_key key;
+	struct btrfs_key key_for_search;
 	int level;
 	int count;
+	struct extent_inode_elem *inode_list;
 	u64 parent;
 	u64 wanted_disk_byte;
 };
 
+/*
+ * the rules for all callers of this function are:
+ * - obtaining the parent is the goal
+ * - if you add a key, you must know that it is a correct key
+ * - if you cannot add the parent or a correct key, then we will look into the
+ *   block later to set a correct key
+ *
+ * delayed refs
+ * ============
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    -   |     -
+ *      key to resolve |    -   |     y    |    y   |     y
+ *  tree block logical |    -   |     -    |    -   |     -
+ *  root for resolving |    y   |     y    |    y   |     y
+ *
+ * - column 1:       we've the parent -> done
+ * - column 2, 3, 4: we use the key to find the parent
+ *
+ * on disk refs (inline or keyed)
+ * ==============================
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    y   |     -
+ *      key to resolve |    -   |     -    |    -   |     y
+ *  tree block logical |    y   |     y    |    y   |     y
+ *  root for resolving |    -   |     y    |    y   |     y
+ *
+ * - column 1, 3: we've the parent -> done
+ * - column 2:    we take the first key from the block to find the parent
+ *                (see __add_missing_keys)
+ * - column 4:    we use the key to find the parent
+ *
+ * additional information that's available but not required to find the parent
+ * block might help in merging entries to gain some speed.
+ */
+
 static int __add_prelim_ref(struct list_head *head, u64 root_id,
-			    struct btrfs_key *key, int level, u64 parent,
-			    u64 wanted_disk_byte, int count)
+			    struct btrfs_key *key, int level,
+			    u64 parent, u64 wanted_disk_byte, int count)
 {
 	struct __prelim_ref *ref;
 
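(Illustrative aside, not part of the patch: extent_inode_elem is a plain singly linked list. check_extent_in_eb() above prepends one node per file extent item that covers extent_item_pos; consumers walk ->next and kfree() each node, exactly as free_leaf_list() does further down in this diff. A minimal sketch of that consume-and-free pattern:)

static void walk_and_free(struct extent_inode_elem *list)
{
	struct extent_inode_elem *eie, *next;

	for (eie = list; eie; eie = next) {
		next = eie->next;
		/* (inum, offset) names one file range that uses the extent */
		pr_debug("inum %llu offset %llu\n", eie->inum, eie->offset);
		kfree(eie);
	}
}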
@@ -50,10 +163,11 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 
 	ref->root_id = root_id;
 	if (key)
-		ref->key = *key;
+		ref->key_for_search = *key;
 	else
-		memset(&ref->key, 0, sizeof(ref->key));
+		memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
 
+	ref->inode_list = NULL;
 	ref->level = level;
 	ref->count = count;
 	ref->parent = parent;
@@ -64,18 +178,26 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 }
 
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
-			   struct ulist *parents,
-			   struct extent_buffer *eb, int level,
-			   u64 wanted_objectid, u64 wanted_disk_byte)
+			   struct ulist *parents, int level,
+			   struct btrfs_key *key, u64 wanted_disk_byte,
+			   const u64 *extent_item_pos)
 {
 	int ret;
-	int slot;
+	int slot = path->slots[level];
+	struct extent_buffer *eb = path->nodes[level];
 	struct btrfs_file_extent_item *fi;
-	struct btrfs_key key;
+	struct extent_inode_elem *eie = NULL;
 	u64 disk_byte;
+	u64 wanted_objectid = key->objectid;
 
 add_parent:
-	ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+	if (level == 0 && extent_item_pos) {
+		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+		ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie);
+		if (ret < 0)
+			return ret;
+	}
+	ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS);
 	if (ret < 0)
 		return ret;
 
@@ -89,6 +211,7 @@ add_parent:
 	 * repeat this until we don't find any additional EXTENT_DATA items.
 	 */
 	while (1) {
+		eie = NULL;
 		ret = btrfs_next_leaf(root, path);
 		if (ret < 0)
 			return ret;
@@ -97,9 +220,9 @@ add_parent:
 
 		eb = path->nodes[0];
 		for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
-			btrfs_item_key_to_cpu(eb, &key, slot);
-			if (key.objectid != wanted_objectid ||
-			    key.type != BTRFS_EXTENT_DATA_KEY)
+			btrfs_item_key_to_cpu(eb, key, slot);
+			if (key->objectid != wanted_objectid ||
+			    key->type != BTRFS_EXTENT_DATA_KEY)
 				return 0;
 			fi = btrfs_item_ptr(eb, slot,
 					    struct btrfs_file_extent_item);
@@ -118,8 +241,10 @@ add_parent:
  */
 static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 				  int search_commit_root,
+				  u64 time_seq,
 				  struct __prelim_ref *ref,
-				  struct ulist *parents)
+				  struct ulist *parents,
+				  const u64 *extent_item_pos)
 {
 	struct btrfs_path *path;
 	struct btrfs_root *root;
@@ -152,12 +277,13 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 		goto out;
 
 	path->lowest_level = level;
-	ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+	ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
 	pr_debug("search slot in root %llu (level %d, ref count %d) returned "
 		 "%d for key (%llu %u %llu)\n",
 		 (unsigned long long)ref->root_id, level, ref->count, ret,
-		 (unsigned long long)ref->key.objectid, ref->key.type,
-		 (unsigned long long)ref->key.offset);
+		 (unsigned long long)ref->key_for_search.objectid,
+		 ref->key_for_search.type,
+		 (unsigned long long)ref->key_for_search.offset);
 	if (ret < 0)
 		goto out;
 
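(The switch from btrfs_search_slot() to btrfs_search_old_slot() is what ties backref resolution to Jan's tree mod log: the caller pins a sequence number and the search replays the tree as it looked at that point. A condensed, hypothetical helper showing the calling convention, built only from identifiers that appear elsewhere in this diff:)

static int search_at_seq(struct btrfs_fs_info *fs_info,
			 struct btrfs_root *root, struct btrfs_key *key,
			 struct btrfs_path *path)
{
	struct seq_list tree_mod_seq_elem = {};
	int ret;

	/* pin a tree mod log sequence point */
	btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
	/* search the tree as it looked at that sequence point */
	ret = btrfs_search_old_slot(root, key, path, tree_mod_seq_elem.seq);
	/* ... inspect path->nodes[]/path->slots[] ... */
	btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
	return ret;
}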
@@ -179,9 +305,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
 	}
 
-	/* the last two parameters will only be used for level == 0 */
-	ret = add_all_parents(root, path, parents, eb, level, key.objectid,
-			      ref->wanted_disk_byte);
+	ret = add_all_parents(root, path, parents, level, &key,
+			      ref->wanted_disk_byte, extent_item_pos);
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -191,8 +316,9 @@ out:
  * resolve all indirect backrefs from the list
  */
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
-				   int search_commit_root,
-				   struct list_head *head)
+				   int search_commit_root, u64 time_seq,
+				   struct list_head *head,
+				   const u64 *extent_item_pos)
 {
 	int err;
 	int ret = 0;
@@ -201,6 +327,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 	struct __prelim_ref *new_ref;
 	struct ulist *parents;
 	struct ulist_node *node;
+	struct ulist_iterator uiter;
 
 	parents = ulist_alloc(GFP_NOFS);
 	if (!parents)
@@ -217,7 +344,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		if (ref->count == 0)
 			continue;
 		err = __resolve_indirect_ref(fs_info, search_commit_root,
-					     ref, parents);
+					     time_seq, ref, parents,
+					     extent_item_pos);
 		if (err) {
 			if (ret == 0)
 				ret = err;
@@ -225,11 +353,14 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		}
 
 		/* we put the first parent into the ref at hand */
-		node = ulist_next(parents, NULL);
+		ULIST_ITER_INIT(&uiter);
+		node = ulist_next(parents, &uiter);
 		ref->parent = node ? node->val : 0;
+		ref->inode_list =
+			node ? (struct extent_inode_elem *)node->aux : 0;
 
 		/* additional parents require new refs being added here */
-		while ((node = ulist_next(parents, node))) {
+		while ((node = ulist_next(parents, &uiter))) {
 			new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
 			if (!new_ref) {
 				ret = -ENOMEM;
@@ -237,6 +368,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			}
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
+			new_ref->inode_list =
+				(struct extent_inode_elem *)node->aux;
 			list_add(&new_ref->list, &ref->list);
 		}
 		ulist_reinit(parents);
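(The hunk above also shows the new ulist iteration idiom this series introduces, cf. the ulist.c/ulist.h entries in the diffstat: instead of feeding the previous node back into ulist_next(), callers keep an explicit ulist_iterator. A minimal sketch of the pattern as used throughout this patch:)

static void walk_ulist(struct ulist *ul)
{
	struct ulist_iterator uiter;
	struct ulist_node *node;

	ULIST_ITER_INIT(&uiter);
	while ((node = ulist_next(ul, &uiter))) {
		/* node->val is the stored value, node->aux caller data */
		pr_debug("val %llu aux %#lx\n",
			 (unsigned long long)node->val, node->aux);
	}
}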
@@ -246,10 +379,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
+static inline int ref_for_same_block(struct __prelim_ref *ref1,
+				     struct __prelim_ref *ref2)
+{
+	if (ref1->level != ref2->level)
+		return 0;
+	if (ref1->root_id != ref2->root_id)
+		return 0;
+	if (ref1->key_for_search.type != ref2->key_for_search.type)
+		return 0;
+	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
+		return 0;
+	if (ref1->key_for_search.offset != ref2->key_for_search.offset)
+		return 0;
+	if (ref1->parent != ref2->parent)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * read tree blocks and add keys where required.
+ */
+static int __add_missing_keys(struct btrfs_fs_info *fs_info,
+			      struct list_head *head)
+{
+	struct list_head *pos;
+	struct extent_buffer *eb;
+
+	list_for_each(pos, head) {
+		struct __prelim_ref *ref;
+		ref = list_entry(pos, struct __prelim_ref, list);
+
+		if (ref->parent)
+			continue;
+		if (ref->key_for_search.type)
+			continue;
+		BUG_ON(!ref->wanted_disk_byte);
+		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
+				     fs_info->tree_root->leafsize, 0);
+		BUG_ON(!eb);
+		btrfs_tree_read_lock(eb);
+		if (btrfs_header_level(eb) == 0)
+			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
+		else
+			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
+		btrfs_tree_read_unlock(eb);
+		free_extent_buffer(eb);
+	}
+	return 0;
+}
+
 /*
  * merge two lists of backrefs and adjust counts accordingly
  *
  * mode = 1: merge identical keys, if key is set
+ *	     FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
+ *	     additionally, we could even add a key range for the blocks we
+ *	     looked into to merge even more (-> replace unresolved refs by those
+ *	     having a parent).
  * mode = 2: merge identical parents
  */
 static int __merge_refs(struct list_head *head, int mode)
@@ -263,20 +451,21 @@ static int __merge_refs(struct list_head *head, int mode)
 
 		ref1 = list_entry(pos1, struct __prelim_ref, list);
 
-		if (mode == 1 && ref1->key.type == 0)
-			continue;
 		for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
 		     pos2 = n2, n2 = pos2->next) {
 			struct __prelim_ref *ref2;
+			struct __prelim_ref *xchg;
 
 			ref2 = list_entry(pos2, struct __prelim_ref, list);
 
 			if (mode == 1) {
-				if (memcmp(&ref1->key, &ref2->key,
-					   sizeof(ref1->key)) ||
-				    ref1->level != ref2->level ||
-				    ref1->root_id != ref2->root_id)
+				if (!ref_for_same_block(ref1, ref2))
 					continue;
+				if (!ref1->parent && ref2->parent) {
+					xchg = ref1;
+					ref1 = ref2;
+					ref2 = xchg;
+				}
 				ref1->count += ref2->count;
 			} else {
 				if (ref1->parent != ref2->parent)
@@ -296,16 +485,17 @@ static int __merge_refs(struct list_head *head, int mode)
  * smaller or equal that seq to the list
  */
 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-			      struct btrfs_key *info_key,
 			      struct list_head *prefs)
 {
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	struct rb_node *n = &head->node.rb_node;
+	struct btrfs_key key;
+	struct btrfs_key op_key = {0};
 	int sgn;
 	int ret = 0;
 
 	if (extent_op && extent_op->update_key)
-		btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
 
 	while ((n = rb_prev(n))) {
 		struct btrfs_delayed_ref_node *node;
@@ -337,7 +527,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = __add_prelim_ref(prefs, ref->root, info_key,
+			ret = __add_prelim_ref(prefs, ref->root, &op_key,
 					       ref->level + 1, 0, node->bytenr,
 					       node->ref_mod * sgn);
 			break;
@@ -346,7 +536,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			struct btrfs_delayed_tree_ref *ref;
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = __add_prelim_ref(prefs, ref->root, info_key,
+			ret = __add_prelim_ref(prefs, ref->root, NULL,
 					       ref->level + 1, ref->parent,
 					       node->bytenr,
 					       node->ref_mod * sgn);
@@ -354,8 +544,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 		}
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_delayed_data_ref *ref;
-			struct btrfs_key key;
-
 			ref = btrfs_delayed_node_to_data_ref(node);
 
 			key.objectid = ref->objectid;
@@ -368,7 +556,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 		}
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			struct btrfs_delayed_data_ref *ref;
-			struct btrfs_key key;
 
 			ref = btrfs_delayed_node_to_data_ref(node);
 
@@ -394,8 +581,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
  */
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			     struct btrfs_path *path, u64 bytenr,
-			     struct btrfs_key *info_key, int *info_level,
-			     struct list_head *prefs)
+			     int *info_level, struct list_head *prefs)
 {
 	int ret = 0;
 	int slot;
@@ -411,7 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 	 * enumerate all inline refs
 	 */
 	leaf = path->nodes[0];
-	slot = path->slots[0] - 1;
+	slot = path->slots[0];
 
 	item_size = btrfs_item_size_nr(leaf, slot);
 	BUG_ON(item_size < sizeof(*ei));
@@ -424,12 +610,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		struct btrfs_tree_block_info *info;
-		struct btrfs_disk_key disk_key;
 
 		info = (struct btrfs_tree_block_info *)ptr;
 		*info_level = btrfs_tree_block_level(leaf, info);
-		btrfs_tree_block_key(leaf, info, &disk_key);
-		btrfs_disk_key_to_cpu(info_key, &disk_key);
 		ptr += sizeof(struct btrfs_tree_block_info);
 		BUG_ON(ptr > end);
 	} else {
@@ -447,7 +630,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
 		switch (type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			ret = __add_prelim_ref(prefs, 0, info_key,
+			ret = __add_prelim_ref(prefs, 0, NULL,
 					       *info_level + 1, offset,
 					       bytenr, 1);
 			break;
@@ -462,8 +645,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			ret = __add_prelim_ref(prefs, offset, info_key,
-					       *info_level + 1, 0, bytenr, 1);
+			ret = __add_prelim_ref(prefs, offset, NULL,
+					       *info_level + 1, 0,
+					       bytenr, 1);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
@@ -477,8 +661,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
 			root = btrfs_extent_data_ref_root(leaf, dref);
-			ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
-					       count);
+			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+					       bytenr, count);
 			break;
 		}
 		default:
@@ -496,8 +680,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
  */
 static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			    struct btrfs_path *path, u64 bytenr,
-			    struct btrfs_key *info_key, int info_level,
-			    struct list_head *prefs)
+			    int info_level, struct list_head *prefs)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
@@ -527,7 +710,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 
 		switch (key.type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
-			ret = __add_prelim_ref(prefs, 0, info_key,
+			ret = __add_prelim_ref(prefs, 0, NULL,
 					       info_level + 1, key.offset,
 					       bytenr, 1);
 			break;
@@ -543,8 +726,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
-			ret = __add_prelim_ref(prefs, key.offset, info_key,
-					       info_level + 1, 0, bytenr, 1);
+			ret = __add_prelim_ref(prefs, key.offset, NULL,
+					       info_level + 1, 0,
+					       bytenr, 1);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
@@ -560,7 +744,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
-					bytenr, count);
+					       bytenr, count);
 			break;
 		}
 		default:
@@ -582,11 +766,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
-			     u64 seq, struct ulist *refs, struct ulist *roots)
+			     u64 delayed_ref_seq, u64 time_seq,
+			     struct ulist *refs, struct ulist *roots,
+			     const u64 *extent_item_pos)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
-	struct btrfs_key info_key = { 0 };
 	struct btrfs_delayed_ref_root *delayed_refs = NULL;
 	struct btrfs_delayed_ref_head *head;
 	int info_level = 0;
@@ -645,7 +830,7 @@ again:
 			btrfs_put_delayed_ref(&head->node);
 			goto again;
 		}
-		ret = __add_delayed_refs(head, seq, &info_key,
+		ret = __add_delayed_refs(head, delayed_ref_seq,
 					 &prefs_delayed);
 		if (ret) {
 			spin_unlock(&delayed_refs->lock);
@@ -659,16 +844,17 @@ again:
 		struct extent_buffer *leaf;
 		int slot;
 
+		path->slots[0]--;
 		leaf = path->nodes[0];
-		slot = path->slots[0] - 1;
+		slot = path->slots[0];
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid == bytenr &&
 		    key.type == BTRFS_EXTENT_ITEM_KEY) {
 			ret = __add_inline_refs(fs_info, path, bytenr,
-						&info_key, &info_level, &prefs);
+						&info_level, &prefs);
 			if (ret)
 				goto out;
-			ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
-					       info_level, &prefs);
+			ret = __add_keyed_refs(fs_info, path, bytenr,
+					       info_level, &prefs);
 			if (ret)
 				goto out;
@@ -676,21 +862,18 @@ again:
 	}
 	btrfs_release_path(path);
 
-	/*
-	 * when adding the delayed refs above, the info_key might not have
-	 * been known yet. Go over the list and replace the missing keys
-	 */
-	list_for_each_entry(ref, &prefs_delayed, list) {
-		if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
-			memcpy(&ref->key, &info_key, sizeof(ref->key));
-	}
 	list_splice_init(&prefs_delayed, &prefs);
 
+	ret = __add_missing_keys(fs_info, &prefs);
+	if (ret)
+		goto out;
+
 	ret = __merge_refs(&prefs, 1);
 	if (ret)
 		goto out;
 
-	ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs);
+	ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
+				      &prefs, extent_item_pos);
 	if (ret)
 		goto out;
 
@@ -709,7 +892,33 @@ again:
 			BUG_ON(ret < 0);
 		}
 		if (ref->count && ref->parent) {
-			ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+			struct extent_inode_elem *eie = NULL;
+			if (extent_item_pos && !ref->inode_list) {
+				u32 bsz;
+				struct extent_buffer *eb;
+				bsz = btrfs_level_size(fs_info->extent_root,
+							info_level);
+				eb = read_tree_block(fs_info->extent_root,
+							ref->parent, bsz, 0);
+				BUG_ON(!eb);
+				ret = find_extent_in_eb(eb, bytenr,
+							*extent_item_pos, &eie);
+				ref->inode_list = eie;
+				free_extent_buffer(eb);
+			}
+			ret = ulist_add_merge(refs, ref->parent,
+					      (unsigned long)ref->inode_list,
+					      (unsigned long *)&eie, GFP_NOFS);
+			if (!ret && extent_item_pos) {
+				/*
+				 * we've recorded that parent, so we must extend
+				 * its inode list here
+				 */
+				BUG_ON(!eie);
+				while (eie->next)
+					eie = eie->next;
+				eie->next = ref->inode_list;
+			}
 			BUG_ON(ret < 0);
 		}
 		kfree(ref);
@@ -734,6 +943,28 @@ out:
 	return ret;
 }
 
+static void free_leaf_list(struct ulist *blocks)
+{
+	struct ulist_node *node = NULL;
+	struct extent_inode_elem *eie;
+	struct extent_inode_elem *eie_next;
+	struct ulist_iterator uiter;
+
+	ULIST_ITER_INIT(&uiter);
+	while ((node = ulist_next(blocks, &uiter))) {
+		if (!node->aux)
+			continue;
+		eie = (struct extent_inode_elem *)node->aux;
+		for (; eie; eie = eie_next) {
+			eie_next = eie->next;
+			kfree(eie);
+		}
+		node->aux = 0;
+	}
+
+	ulist_free(blocks);
+}
+
 /*
  * Finds all leafs with a reference to the specified combination of bytenr and
  * offset. key_list_head will point to a list of corresponding keys (caller must
@@ -744,7 +975,9 @@ out:
  */
 static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 				struct btrfs_fs_info *fs_info, u64 bytenr,
-				u64 num_bytes, u64 seq, struct ulist **leafs)
+				u64 delayed_ref_seq, u64 time_seq,
+				struct ulist **leafs,
+				const u64 *extent_item_pos)
 {
 	struct ulist *tmp;
 	int ret;
@@ -758,11 +991,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	}
 
-	ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+	ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+				time_seq, *leafs, tmp, extent_item_pos);
 	ulist_free(tmp);
 
 	if (ret < 0 && ret != -ENOENT) {
-		ulist_free(*leafs);
+		free_leaf_list(*leafs);
 		return ret;
 	}
 
@@ -784,10 +1018,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
  */
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 bytenr,
-			 u64 num_bytes, u64 seq, struct ulist **roots)
+			 u64 delayed_ref_seq, u64 time_seq,
+			 struct ulist **roots)
 {
 	struct ulist *tmp;
 	struct ulist_node *node = NULL;
+	struct ulist_iterator uiter;
 	int ret;
 
 	tmp = ulist_alloc(GFP_NOFS);
@@ -799,15 +1035,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	}
 
+	ULIST_ITER_INIT(&uiter);
 	while (1) {
-		ret = find_parent_nodes(trans, fs_info, bytenr, seq,
-					tmp, *roots);
+		ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+					time_seq, tmp, *roots, NULL);
 		if (ret < 0 && ret != -ENOENT) {
 			ulist_free(tmp);
 			ulist_free(*roots);
 			return ret;
 		}
-		node = ulist_next(tmp, node);
+		node = ulist_next(tmp, &uiter);
 		if (!node)
 			break;
 		bytenr = node->val;
@@ -1093,67 +1330,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 	return 0;
 }
 
-static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical,
-				u64 orig_extent_item_objectid,
-				u64 extent_item_pos, u64 root,
+static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
+				u64 root, u64 extent_item_objectid,
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
-	u64 disk_byte;
-	struct btrfs_key key;
-	struct btrfs_file_extent_item *fi;
-	struct extent_buffer *eb;
-	int slot;
-	int nritems;
+	struct extent_inode_elem *eie;
 	int ret = 0;
-	int extent_type;
-	u64 data_offset;
-	u64 data_len;
-
-	eb = read_tree_block(fs_info->tree_root, logical,
-			     fs_info->tree_root->leafsize, 0);
-	if (!eb)
-		return -EIO;
-
-	/*
-	 * from the shared data ref, we only have the leaf but we need
-	 * the key. thus, we must look into all items and see that we
-	 * find one (some) with a reference to our extent item.
-	 */
-	nritems = btrfs_header_nritems(eb);
-	for (slot = 0; slot < nritems; ++slot) {
-		btrfs_item_key_to_cpu(eb, &key, slot);
-		if (key.type != BTRFS_EXTENT_DATA_KEY)
-			continue;
-		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
-		extent_type = btrfs_file_extent_type(eb, fi);
-		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
-			continue;
-		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
-		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
-		if (disk_byte != orig_extent_item_objectid)
-			continue;
-
-		data_offset = btrfs_file_extent_offset(eb, fi);
-		data_len = btrfs_file_extent_num_bytes(eb, fi);
-
-		if (extent_item_pos < data_offset ||
-		    extent_item_pos >= data_offset + data_len)
-			continue;
 
+	for (eie = inode_list; eie; eie = eie->next) {
 		pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
-			 "root %llu\n", orig_extent_item_objectid,
-			 key.objectid, key.offset, root);
-		ret = iterate(key.objectid,
-			      key.offset + (extent_item_pos - data_offset),
-			      root, ctx);
+			 "root %llu\n", extent_item_objectid,
+			 eie->inum, eie->offset, root);
+		ret = iterate(eie->inum, eie->offset, root, ctx);
 		if (ret) {
-			pr_debug("stopping iteration because ret=%d\n", ret);
+			pr_debug("stopping iteration for %llu due to ret=%d\n",
+				 extent_item_objectid, ret);
 			break;
 		}
 	}
 
-	free_extent_buffer(eb);
-
 	return ret;
 }
 
@@ -1175,7 +1370,10 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	struct ulist *roots = NULL;
 	struct ulist_node *ref_node = NULL;
 	struct ulist_node *root_node = NULL;
-	struct seq_list seq_elem;
+	struct seq_list seq_elem = {};
+	struct seq_list tree_mod_seq_elem = {};
+	struct ulist_iterator ref_uiter;
+	struct ulist_iterator root_uiter;
 	struct btrfs_delayed_ref_root *delayed_refs = NULL;
 
 	pr_debug("resolving all inodes for extent %llu\n",
@@ -1192,34 +1390,41 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 		spin_lock(&delayed_refs->lock);
 		btrfs_get_delayed_seq(delayed_refs, &seq_elem);
 		spin_unlock(&delayed_refs->lock);
+		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 	}
 
 	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
-				   extent_item_pos, seq_elem.seq,
-				   &refs);
-
+				   seq_elem.seq, tree_mod_seq_elem.seq, &refs,
+				   &extent_item_pos);
 	if (ret)
 		goto out;
 
-	while (!ret && (ref_node = ulist_next(refs, ref_node))) {
-		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
-					   seq_elem.seq, &roots);
+	ULIST_ITER_INIT(&ref_uiter);
+	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
+		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
+					   seq_elem.seq,
+					   tree_mod_seq_elem.seq, &roots);
 		if (ret)
 			break;
-		while (!ret && (root_node = ulist_next(roots, root_node))) {
-			pr_debug("root %llu references leaf %llu\n",
-				 root_node->val, ref_node->val);
-			ret = iterate_leaf_refs(fs_info, ref_node->val,
-						extent_item_objectid,
-						extent_item_pos, root_node->val,
-						iterate, ctx);
+		ULIST_ITER_INIT(&root_uiter);
+		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
+			pr_debug("root %llu references leaf %llu, data list "
+				 "%#lx\n", root_node->val, ref_node->val,
+				 ref_node->aux);
+			ret = iterate_leaf_refs(
+				(struct extent_inode_elem *)ref_node->aux,
+				root_node->val, extent_item_objectid,
+				iterate, ctx);
 		}
+		ulist_free(roots);
+		roots = NULL;
 	}
 
-	ulist_free(refs);
+	free_leaf_list(refs);
 	ulist_free(roots);
 out:
 	if (!search_commit_root) {
+		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 		btrfs_put_delayed_seq(delayed_refs, &seq_elem);
 		btrfs_end_transaction(trans, fs_info->extent_root);
 	}
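(With the inode lists now carried on ref_node->aux, iterate_extent_inodes() ends up invoking the iterate callback once per (inum, offset, root) triple, as iterate_leaf_refs() above shows. A hypothetical example callback matching the iterate_extent_inodes_t signature from backref.h:)

static int count_refs(u64 inum, u64 offset, u64 root, void *ctx)
{
	u64 *counter = ctx;

	(*counter)++;
	pr_debug("inum %llu offset %llu root %llu\n", inum, offset, root);
	return 0;	/* a nonzero return stops the iteration */
}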
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 57ea2e959e4d..c18d8ac7b795 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 bytenr,
-			 u64 num_bytes, u64 seq, struct ulist **roots);
+			 u64 delayed_ref_seq, u64 time_seq,
+			 struct ulist **roots);
 
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9b9b15fd5204..e616f8872e69 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -24,6 +24,20 @@
 #include "ordered-data.h"
 #include "delayed-inode.h"
 
+/*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
+#define BTRFS_INODE_ORDERED_DATA_CLOSE		0
+#define BTRFS_INODE_ORPHAN_META_RESERVED	1
+#define BTRFS_INODE_DUMMY			2
+#define BTRFS_INODE_IN_DEFRAG			3
+#define BTRFS_INODE_DELALLOC_META_RESERVED	4
+#define BTRFS_INODE_HAS_ORPHAN_ITEM		5
+
 /* in memory btrfs inode */
 struct btrfs_inode {
 	/* which subvolume this inode belongs to */
@@ -57,9 +71,6 @@ struct btrfs_inode {
 	/* used to order data wrt metadata */
 	struct btrfs_ordered_inode_tree ordered_tree;
 
-	/* for keeping track of orphaned inodes */
-	struct list_head i_orphan;
-
 	/* list of all the delalloc inodes in the FS. There are times we need
 	 * to write all the delalloc pages to disk, and this list is used
 	 * to walk them all.
@@ -78,14 +89,13 @@ struct btrfs_inode {
 	/* the space_info for where this inode's data allocations are done */
 	struct btrfs_space_info *space_info;
 
+	unsigned long runtime_flags;
+
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
 	 * enough field for this.
 	 */
 	u64 generation;
 
-	/* sequence number for NFS changes */
-	u64 sequence;
-
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
@@ -145,22 +155,9 @@ struct btrfs_inode {
 	unsigned reserved_extents;
 
 	/*
-	 * ordered_data_close is set by truncate when a file that used
-	 * to have good data has been truncated to zero. When it is set
-	 * the btrfs file release call will add this inode to the
-	 * ordered operations list so that we make sure to flush out any
-	 * new data the application may have written before commit.
-	 */
-	unsigned ordered_data_close:1;
-	unsigned orphan_meta_reserved:1;
-	unsigned dummy_inode:1;
-	unsigned in_defrag:1;
-	unsigned delalloc_meta_reserved:1;
-
-	/*
 	 * always compress this one file
 	 */
-	unsigned force_compress:4;
+	unsigned force_compress;
 
 	struct btrfs_delayed_node *delayed_node;
 
@@ -202,4 +199,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
 	return false;
 }
 
+static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = 0;
+
+	mutex_lock(&root->log_mutex);
+	if (BTRFS_I(inode)->logged_trans == generation &&
+	    BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+		ret = 1;
+	mutex_unlock(&root->log_mutex);
+	return ret;
+}
+
 #endif
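(Sketch of how the converted flags are used, per the shortlog entry "convert the inode bit field to use the actual bit operations": the former :1 bitfields become bit numbers operated on atomically against runtime_flags with the regular kernel bitops. The helper names below are illustrative, not from the patch:)

static void mark_inode_in_defrag(struct inode *inode)
{
	/* atomic replacement for the old "in_defrag:1" bitfield store */
	set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
}

static int inode_in_defrag(struct inode *inode)
{
	return test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
}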
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index c053e90f2006..9cebb1fd6a3c 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -103,8 +103,6 @@
 #define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
 #define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)	/* in characters,
							 * excluding " [...]" */
-#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
-
 #define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
 
 /*
@@ -210,8 +208,9 @@ struct btrfsic_block_data_ctx {
 	u64 dev_bytenr;		/* physical bytenr on device */
 	u32 len;
 	struct btrfsic_dev_state *dev;
-	char *data;
-	struct buffer_head *bh;	/* do not use if set to NULL */
+	char **datav;
+	struct page **pagev;
+	void *mem_to_free;
 };
 
 /* This structure is used to implement recursion without occupying
243 struct btrfs_root *root; 242 struct btrfs_root *root;
244 u64 max_superblock_generation; 243 u64 max_superblock_generation;
245 struct btrfsic_block *latest_superblock; 244 struct btrfsic_block *latest_superblock;
245 u32 metablock_size;
246 u32 datablock_size;
246}; 247};
247 248
248static void btrfsic_block_init(struct btrfsic_block *b); 249static void btrfsic_block_init(struct btrfsic_block *b);
@@ -290,8 +291,10 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 static int btrfsic_process_metablock(struct btrfsic_state *state,
 				     struct btrfsic_block *block,
 				     struct btrfsic_block_data_ctx *block_ctx,
-				     struct btrfs_header *hdr,
 				     int limit_nesting, int force_iodone_flag);
+static void btrfsic_read_from_block_data(
+	struct btrfsic_block_data_ctx *block_ctx,
+	void *dst, u32 offset, size_t len);
 static int btrfsic_create_link_to_next_block(
 		struct btrfsic_state *state,
 		struct btrfsic_block *block,
@@ -318,12 +321,13 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
 static int btrfsic_read_block(struct btrfsic_state *state,
 			      struct btrfsic_block_data_ctx *block_ctx);
 static void btrfsic_dump_database(struct btrfsic_state *state);
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
 static int btrfsic_test_for_metadata(struct btrfsic_state *state,
-				     const u8 *data, unsigned int size);
+				     char **datav, unsigned int num_pages);
 static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
-					  u64 dev_bytenr, u8 *mapped_data,
-					  unsigned int len, struct bio *bio,
-					  int *bio_is_patched,
+					  u64 dev_bytenr, char **mapped_datav,
+					  unsigned int num_pages,
+					  struct bio *bio, int *bio_is_patched,
 					  struct buffer_head *bh,
 					  int submit_bio_bh_rw);
 static int btrfsic_process_written_superblock(
@@ -375,7 +379,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
 static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
 					   u64 bytenr,
 					   struct btrfsic_dev_state *dev_state,
-					   u64 dev_bytenr, char *data);
+					   u64 dev_bytenr);
 
 static struct mutex btrfsic_mutex;
 static int btrfsic_is_initialized;
@@ -651,7 +655,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 	int pass;
 
 	BUG_ON(NULL == state);
-	selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
+	selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
 	if (NULL == selected_super) {
 		printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
 		return -1;
@@ -718,7 +722,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 
 		num_copies =
 		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
-				     next_bytenr, PAGE_SIZE);
+				     next_bytenr, state->metablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
 			       (unsigned long long)next_bytenr, num_copies);
@@ -727,9 +731,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 			struct btrfsic_block *next_block;
 			struct btrfsic_block_data_ctx tmp_next_block_ctx;
 			struct btrfsic_block_link *l;
-			struct btrfs_header *hdr;
 
-			ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+			ret = btrfsic_map_block(state, next_bytenr,
+						state->metablock_size,
 						&tmp_next_block_ctx,
 						mirror_num);
 			if (ret) {
@@ -758,7 +762,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 			BUG_ON(NULL == l);
 
 			ret = btrfsic_read_block(state, &tmp_next_block_ctx);
-			if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+			if (ret < (int)PAGE_CACHE_SIZE) {
 				printk(KERN_INFO
 				       "btrfsic: read @logical %llu failed!\n",
 				       (unsigned long long)
@@ -768,11 +772,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 				return -1;
 			}
 
-			hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
 			ret = btrfsic_process_metablock(state,
 							next_block,
 							&tmp_next_block_ctx,
-							hdr,
 							BTRFS_MAX_LEVEL + 3, 1);
 			btrfsic_release_block_ctx(&tmp_next_block_ctx);
 		}
@@ -799,7 +801,10 @@ static int btrfsic_process_superblock_dev_mirror(
 
 	/* super block bytenr is always the unmapped device bytenr */
 	dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
-	bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
+	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+		return -1;
+	bh = __bread(superblock_bdev, dev_bytenr / 4096,
+		     BTRFS_SUPER_INFO_SIZE);
 	if (NULL == bh)
 		return -1;
 	super_tmp = (struct btrfs_super_block *)
@@ -808,7 +813,10 @@ static int btrfsic_process_superblock_dev_mirror(
 	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
 	    strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
 		    sizeof(super_tmp->magic)) ||
-	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
+	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
+	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
+	    btrfs_super_leafsize(super_tmp) != state->metablock_size ||
+	    btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
 		brelse(bh);
 		return 0;
 	}
@@ -893,7 +901,7 @@ static int btrfsic_process_superblock_dev_mirror(
 
 		num_copies =
 		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
-				     next_bytenr, PAGE_SIZE);
+				     next_bytenr, state->metablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
 			       (unsigned long long)next_bytenr, num_copies);
@@ -902,7 +910,8 @@ static int btrfsic_process_superblock_dev_mirror(
902 struct btrfsic_block_data_ctx tmp_next_block_ctx; 910 struct btrfsic_block_data_ctx tmp_next_block_ctx;
903 struct btrfsic_block_link *l; 911 struct btrfsic_block_link *l;
904 912
905 if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, 913 if (btrfsic_map_block(state, next_bytenr,
914 state->metablock_size,
906 &tmp_next_block_ctx, 915 &tmp_next_block_ctx,
907 mirror_num)) { 916 mirror_num)) {
908 printk(KERN_INFO "btrfsic: btrfsic_map_block(" 917 printk(KERN_INFO "btrfsic: btrfsic_map_block("
@@ -966,13 +975,15 @@ static int btrfsic_process_metablock(
966 struct btrfsic_state *state, 975 struct btrfsic_state *state,
967 struct btrfsic_block *const first_block, 976 struct btrfsic_block *const first_block,
968 struct btrfsic_block_data_ctx *const first_block_ctx, 977 struct btrfsic_block_data_ctx *const first_block_ctx,
969 struct btrfs_header *const first_hdr,
970 int first_limit_nesting, int force_iodone_flag) 978 int first_limit_nesting, int force_iodone_flag)
971{ 979{
972 struct btrfsic_stack_frame initial_stack_frame = { 0 }; 980 struct btrfsic_stack_frame initial_stack_frame = { 0 };
973 struct btrfsic_stack_frame *sf; 981 struct btrfsic_stack_frame *sf;
974 struct btrfsic_stack_frame *next_stack; 982 struct btrfsic_stack_frame *next_stack;
983 struct btrfs_header *const first_hdr =
984 (struct btrfs_header *)first_block_ctx->datav[0];
975 985
986 BUG_ON(!first_hdr);
976 sf = &initial_stack_frame; 987 sf = &initial_stack_frame;
977 sf->error = 0; 988 sf->error = 0;
978 sf->i = -1; 989 sf->i = -1;
@@ -1012,21 +1023,47 @@ continue_with_current_leaf_stack_frame:
1012 } 1023 }
1013 1024
1014 if (sf->i < sf->nr) { 1025 if (sf->i < sf->nr) {
1015 struct btrfs_item *disk_item = leafhdr->items + sf->i; 1026 struct btrfs_item disk_item;
1016 struct btrfs_disk_key *disk_key = &disk_item->key; 1027 u32 disk_item_offset =
1028 (uintptr_t)(leafhdr->items + sf->i) -
1029 (uintptr_t)leafhdr;
1030 struct btrfs_disk_key *disk_key;
1017 u8 type; 1031 u8 type;
1018 const u32 item_offset = le32_to_cpu(disk_item->offset); 1032 u32 item_offset;
1019 1033
1034 if (disk_item_offset + sizeof(struct btrfs_item) >
1035 sf->block_ctx->len) {
 1036leaf_item_out_of_bounds_error:
 1037 printk(KERN_INFO
 1038 "btrfsic: leaf item out of bounds at logical %llu, dev %s\n",
1039 sf->block_ctx->start,
1040 sf->block_ctx->dev->name);
1041 goto one_stack_frame_backwards;
1042 }
1043 btrfsic_read_from_block_data(sf->block_ctx,
1044 &disk_item,
1045 disk_item_offset,
1046 sizeof(struct btrfs_item));
1047 item_offset = le32_to_cpu(disk_item.offset);
1048 disk_key = &disk_item.key;
1020 type = disk_key->type; 1049 type = disk_key->type;
1021 1050
1022 if (BTRFS_ROOT_ITEM_KEY == type) { 1051 if (BTRFS_ROOT_ITEM_KEY == type) {
1023 const struct btrfs_root_item *const root_item = 1052 struct btrfs_root_item root_item;
1024 (struct btrfs_root_item *) 1053 u32 root_item_offset;
1025 (sf->block_ctx->data + 1054 u64 next_bytenr;
1026 offsetof(struct btrfs_leaf, items) + 1055
1027 item_offset); 1056 root_item_offset = item_offset +
1028 const u64 next_bytenr = 1057 offsetof(struct btrfs_leaf, items);
1029 le64_to_cpu(root_item->bytenr); 1058 if (root_item_offset +
1059 sizeof(struct btrfs_root_item) >
1060 sf->block_ctx->len)
 1061 goto leaf_item_out_of_bounds_error;
1062 btrfsic_read_from_block_data(
1063 sf->block_ctx, &root_item,
1064 root_item_offset,
1065 sizeof(struct btrfs_root_item));
1066 next_bytenr = le64_to_cpu(root_item.bytenr);
1030 1067
1031 sf->error = 1068 sf->error =
1032 btrfsic_create_link_to_next_block( 1069 btrfsic_create_link_to_next_block(
@@ -1041,7 +1078,7 @@ continue_with_current_leaf_stack_frame:
1041 &sf->num_copies, 1078 &sf->num_copies,
1042 &sf->mirror_num, 1079 &sf->mirror_num,
1043 disk_key, 1080 disk_key,
1044 le64_to_cpu(root_item-> 1081 le64_to_cpu(root_item.
1045 generation)); 1082 generation));
1046 if (sf->error) 1083 if (sf->error)
1047 goto one_stack_frame_backwards; 1084 goto one_stack_frame_backwards;
@@ -1049,7 +1086,7 @@ continue_with_current_leaf_stack_frame:
1049 if (NULL != sf->next_block) { 1086 if (NULL != sf->next_block) {
1050 struct btrfs_header *const next_hdr = 1087 struct btrfs_header *const next_hdr =
1051 (struct btrfs_header *) 1088 (struct btrfs_header *)
1052 sf->next_block_ctx.data; 1089 sf->next_block_ctx.datav[0];
1053 1090
1054 next_stack = 1091 next_stack =
1055 btrfsic_stack_frame_alloc(); 1092 btrfsic_stack_frame_alloc();
@@ -1111,10 +1148,24 @@ continue_with_current_node_stack_frame:
1111 } 1148 }
1112 1149
1113 if (sf->i < sf->nr) { 1150 if (sf->i < sf->nr) {
1114 struct btrfs_key_ptr *disk_key_ptr = 1151 struct btrfs_key_ptr key_ptr;
1115 nodehdr->ptrs + sf->i; 1152 u32 key_ptr_offset;
1116 const u64 next_bytenr = 1153 u64 next_bytenr;
1117 le64_to_cpu(disk_key_ptr->blockptr); 1154
1155 key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
1156 (uintptr_t)nodehdr;
1157 if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
1158 sf->block_ctx->len) {
1159 printk(KERN_INFO
1160 "btrfsic: node item out of bounce at logical %llu, dev %s\n",
1161 sf->block_ctx->start,
1162 sf->block_ctx->dev->name);
1163 goto one_stack_frame_backwards;
1164 }
1165 btrfsic_read_from_block_data(
1166 sf->block_ctx, &key_ptr, key_ptr_offset,
1167 sizeof(struct btrfs_key_ptr));
1168 next_bytenr = le64_to_cpu(key_ptr.blockptr);
1118 1169
1119 sf->error = btrfsic_create_link_to_next_block( 1170 sf->error = btrfsic_create_link_to_next_block(
1120 state, 1171 state,
@@ -1127,15 +1178,15 @@ continue_with_current_node_stack_frame:
1127 force_iodone_flag, 1178 force_iodone_flag,
1128 &sf->num_copies, 1179 &sf->num_copies,
1129 &sf->mirror_num, 1180 &sf->mirror_num,
1130 &disk_key_ptr->key, 1181 &key_ptr.key,
1131 le64_to_cpu(disk_key_ptr->generation)); 1182 le64_to_cpu(key_ptr.generation));
1132 if (sf->error) 1183 if (sf->error)
1133 goto one_stack_frame_backwards; 1184 goto one_stack_frame_backwards;
1134 1185
1135 if (NULL != sf->next_block) { 1186 if (NULL != sf->next_block) {
1136 struct btrfs_header *const next_hdr = 1187 struct btrfs_header *const next_hdr =
1137 (struct btrfs_header *) 1188 (struct btrfs_header *)
1138 sf->next_block_ctx.data; 1189 sf->next_block_ctx.datav[0];
1139 1190
1140 next_stack = btrfsic_stack_frame_alloc(); 1191 next_stack = btrfsic_stack_frame_alloc();
1141 if (NULL == next_stack) 1192 if (NULL == next_stack)
@@ -1181,6 +1232,35 @@ one_stack_frame_backwards:
1181 return sf->error; 1232 return sf->error;
1182} 1233}
1183 1234
1235static void btrfsic_read_from_block_data(
1236 struct btrfsic_block_data_ctx *block_ctx,
1237 void *dstv, u32 offset, size_t len)
1238{
1239 size_t cur;
1240 size_t offset_in_page;
1241 char *kaddr;
1242 char *dst = (char *)dstv;
1243 size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
1244 unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
1245
1246 WARN_ON(offset + len > block_ctx->len);
1247 offset_in_page = (start_offset + offset) &
1248 ((unsigned long)PAGE_CACHE_SIZE - 1);
1249
1250 while (len > 0) {
1251 cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
1252 BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
1253 PAGE_CACHE_SHIFT);
1254 kaddr = block_ctx->datav[i];
1255 memcpy(dst, kaddr + offset_in_page, cur);
1256
1257 dst += cur;
1258 len -= cur;
1259 offset_in_page = 0;
1260 i++;
1261 }
1262}
1263
1184static int btrfsic_create_link_to_next_block( 1264static int btrfsic_create_link_to_next_block(
1185 struct btrfsic_state *state, 1265 struct btrfsic_state *state,
1186 struct btrfsic_block *block, 1266 struct btrfsic_block *block,
@@ -1204,7 +1284,7 @@ static int btrfsic_create_link_to_next_block(
1204 if (0 == *num_copiesp) { 1284 if (0 == *num_copiesp) {
1205 *num_copiesp = 1285 *num_copiesp =
1206 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1286 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1207 next_bytenr, PAGE_SIZE); 1287 next_bytenr, state->metablock_size);
1208 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1288 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1209 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 1289 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1210 (unsigned long long)next_bytenr, *num_copiesp); 1290 (unsigned long long)next_bytenr, *num_copiesp);
@@ -1219,7 +1299,7 @@ static int btrfsic_create_link_to_next_block(
1219 "btrfsic_create_link_to_next_block(mirror_num=%d)\n", 1299 "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
1220 *mirror_nump); 1300 *mirror_nump);
1221 ret = btrfsic_map_block(state, next_bytenr, 1301 ret = btrfsic_map_block(state, next_bytenr,
1222 BTRFSIC_BLOCK_SIZE, 1302 state->metablock_size,
1223 next_block_ctx, *mirror_nump); 1303 next_block_ctx, *mirror_nump);
1224 if (ret) { 1304 if (ret) {
1225 printk(KERN_INFO 1305 printk(KERN_INFO
@@ -1314,7 +1394,7 @@ static int btrfsic_create_link_to_next_block(
1314 1394
1315 if (limit_nesting > 0 && did_alloc_block_link) { 1395 if (limit_nesting > 0 && did_alloc_block_link) {
1316 ret = btrfsic_read_block(state, next_block_ctx); 1396 ret = btrfsic_read_block(state, next_block_ctx);
1317 if (ret < (int)BTRFSIC_BLOCK_SIZE) { 1397 if (ret < (int)next_block_ctx->len) {
1318 printk(KERN_INFO 1398 printk(KERN_INFO
1319 "btrfsic: read block @logical %llu failed!\n", 1399 "btrfsic: read block @logical %llu failed!\n",
1320 (unsigned long long)next_bytenr); 1400 (unsigned long long)next_bytenr);
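
The new btrfsic_read_from_block_data() above replaces direct pointer casts into one contiguous buffer with a copy that may straddle page boundaries: the block can start at a sub-page offset, the first chunk is capped at the remainder of that page, and every later page is consumed from byte 0. A user-space sketch of the same walk, assuming 4 KiB pages; the names are illustrative, not the kernel API:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define PAGE_SZ 4096UL

	/* Copy len bytes at byte 'offset' inside a block backed by a vector
	 * of page-sized buffers; the block may begin mid-page. */
	static void read_from_pagev(char **pagev, uint64_t block_start,
				    void *dstv, uint32_t offset, size_t len)
	{
		char *dst = dstv;
		size_t start_offset = block_start % PAGE_SZ;
		size_t page = (start_offset + offset) / PAGE_SZ;
		size_t off_in_page = (start_offset + offset) % PAGE_SZ;

		while (len > 0) {
			size_t cur = len < PAGE_SZ - off_in_page ?
				     len : PAGE_SZ - off_in_page;

			memcpy(dst, pagev[page] + off_in_page, cur);
			dst += cur;
			len -= cur;
			off_in_page = 0;	/* later pages start at byte 0 */
			page++;
		}
	}

	int main(void)
	{
		static char p0[PAGE_SZ], p1[PAGE_SZ];
		char *pagev[] = { p0, p1 };
		char out[8];

		memset(p0, 'A', sizeof(p0));
		memset(p1, 'B', sizeof(p1));
		/* a read crossing the page seam yields AAAABBBB */
		read_from_pagev(pagev, 0, out, PAGE_SZ - 4, sizeof(out));
		printf("%.8s\n", out);
		return 0;
	}
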
@@ -1339,43 +1419,74 @@ static int btrfsic_handle_extent_data(
1339 u32 item_offset, int force_iodone_flag) 1419 u32 item_offset, int force_iodone_flag)
1340{ 1420{
1341 int ret; 1421 int ret;
1342 struct btrfs_file_extent_item *file_extent_item = 1422 struct btrfs_file_extent_item file_extent_item;
1343 (struct btrfs_file_extent_item *)(block_ctx->data + 1423 u64 file_extent_item_offset;
1344 offsetof(struct btrfs_leaf, 1424 u64 next_bytenr;
1345 items) + item_offset); 1425 u64 num_bytes;
1346 u64 next_bytenr = 1426 u64 generation;
1347 le64_to_cpu(file_extent_item->disk_bytenr) +
1348 le64_to_cpu(file_extent_item->offset);
1349 u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
1350 u64 generation = le64_to_cpu(file_extent_item->generation);
1351 struct btrfsic_block_link *l; 1427 struct btrfsic_block_link *l;
1352 1428
1429 file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
1430 item_offset;
1431 if (file_extent_item_offset +
1432 offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
1433 block_ctx->len) {
1434 printk(KERN_INFO
1435 "btrfsic: file item out of bounce at logical %llu, dev %s\n",
1436 block_ctx->start, block_ctx->dev->name);
1437 return -1;
1438 }
1439
1440 btrfsic_read_from_block_data(block_ctx, &file_extent_item,
1441 file_extent_item_offset,
1442 offsetof(struct btrfs_file_extent_item, disk_num_bytes));
1443 if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
1444 ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) {
1445 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1446 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n",
1447 file_extent_item.type,
1448 (unsigned long long)
1449 le64_to_cpu(file_extent_item.disk_bytenr));
1450 return 0;
1451 }
1452
1453 if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
1454 block_ctx->len) {
1455 printk(KERN_INFO
1456 "btrfsic: file item out of bounce at logical %llu, dev %s\n",
1457 block_ctx->start, block_ctx->dev->name);
1458 return -1;
1459 }
1460 btrfsic_read_from_block_data(block_ctx, &file_extent_item,
1461 file_extent_item_offset,
1462 sizeof(struct btrfs_file_extent_item));
1463 next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) +
1464 le64_to_cpu(file_extent_item.offset);
1465 generation = le64_to_cpu(file_extent_item.generation);
1466 num_bytes = le64_to_cpu(file_extent_item.num_bytes);
1468
1353 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) 1469 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1354 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," 1470 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
1355 " offset = %llu, num_bytes = %llu\n", 1471 " offset = %llu, num_bytes = %llu\n",
1356 file_extent_item->type, 1472 file_extent_item.type,
1357 (unsigned long long) 1473 (unsigned long long)
1358 le64_to_cpu(file_extent_item->disk_bytenr), 1474 le64_to_cpu(file_extent_item.disk_bytenr),
1359 (unsigned long long) 1475 (unsigned long long)le64_to_cpu(file_extent_item.offset),
1360 le64_to_cpu(file_extent_item->offset), 1476 (unsigned long long)num_bytes);
1361 (unsigned long long)
1362 le64_to_cpu(file_extent_item->num_bytes));
1363 if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
1364 ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
1365 return 0;
1366 while (num_bytes > 0) { 1477 while (num_bytes > 0) {
1367 u32 chunk_len; 1478 u32 chunk_len;
1368 int num_copies; 1479 int num_copies;
1369 int mirror_num; 1480 int mirror_num;
1370 1481
1371 if (num_bytes > BTRFSIC_BLOCK_SIZE) 1482 if (num_bytes > state->datablock_size)
1372 chunk_len = BTRFSIC_BLOCK_SIZE; 1483 chunk_len = state->datablock_size;
1373 else 1484 else
1374 chunk_len = num_bytes; 1485 chunk_len = num_bytes;
1375 1486
1376 num_copies = 1487 num_copies =
1377 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1488 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1378 next_bytenr, PAGE_SIZE); 1489 next_bytenr, state->datablock_size);
1379 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1490 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1380 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 1491 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1381 (unsigned long long)next_bytenr, num_copies); 1492 (unsigned long long)next_bytenr, num_copies);
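
btrfsic_handle_extent_data() above now parses the on-disk item in two bounds-checked stages: it first copies only the prefix up to disk_num_bytes, enough to read the extent type and return early for anything but a regular extent, and only then validates and copies the whole btrfs_file_extent_item. The same pattern in isolation, over a made-up record layout (the struct below is illustrative, not the btrfs format):

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/* illustrative record, not the btrfs on-disk layout */
	struct rec {
		uint8_t  type;
		uint64_t payload_start;
		uint64_t payload_len;	/* only meaningful for type == 1 */
	};

	/* Parse a rec at 'off' inside a buffer of 'buf_len' bytes.
	 * Returns -1 on a bounds violation, 0 when the record is ignorable,
	 * 1 when *out holds a fully parsed record. */
	static int parse_rec(const char *buf, size_t buf_len, size_t off,
			     struct rec *out)
	{
		/* stage 1: only the prefix needed to decide the type */
		if (off + offsetof(struct rec, payload_len) > buf_len)
			return -1;
		memcpy(out, buf + off, offsetof(struct rec, payload_len));
		if (out->type != 1)
			return 0;	/* nothing further to validate */

		/* stage 2: the full record, bounds-checked again */
		if (off + sizeof(struct rec) > buf_len)
			return -1;
		memcpy(out, buf + off, sizeof(struct rec));
		return 1;
	}

	int main(void)
	{
		char buf[32] = { 1 };	/* type 1 at offset 0, zero payload */
		struct rec r;

		return parse_rec(buf, sizeof(buf), 0, &r) == 1 ? 0 : 1;
	}
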
@@ -1475,8 +1586,9 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1475 block_ctx_out->dev_bytenr = multi->stripes[0].physical; 1586 block_ctx_out->dev_bytenr = multi->stripes[0].physical;
1476 block_ctx_out->start = bytenr; 1587 block_ctx_out->start = bytenr;
1477 block_ctx_out->len = len; 1588 block_ctx_out->len = len;
1478 block_ctx_out->data = NULL; 1589 block_ctx_out->datav = NULL;
1479 block_ctx_out->bh = NULL; 1590 block_ctx_out->pagev = NULL;
1591 block_ctx_out->mem_to_free = NULL;
1480 1592
1481 if (0 == ret) 1593 if (0 == ret)
1482 kfree(multi); 1594 kfree(multi);
@@ -1496,8 +1608,9 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1496 block_ctx_out->dev_bytenr = bytenr; 1608 block_ctx_out->dev_bytenr = bytenr;
1497 block_ctx_out->start = bytenr; 1609 block_ctx_out->start = bytenr;
1498 block_ctx_out->len = len; 1610 block_ctx_out->len = len;
1499 block_ctx_out->data = NULL; 1611 block_ctx_out->datav = NULL;
1500 block_ctx_out->bh = NULL; 1612 block_ctx_out->pagev = NULL;
1613 block_ctx_out->mem_to_free = NULL;
1501 if (NULL != block_ctx_out->dev) { 1614 if (NULL != block_ctx_out->dev) {
1502 return 0; 1615 return 0;
1503 } else { 1616 } else {
@@ -1508,38 +1621,127 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1508 1621
1509static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) 1622static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
1510{ 1623{
1511 if (NULL != block_ctx->bh) { 1624 if (block_ctx->mem_to_free) {
1512 brelse(block_ctx->bh); 1625 unsigned int num_pages;
1513 block_ctx->bh = NULL; 1626
1627 BUG_ON(!block_ctx->datav);
1628 BUG_ON(!block_ctx->pagev);
1629 num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
1630 PAGE_CACHE_SHIFT;
1631 while (num_pages > 0) {
1632 num_pages--;
1633 if (block_ctx->datav[num_pages]) {
1634 kunmap(block_ctx->pagev[num_pages]);
1635 block_ctx->datav[num_pages] = NULL;
1636 }
1637 if (block_ctx->pagev[num_pages]) {
1638 __free_page(block_ctx->pagev[num_pages]);
1639 block_ctx->pagev[num_pages] = NULL;
1640 }
1641 }
1642
1643 kfree(block_ctx->mem_to_free);
1644 block_ctx->mem_to_free = NULL;
1645 block_ctx->pagev = NULL;
1646 block_ctx->datav = NULL;
1514 } 1647 }
1515} 1648}
1516 1649
1517static int btrfsic_read_block(struct btrfsic_state *state, 1650static int btrfsic_read_block(struct btrfsic_state *state,
1518 struct btrfsic_block_data_ctx *block_ctx) 1651 struct btrfsic_block_data_ctx *block_ctx)
1519{ 1652{
1520 block_ctx->bh = NULL; 1653 unsigned int num_pages;
1521 if (block_ctx->dev_bytenr & 4095) { 1654 unsigned int i;
1655 u64 dev_bytenr;
1656 int ret;
1657
1658 BUG_ON(block_ctx->datav);
1659 BUG_ON(block_ctx->pagev);
1660 BUG_ON(block_ctx->mem_to_free);
1661 if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
1522 printk(KERN_INFO 1662 printk(KERN_INFO
1523 "btrfsic: read_block() with unaligned bytenr %llu\n", 1663 "btrfsic: read_block() with unaligned bytenr %llu\n",
1524 (unsigned long long)block_ctx->dev_bytenr); 1664 (unsigned long long)block_ctx->dev_bytenr);
1525 return -1; 1665 return -1;
1526 } 1666 }
1527 if (block_ctx->len > 4096) { 1667
1528 printk(KERN_INFO 1668 num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
1529 "btrfsic: read_block() with too huge size %d\n", 1669 PAGE_CACHE_SHIFT;
1530 block_ctx->len); 1670 block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
1671 sizeof(*block_ctx->pagev)) *
1672 num_pages, GFP_NOFS);
1673 if (!block_ctx->mem_to_free)
1531 return -1; 1674 return -1;
1675 block_ctx->datav = block_ctx->mem_to_free;
1676 block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
1677 for (i = 0; i < num_pages; i++) {
1678 block_ctx->pagev[i] = alloc_page(GFP_NOFS);
1679 if (!block_ctx->pagev[i])
1680 return -1;
1532 } 1681 }
1533 1682
1534 block_ctx->bh = __bread(block_ctx->dev->bdev, 1683 dev_bytenr = block_ctx->dev_bytenr;
1535 block_ctx->dev_bytenr >> 12, 4096); 1684 for (i = 0; i < num_pages;) {
1536 if (NULL == block_ctx->bh) 1685 struct bio *bio;
1537 return -1; 1686 unsigned int j;
1538 block_ctx->data = block_ctx->bh->b_data; 1687 DECLARE_COMPLETION_ONSTACK(complete);
1688
1689 bio = bio_alloc(GFP_NOFS, num_pages - i);
1690 if (!bio) {
1691 printk(KERN_INFO
1692 "btrfsic: bio_alloc() for %u pages failed!\n",
1693 num_pages - i);
1694 return -1;
1695 }
1696 bio->bi_bdev = block_ctx->dev->bdev;
1697 bio->bi_sector = dev_bytenr >> 9;
1698 bio->bi_end_io = btrfsic_complete_bio_end_io;
1699 bio->bi_private = &complete;
1700
1701 for (j = i; j < num_pages; j++) {
1702 ret = bio_add_page(bio, block_ctx->pagev[j],
1703 PAGE_CACHE_SIZE, 0);
1704 if (PAGE_CACHE_SIZE != ret)
1705 break;
1706 }
1707 if (j == i) {
1708 printk(KERN_INFO
1709 "btrfsic: error, failed to add a single page!\n");
1710 return -1;
1711 }
1712 submit_bio(READ, bio);
1713
1714 /* this will also unplug the queue */
1715 wait_for_completion(&complete);
1716
1717 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1718 printk(KERN_INFO
1719 "btrfsic: read error at logical %llu dev %s!\n",
1720 block_ctx->start, block_ctx->dev->name);
1721 bio_put(bio);
1722 return -1;
1723 }
1724 bio_put(bio);
1725 dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
1726 i = j;
1727 }
1728 for (i = 0; i < num_pages; i++) {
1729 block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
1730 if (!block_ctx->datav[i]) {
1731 printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n",
1732 block_ctx->dev->name);
1733 return -1;
1734 }
1735 }
1539 1736
1540 return block_ctx->len; 1737 return block_ctx->len;
1541} 1738}
1542 1739
1740static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
1741{
1742 complete((struct completion *)bio->bi_private);
1743}
1744
1543static void btrfsic_dump_database(struct btrfsic_state *state) 1745static void btrfsic_dump_database(struct btrfsic_state *state)
1544{ 1746{
1545 struct list_head *elem_all; 1747 struct list_head *elem_all;
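
In btrfsic_read_block() above, both pointer arrays come from a single allocation: mem_to_free holds the datav slots immediately followed by the pagev slots, which is why btrfsic_release_block_ctx() can drop both with one kfree() after unmapping and freeing the pages. A user-space sketch of that layout choice (struct and names are illustrative):

	#include <stdlib.h>

	struct block_bufs {
		void  *mem_to_free;	/* single allocation backing both arrays */
		char **datav;		/* mapped addresses, one per page */
		void **pagev;		/* page handles, one per page */
	};

	static int block_bufs_alloc(struct block_bufs *b, unsigned int num_pages)
	{
		b->mem_to_free = calloc(num_pages,
					sizeof(*b->datav) + sizeof(*b->pagev));
		if (!b->mem_to_free)
			return -1;
		b->datav = b->mem_to_free;
		b->pagev = (void **)(b->datav + num_pages);
		return 0;
	}

	static void block_bufs_free(struct block_bufs *b)
	{
		free(b->mem_to_free);	/* releases datav and pagev together */
		b->mem_to_free = NULL;
		b->datav = NULL;
		b->pagev = NULL;
	}

	int main(void)
	{
		struct block_bufs b;

		if (block_bufs_alloc(&b, 4))
			return 1;
		block_bufs_free(&b);
		return 0;
	}
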
@@ -1617,32 +1819,39 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
1617 * (note that this test fails for the super block) 1819 * (note that this test fails for the super block)
1618 */ 1820 */
1619static int btrfsic_test_for_metadata(struct btrfsic_state *state, 1821static int btrfsic_test_for_metadata(struct btrfsic_state *state,
1620 const u8 *data, unsigned int size) 1822 char **datav, unsigned int num_pages)
1621{ 1823{
1622 struct btrfs_header *h; 1824 struct btrfs_header *h;
1623 u8 csum[BTRFS_CSUM_SIZE]; 1825 u8 csum[BTRFS_CSUM_SIZE];
1624 u32 crc = ~(u32)0; 1826 u32 crc = ~(u32)0;
1625 int fail = 0; 1827 unsigned int i;
1626 int crc_fail = 0;
1627 1828
1628 h = (struct btrfs_header *)data; 1829 if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
1830 return 1; /* not metadata */
1831 num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
1832 h = (struct btrfs_header *)datav[0];
1629 1833
1630 if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) 1834 if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
1631 fail++; 1835 return 1;
1836
1837 for (i = 0; i < num_pages; i++) {
1838 u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
1839 size_t sublen = i ? PAGE_CACHE_SIZE :
1840 (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
1632 1841
1633 crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); 1842 crc = crc32c(crc, data, sublen);
1843 }
1634 btrfs_csum_final(crc, csum); 1844 btrfs_csum_final(crc, csum);
1635 if (memcmp(csum, h->csum, state->csum_size)) 1845 if (memcmp(csum, h->csum, state->csum_size))
1636 crc_fail++; 1846 return 1;
1637 1847
1638 return fail || crc_fail; 1848 return 0; /* is metadata */
1639} 1849}
1640 1850
1641static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, 1851static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1642 u64 dev_bytenr, 1852 u64 dev_bytenr, char **mapped_datav,
1643 u8 *mapped_data, unsigned int len, 1853 unsigned int num_pages,
1644 struct bio *bio, 1854 struct bio *bio, int *bio_is_patched,
1645 int *bio_is_patched,
1646 struct buffer_head *bh, 1855 struct buffer_head *bh,
1647 int submit_bio_bh_rw) 1856 int submit_bio_bh_rw)
1648{ 1857{
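
btrfsic_test_for_metadata() above now checksums a block spread over several mapped pages: the first page contributes everything after the embedded checksum area, each later page contributes in full, and the accumulated value is compared against the stored csum. A self-contained sketch of that accumulation; a bitwise CRC32C stands in for the kernel's crc32c library, and the 32-byte field mirrors BTRFS_CSUM_SIZE:

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SZ   4096u
	#define CSUM_SIZE 32u

	/* bitwise CRC32C (Castagnoli), reflected polynomial 0x82F63B78 */
	static uint32_t crc32c(uint32_t crc, const uint8_t *data, size_t len)
	{
		while (len--) {
			crc ^= *data++;
			for (int k = 0; k < 8; k++)
				crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1));
		}
		return crc;
	}

	/* Checksum a metadata block stored in num_pages page buffers,
	 * skipping the leading CSUM_SIZE bytes that hold the stored csum. */
	static uint32_t csum_block(uint8_t **datav, unsigned int num_pages)
	{
		uint32_t crc = ~0u;

		for (unsigned int i = 0; i < num_pages; i++) {
			const uint8_t *data = i ? datav[i]
						: datav[i] + CSUM_SIZE;
			size_t sublen = i ? PAGE_SZ : PAGE_SZ - CSUM_SIZE;

			crc = crc32c(crc, data, sublen);
		}
		return ~crc;	/* usual CRC-32C finalization by inversion */
	}

	int main(void)
	{
		static uint8_t page[PAGE_SZ];
		uint8_t *datav[] = { page };

		printf("csum=%08x\n", (unsigned)csum_block(datav, 1));
		return 0;
	}
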
@@ -1652,12 +1861,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1652 int ret; 1861 int ret;
1653 struct btrfsic_state *state = dev_state->state; 1862 struct btrfsic_state *state = dev_state->state;
1654 struct block_device *bdev = dev_state->bdev; 1863 struct block_device *bdev = dev_state->bdev;
1864 unsigned int processed_len;
1655 1865
1656 WARN_ON(len > PAGE_SIZE);
1657 is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
1658 if (NULL != bio_is_patched) 1866 if (NULL != bio_is_patched)
1659 *bio_is_patched = 0; 1867 *bio_is_patched = 0;
1660 1868
1869again:
1870 if (num_pages == 0)
1871 return;
1872
1873 processed_len = 0;
1874 is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
1875 num_pages));
1876
1661 block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, 1877 block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
1662 &state->block_hashtable); 1878 &state->block_hashtable);
1663 if (NULL != block) { 1879 if (NULL != block) {
@@ -1667,8 +1883,16 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1667 1883
1668 if (block->is_superblock) { 1884 if (block->is_superblock) {
1669 bytenr = le64_to_cpu(((struct btrfs_super_block *) 1885 bytenr = le64_to_cpu(((struct btrfs_super_block *)
1670 mapped_data)->bytenr); 1886 mapped_datav[0])->bytenr);
1887 if (num_pages * PAGE_CACHE_SIZE <
1888 BTRFS_SUPER_INFO_SIZE) {
1889 printk(KERN_INFO
1890 "btrfsic: cannot work with too short bios!\n");
1891 return;
1892 }
1671 is_metadata = 1; 1893 is_metadata = 1;
1894 BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
1895 processed_len = BTRFS_SUPER_INFO_SIZE;
1672 if (state->print_mask & 1896 if (state->print_mask &
1673 BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { 1897 BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
1674 printk(KERN_INFO 1898 printk(KERN_INFO
@@ -1678,12 +1902,18 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1678 } 1902 }
1679 if (is_metadata) { 1903 if (is_metadata) {
1680 if (!block->is_superblock) { 1904 if (!block->is_superblock) {
1905 if (num_pages * PAGE_CACHE_SIZE <
1906 state->metablock_size) {
1907 printk(KERN_INFO
1908 "btrfsic: cannot work with too short bios!\n");
1909 return;
1910 }
1911 processed_len = state->metablock_size;
1681 bytenr = le64_to_cpu(((struct btrfs_header *) 1912 bytenr = le64_to_cpu(((struct btrfs_header *)
1682 mapped_data)->bytenr); 1913 mapped_datav[0])->bytenr);
1683 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, 1914 btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
1684 dev_state, 1915 dev_state,
1685 dev_bytenr, 1916 dev_bytenr);
1686 mapped_data);
1687 } 1917 }
1688 if (block->logical_bytenr != bytenr) { 1918 if (block->logical_bytenr != bytenr) {
1689 printk(KERN_INFO 1919 printk(KERN_INFO
@@ -1710,6 +1940,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1710 block->mirror_num, 1940 block->mirror_num,
1711 btrfsic_get_block_type(state, block)); 1941 btrfsic_get_block_type(state, block));
1712 } else { 1942 } else {
1943 if (num_pages * PAGE_CACHE_SIZE <
1944 state->datablock_size) {
1945 printk(KERN_INFO
1946 "btrfsic: cannot work with too short bios!\n");
1947 return;
1948 }
1949 processed_len = state->datablock_size;
1713 bytenr = block->logical_bytenr; 1950 bytenr = block->logical_bytenr;
1714 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 1951 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1715 printk(KERN_INFO 1952 printk(KERN_INFO
@@ -1747,7 +1984,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1747 le64_to_cpu(block->disk_key.offset), 1984 le64_to_cpu(block->disk_key.offset),
1748 (unsigned long long) 1985 (unsigned long long)
1749 le64_to_cpu(((struct btrfs_header *) 1986 le64_to_cpu(((struct btrfs_header *)
1750 mapped_data)->generation), 1987 mapped_datav[0])->generation),
1751 (unsigned long long) 1988 (unsigned long long)
1752 state->max_superblock_generation); 1989 state->max_superblock_generation);
1753 btrfsic_dump_tree(state); 1990 btrfsic_dump_tree(state);
@@ -1765,10 +2002,10 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1765 (unsigned long long)block->generation, 2002 (unsigned long long)block->generation,
1766 (unsigned long long) 2003 (unsigned long long)
1767 le64_to_cpu(((struct btrfs_header *) 2004 le64_to_cpu(((struct btrfs_header *)
1768 mapped_data)->generation)); 2005 mapped_datav[0])->generation));
1769 /* it would not be safe to go on */ 2006 /* it would not be safe to go on */
1770 btrfsic_dump_tree(state); 2007 btrfsic_dump_tree(state);
1771 return; 2008 goto continue_loop;
1772 } 2009 }
1773 2010
1774 /* 2011 /*
@@ -1796,18 +2033,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1796 } 2033 }
1797 2034
1798 if (block->is_superblock) 2035 if (block->is_superblock)
1799 ret = btrfsic_map_superblock(state, bytenr, len, 2036 ret = btrfsic_map_superblock(state, bytenr,
2037 processed_len,
1800 bdev, &block_ctx); 2038 bdev, &block_ctx);
1801 else 2039 else
1802 ret = btrfsic_map_block(state, bytenr, len, 2040 ret = btrfsic_map_block(state, bytenr, processed_len,
1803 &block_ctx, 0); 2041 &block_ctx, 0);
1804 if (ret) { 2042 if (ret) {
1805 printk(KERN_INFO 2043 printk(KERN_INFO
1806 "btrfsic: btrfsic_map_block(root @%llu)" 2044 "btrfsic: btrfsic_map_block(root @%llu)"
1807 " failed!\n", (unsigned long long)bytenr); 2045 " failed!\n", (unsigned long long)bytenr);
1808 return; 2046 goto continue_loop;
1809 } 2047 }
1810 block_ctx.data = mapped_data; 2048 block_ctx.datav = mapped_datav;
1811 /* the following is required in case of writes to mirrors, 2049 /* the following is required in case of writes to mirrors,
1812 * use the same that was used for the lookup */ 2050 * use the same that was used for the lookup */
1813 block_ctx.dev = dev_state; 2051 block_ctx.dev = dev_state;
@@ -1863,11 +2101,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1863 block->logical_bytenr = bytenr; 2101 block->logical_bytenr = bytenr;
1864 block->is_metadata = 1; 2102 block->is_metadata = 1;
1865 if (block->is_superblock) { 2103 if (block->is_superblock) {
2104 BUG_ON(PAGE_CACHE_SIZE !=
2105 BTRFS_SUPER_INFO_SIZE);
1866 ret = btrfsic_process_written_superblock( 2106 ret = btrfsic_process_written_superblock(
1867 state, 2107 state,
1868 block, 2108 block,
1869 (struct btrfs_super_block *) 2109 (struct btrfs_super_block *)
1870 mapped_data); 2110 mapped_datav[0]);
1871 if (state->print_mask & 2111 if (state->print_mask &
1872 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { 2112 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
1873 printk(KERN_INFO 2113 printk(KERN_INFO
@@ -1880,8 +2120,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1880 state, 2120 state,
1881 block, 2121 block,
1882 &block_ctx, 2122 &block_ctx,
1883 (struct btrfs_header *)
1884 block_ctx.data,
1885 0, 0); 2123 0, 0);
1886 } 2124 }
1887 if (ret) 2125 if (ret)
@@ -1912,26 +2150,30 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1912 u64 bytenr; 2150 u64 bytenr;
1913 2151
1914 if (!is_metadata) { 2152 if (!is_metadata) {
2153 processed_len = state->datablock_size;
1915 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 2154 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1916 printk(KERN_INFO "Written block (%s/%llu/?)" 2155 printk(KERN_INFO "Written block (%s/%llu/?)"
1917 " !found in hash table, D.\n", 2156 " !found in hash table, D.\n",
1918 dev_state->name, 2157 dev_state->name,
1919 (unsigned long long)dev_bytenr); 2158 (unsigned long long)dev_bytenr);
1920 if (!state->include_extent_data) 2159 if (!state->include_extent_data) {
1921 return; /* ignore that written D block */ 2160 /* ignore that written D block */
2161 goto continue_loop;
2162 }
1922 2163
1923 /* this is getting ugly for the 2164 /* this is getting ugly for the
1924 * include_extent_data case... */ 2165 * include_extent_data case... */
1925 bytenr = 0; /* unknown */ 2166 bytenr = 0; /* unknown */
1926 block_ctx.start = bytenr; 2167 block_ctx.start = bytenr;
1927 block_ctx.len = len; 2168 block_ctx.len = processed_len;
1928 block_ctx.bh = NULL; 2169 block_ctx.mem_to_free = NULL;
2170 block_ctx.pagev = NULL;
1929 } else { 2171 } else {
2172 processed_len = state->metablock_size;
1930 bytenr = le64_to_cpu(((struct btrfs_header *) 2173 bytenr = le64_to_cpu(((struct btrfs_header *)
1931 mapped_data)->bytenr); 2174 mapped_datav[0])->bytenr);
1932 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, 2175 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
1933 dev_bytenr, 2176 dev_bytenr);
1934 mapped_data);
1935 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 2177 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1936 printk(KERN_INFO 2178 printk(KERN_INFO
1937 "Written block @%llu (%s/%llu/?)" 2179 "Written block @%llu (%s/%llu/?)"
@@ -1940,17 +2182,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1940 dev_state->name, 2182 dev_state->name,
1941 (unsigned long long)dev_bytenr); 2183 (unsigned long long)dev_bytenr);
1942 2184
1943 ret = btrfsic_map_block(state, bytenr, len, &block_ctx, 2185 ret = btrfsic_map_block(state, bytenr, processed_len,
1944 0); 2186 &block_ctx, 0);
1945 if (ret) { 2187 if (ret) {
1946 printk(KERN_INFO 2188 printk(KERN_INFO
1947 "btrfsic: btrfsic_map_block(root @%llu)" 2189 "btrfsic: btrfsic_map_block(root @%llu)"
1948 " failed!\n", 2190 " failed!\n",
1949 (unsigned long long)dev_bytenr); 2191 (unsigned long long)dev_bytenr);
1950 return; 2192 goto continue_loop;
1951 } 2193 }
1952 } 2194 }
1953 block_ctx.data = mapped_data; 2195 block_ctx.datav = mapped_datav;
1954 /* the following is required in case of writes to mirrors, 2196 /* the following is required in case of writes to mirrors,
1955 * use the same that was used for the lookup */ 2197 * use the same that was used for the lookup */
1956 block_ctx.dev = dev_state; 2198 block_ctx.dev = dev_state;
@@ -1960,7 +2202,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1960 if (NULL == block) { 2202 if (NULL == block) {
1961 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); 2203 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1962 btrfsic_release_block_ctx(&block_ctx); 2204 btrfsic_release_block_ctx(&block_ctx);
1963 return; 2205 goto continue_loop;
1964 } 2206 }
1965 block->dev_state = dev_state; 2207 block->dev_state = dev_state;
1966 block->dev_bytenr = dev_bytenr; 2208 block->dev_bytenr = dev_bytenr;
@@ -2020,9 +2262,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
2020 2262
2021 if (is_metadata) { 2263 if (is_metadata) {
2022 ret = btrfsic_process_metablock(state, block, 2264 ret = btrfsic_process_metablock(state, block,
2023 &block_ctx, 2265 &block_ctx, 0, 0);
2024 (struct btrfs_header *)
2025 block_ctx.data, 0, 0);
2026 if (ret) 2266 if (ret)
2027 printk(KERN_INFO 2267 printk(KERN_INFO
2028 "btrfsic: process_metablock(root @%llu)" 2268 "btrfsic: process_metablock(root @%llu)"
@@ -2031,6 +2271,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
2031 } 2271 }
2032 btrfsic_release_block_ctx(&block_ctx); 2272 btrfsic_release_block_ctx(&block_ctx);
2033 } 2273 }
2274
2275continue_loop:
2276 BUG_ON(!processed_len);
2277 dev_bytenr += processed_len;
2278 mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
2279 num_pages -= processed_len >> PAGE_CACHE_SHIFT;
2280 goto again;
2034} 2281}
2035 2282
2036static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) 2283static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
@@ -2213,7 +2460,7 @@ static int btrfsic_process_written_superblock(
2213 2460
2214 num_copies = 2461 num_copies =
2215 btrfs_num_copies(&state->root->fs_info->mapping_tree, 2462 btrfs_num_copies(&state->root->fs_info->mapping_tree,
2216 next_bytenr, PAGE_SIZE); 2463 next_bytenr, BTRFS_SUPER_INFO_SIZE);
2217 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 2464 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
2218 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 2465 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
2219 (unsigned long long)next_bytenr, num_copies); 2466 (unsigned long long)next_bytenr, num_copies);
@@ -2224,7 +2471,8 @@ static int btrfsic_process_written_superblock(
2224 printk(KERN_INFO 2471 printk(KERN_INFO
2225 "btrfsic_process_written_superblock(" 2472 "btrfsic_process_written_superblock("
2226 "mirror_num=%d)\n", mirror_num); 2473 "mirror_num=%d)\n", mirror_num);
2227 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, 2474 ret = btrfsic_map_block(state, next_bytenr,
2475 BTRFS_SUPER_INFO_SIZE,
2228 &tmp_next_block_ctx, 2476 &tmp_next_block_ctx,
2229 mirror_num); 2477 mirror_num);
2230 if (ret) { 2478 if (ret) {
@@ -2689,7 +2937,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
2689static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, 2937static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2690 u64 bytenr, 2938 u64 bytenr,
2691 struct btrfsic_dev_state *dev_state, 2939 struct btrfsic_dev_state *dev_state,
2692 u64 dev_bytenr, char *data) 2940 u64 dev_bytenr)
2693{ 2941{
2694 int num_copies; 2942 int num_copies;
2695 int mirror_num; 2943 int mirror_num;
@@ -2698,10 +2946,10 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2698 int match = 0; 2946 int match = 0;
2699 2947
2700 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, 2948 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
2701 bytenr, PAGE_SIZE); 2949 bytenr, state->metablock_size);
2702 2950
2703 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { 2951 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2704 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, 2952 ret = btrfsic_map_block(state, bytenr, state->metablock_size,
2705 &block_ctx, mirror_num); 2953 &block_ctx, mirror_num);
2706 if (ret) { 2954 if (ret) {
2707 printk(KERN_INFO "btrfsic:" 2955 printk(KERN_INFO "btrfsic:"
@@ -2727,7 +2975,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2727 (unsigned long long)bytenr, dev_state->name, 2975 (unsigned long long)bytenr, dev_state->name,
2728 (unsigned long long)dev_bytenr); 2976 (unsigned long long)dev_bytenr);
2729 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { 2977 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2730 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, 2978 ret = btrfsic_map_block(state, bytenr,
2979 state->metablock_size,
2731 &block_ctx, mirror_num); 2980 &block_ctx, mirror_num);
2732 if (ret) 2981 if (ret)
2733 continue; 2982 continue;
@@ -2781,13 +3030,13 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
2781 (unsigned long)bh->b_size, bh->b_data, 3030 (unsigned long)bh->b_size, bh->b_data,
2782 bh->b_bdev); 3031 bh->b_bdev);
2783 btrfsic_process_written_block(dev_state, dev_bytenr, 3032 btrfsic_process_written_block(dev_state, dev_bytenr,
2784 bh->b_data, bh->b_size, NULL, 3033 &bh->b_data, 1, NULL,
2785 NULL, bh, rw); 3034 NULL, bh, rw);
2786 } else if (NULL != dev_state && (rw & REQ_FLUSH)) { 3035 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2787 if (dev_state->state->print_mask & 3036 if (dev_state->state->print_mask &
2788 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 3037 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2789 printk(KERN_INFO 3038 printk(KERN_INFO
2790 "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", 3039 "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
2791 rw, bh->b_bdev); 3040 rw, bh->b_bdev);
2792 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { 3041 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2793 if ((dev_state->state->print_mask & 3042 if ((dev_state->state->print_mask &
@@ -2836,6 +3085,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
2836 unsigned int i; 3085 unsigned int i;
2837 u64 dev_bytenr; 3086 u64 dev_bytenr;
2838 int bio_is_patched; 3087 int bio_is_patched;
3088 char **mapped_datav;
2839 3089
2840 dev_bytenr = 512 * bio->bi_sector; 3090 dev_bytenr = 512 * bio->bi_sector;
2841 bio_is_patched = 0; 3091 bio_is_patched = 0;
@@ -2848,35 +3098,46 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
2848 (unsigned long long)dev_bytenr, 3098 (unsigned long long)dev_bytenr,
2849 bio->bi_bdev); 3099 bio->bi_bdev);
2850 3100
3101 mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
3102 GFP_NOFS);
3103 if (!mapped_datav)
3104 goto leave;
2851 for (i = 0; i < bio->bi_vcnt; i++) { 3105 for (i = 0; i < bio->bi_vcnt; i++) {
2852 u8 *mapped_data; 3106 BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
2853 3107 mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
2854 mapped_data = kmap(bio->bi_io_vec[i].bv_page); 3108 if (!mapped_datav[i]) {
3109 while (i > 0) {
3110 i--;
3111 kunmap(bio->bi_io_vec[i].bv_page);
3112 }
3113 kfree(mapped_datav);
3114 goto leave;
3115 }
2855 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | 3116 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2856 BTRFSIC_PRINT_MASK_VERBOSE) == 3117 BTRFSIC_PRINT_MASK_VERBOSE) ==
2857 (dev_state->state->print_mask & 3118 (dev_state->state->print_mask &
2858 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | 3119 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2859 BTRFSIC_PRINT_MASK_VERBOSE))) 3120 BTRFSIC_PRINT_MASK_VERBOSE)))
2860 printk(KERN_INFO 3121 printk(KERN_INFO
2861 "#%u: page=%p, mapped=%p, len=%u," 3122 "#%u: page=%p, len=%u, offset=%u\n",
2862 " offset=%u\n",
2863 i, bio->bi_io_vec[i].bv_page, 3123 i, bio->bi_io_vec[i].bv_page,
2864 mapped_data,
2865 bio->bi_io_vec[i].bv_len, 3124 bio->bi_io_vec[i].bv_len,
2866 bio->bi_io_vec[i].bv_offset); 3125 bio->bi_io_vec[i].bv_offset);
2867 btrfsic_process_written_block(dev_state, dev_bytenr, 3126 }
2868 mapped_data, 3127 btrfsic_process_written_block(dev_state, dev_bytenr,
2869 bio->bi_io_vec[i].bv_len, 3128 mapped_datav, bio->bi_vcnt,
2870 bio, &bio_is_patched, 3129 bio, &bio_is_patched,
2871 NULL, rw); 3130 NULL, rw);
3131 while (i > 0) {
3132 i--;
2872 kunmap(bio->bi_io_vec[i].bv_page); 3133 kunmap(bio->bi_io_vec[i].bv_page);
2873 dev_bytenr += bio->bi_io_vec[i].bv_len;
2874 } 3134 }
3135 kfree(mapped_datav);
2875 } else if (NULL != dev_state && (rw & REQ_FLUSH)) { 3136 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2876 if (dev_state->state->print_mask & 3137 if (dev_state->state->print_mask &
2877 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 3138 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2878 printk(KERN_INFO 3139 printk(KERN_INFO
2879 "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", 3140 "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
2880 rw, bio->bi_bdev); 3141 rw, bio->bi_bdev);
2881 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { 3142 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2882 if ((dev_state->state->print_mask & 3143 if ((dev_state->state->print_mask &
@@ -2903,6 +3164,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
2903 bio->bi_end_io = btrfsic_bio_end_io; 3164 bio->bi_end_io = btrfsic_bio_end_io;
2904 } 3165 }
2905 } 3166 }
3167leave:
2906 mutex_unlock(&btrfsic_mutex); 3168 mutex_unlock(&btrfsic_mutex);
2907 3169
2908 submit_bio(rw, bio); 3170 submit_bio(rw, bio);
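
btrfsic_submit_bio() above maps every page of the bio before processing and must unwind a partial mapping when any kmap() fails: pages already mapped are released in reverse order and the pointer array is freed. The same acquire-all-or-release-all shape in plain C, with malloc()/free() standing in for kmap()/kunmap() (a sketch, not the kernel code):

	#include <stdlib.h>

	/* Acquire n resources; on any failure release those already held
	 * in reverse order and report failure. */
	static char **map_all(size_t n)
	{
		char **v = malloc(n * sizeof(*v));
		size_t i;

		if (!v)
			return NULL;
		for (i = 0; i < n; i++) {
			v[i] = malloc(64);	/* stand-in for kmap(page) */
			if (!v[i]) {
				while (i > 0)
					free(v[--i]);	/* unwind in reverse */
				free(v);
				return NULL;
			}
		}
		return v;
	}

	static void unmap_all(char **v, size_t n)
	{
		while (n > 0)
			free(v[--n]);
		free(v);
	}

	int main(void)
	{
		char **v = map_all(4);

		if (!v)
			return 1;
		unmap_all(v, 4);
		return 0;
	}
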
@@ -2917,6 +3179,30 @@ int btrfsic_mount(struct btrfs_root *root,
2917 struct list_head *dev_head = &fs_devices->devices; 3179 struct list_head *dev_head = &fs_devices->devices;
2918 struct btrfs_device *device; 3180 struct btrfs_device *device;
2919 3181
3182 if (root->nodesize != root->leafsize) {
3183 printk(KERN_INFO
3184 "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
3185 root->nodesize, root->leafsize);
3186 return -1;
3187 }
3188 if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
3189 printk(KERN_INFO
3190 "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
3191 root->nodesize, (unsigned long)PAGE_CACHE_SIZE);
3192 return -1;
3193 }
3194 if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
3195 printk(KERN_INFO
3196 "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
3197 root->leafsize, (unsigned long)PAGE_CACHE_SIZE);
3198 return -1;
3199 }
3200 if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
3201 printk(KERN_INFO
3202 "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
3203 root->sectorsize, (unsigned long)PAGE_CACHE_SIZE);
3204 return -1;
3205 }
2920 state = kzalloc(sizeof(*state), GFP_NOFS); 3206 state = kzalloc(sizeof(*state), GFP_NOFS);
2921 if (NULL == state) { 3207 if (NULL == state) {
2922 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); 3208 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
@@ -2933,6 +3219,8 @@ int btrfsic_mount(struct btrfs_root *root,
2933 state->print_mask = print_mask; 3219 state->print_mask = print_mask;
2934 state->include_extent_data = including_extent_data; 3220 state->include_extent_data = including_extent_data;
2935 state->csum_size = 0; 3221 state->csum_size = 0;
3222 state->metablock_size = root->nodesize;
3223 state->datablock_size = root->sectorsize;
2936 INIT_LIST_HEAD(&state->all_blocks_list); 3224 INIT_LIST_HEAD(&state->all_blocks_list);
2937 btrfsic_block_hashtable_init(&state->block_hashtable); 3225 btrfsic_block_hashtable_init(&state->block_hashtable);
2938 btrfsic_block_link_hashtable_init(&state->block_link_hashtable); 3226 btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
@@ -3049,7 +3337,7 @@ void btrfsic_unmount(struct btrfs_root *root,
3049 btrfsic_block_link_free(l); 3337 btrfsic_block_link_free(l);
3050 } 3338 }
3051 3339
3052 if (b_all->is_iodone) 3340 if (b_all->is_iodone || b_all->never_written)
3053 btrfsic_block_free(b_all); 3341 btrfsic_block_free(b_all);
3054 else 3342 else
3055 printk(KERN_INFO "btrfs: attempt to free %c-block" 3343 printk(KERN_INFO "btrfs: attempt to free %c-block"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4106264fbc65..d7a96cfdc50a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/rbtree.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "transaction.h" 24#include "transaction.h"
@@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
37 struct extent_buffer *dst_buf, 38 struct extent_buffer *dst_buf,
38 struct extent_buffer *src_buf); 39 struct extent_buffer *src_buf);
39static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
40 struct btrfs_path *path, int level, int slot); 41 struct btrfs_path *path, int level, int slot,
42 int tree_mod_log);
43static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
44 struct extent_buffer *eb);
45struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
46 u32 blocksize, u64 parent_transid,
47 u64 time_seq);
48struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
49 u64 bytenr, u32 blocksize,
50 u64 time_seq);
41 51
42struct btrfs_path *btrfs_alloc_path(void) 52struct btrfs_path *btrfs_alloc_path(void)
43{ 53{
@@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
255 265
256 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 266 cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
257 new_root_objectid, &disk_key, level, 267 new_root_objectid, &disk_key, level,
258 buf->start, 0, 1); 268 buf->start, 0);
259 if (IS_ERR(cow)) 269 if (IS_ERR(cow))
260 return PTR_ERR(cow); 270 return PTR_ERR(cow);
261 271
@@ -288,6 +298,434 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
288 return 0; 298 return 0;
289} 299}
290 300
301enum mod_log_op {
302 MOD_LOG_KEY_REPLACE,
303 MOD_LOG_KEY_ADD,
304 MOD_LOG_KEY_REMOVE,
305 MOD_LOG_KEY_REMOVE_WHILE_FREEING,
306 MOD_LOG_KEY_REMOVE_WHILE_MOVING,
307 MOD_LOG_MOVE_KEYS,
308 MOD_LOG_ROOT_REPLACE,
309};
310
311struct tree_mod_move {
312 int dst_slot;
313 int nr_items;
314};
315
316struct tree_mod_root {
317 u64 logical;
318 u8 level;
319};
320
321struct tree_mod_elem {
322 struct rb_node node;
323 u64 index; /* shifted logical */
324 struct seq_list elem;
325 enum mod_log_op op;
326
327 /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
328 int slot;
329
330 /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
331 u64 generation;
332
333 /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
334 struct btrfs_disk_key key;
335 u64 blockptr;
336
337 /* this is used for op == MOD_LOG_MOVE_KEYS */
338 struct tree_mod_move move;
339
340 /* this is used for op == MOD_LOG_ROOT_REPLACE */
341 struct tree_mod_root old_root;
342};
343
344static inline void
345__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
346{
347 elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
348 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
349}
350
351void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
352 struct seq_list *elem)
353{
354 elem->flags = 1;
355 spin_lock(&fs_info->tree_mod_seq_lock);
356 __get_tree_mod_seq(fs_info, elem);
357 spin_unlock(&fs_info->tree_mod_seq_lock);
358}
359
360void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
361 struct seq_list *elem)
362{
363 struct rb_root *tm_root;
364 struct rb_node *node;
365 struct rb_node *next;
366 struct seq_list *cur_elem;
367 struct tree_mod_elem *tm;
368 u64 min_seq = (u64)-1;
369 u64 seq_putting = elem->seq;
370
371 if (!seq_putting)
372 return;
373
374 BUG_ON(!(elem->flags & 1));
375 spin_lock(&fs_info->tree_mod_seq_lock);
376 list_del(&elem->list);
377
378 list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
379 if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
380 if (seq_putting > cur_elem->seq) {
381 /*
382 * blocker with lower sequence number exists, we
383 * cannot remove anything from the log
384 */
385 goto out;
386 }
387 min_seq = cur_elem->seq;
388 }
389 }
390
391 /*
392 * anything that's lower than the lowest existing (read: blocked)
393 * sequence number can be removed from the tree.
394 */
395 write_lock(&fs_info->tree_mod_log_lock);
396 tm_root = &fs_info->tree_mod_log;
397 for (node = rb_first(tm_root); node; node = next) {
398 next = rb_next(node);
399 tm = container_of(node, struct tree_mod_elem, node);
400 if (tm->elem.seq > min_seq)
401 continue;
402 rb_erase(node, tm_root);
403 list_del(&tm->elem.list);
404 kfree(tm);
405 }
406 write_unlock(&fs_info->tree_mod_log_lock);
407out:
408 spin_unlock(&fs_info->tree_mod_seq_lock);
409}
410
411/*
412 * key order of the log:
413 * index -> sequence
414 *
415 * the index is the shifted logical of the *new* root node for root replace
416 * operations, or the shifted logical of the affected block for all other
417 * operations.
418 */
419static noinline int
420__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
421{
422 struct rb_root *tm_root;
423 struct rb_node **new;
424 struct rb_node *parent = NULL;
425 struct tree_mod_elem *cur;
426 int ret = 0;
427
428 BUG_ON(!tm || !tm->elem.seq);
429
430 write_lock(&fs_info->tree_mod_log_lock);
431 tm_root = &fs_info->tree_mod_log;
432 new = &tm_root->rb_node;
433 while (*new) {
434 cur = container_of(*new, struct tree_mod_elem, node);
435 parent = *new;
436 if (cur->index < tm->index)
437 new = &((*new)->rb_left);
438 else if (cur->index > tm->index)
439 new = &((*new)->rb_right);
440 else if (cur->elem.seq < tm->elem.seq)
441 new = &((*new)->rb_left);
442 else if (cur->elem.seq > tm->elem.seq)
443 new = &((*new)->rb_right);
444 else {
445 kfree(tm);
446 ret = -EEXIST;
447 goto unlock;
448 }
449 }
450
451 rb_link_node(&tm->node, parent, new);
452 rb_insert_color(&tm->node, tm_root);
453unlock:
454 write_unlock(&fs_info->tree_mod_log_lock);
455 return ret;
456}
457
458static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
459 struct extent_buffer *eb) {
460 smp_mb();
 461 if (list_empty(&fs_info->tree_mod_seq_list))
462 return 1;
463 if (!eb)
464 return 0;
465 if (btrfs_header_level(eb) == 0)
466 return 1;
467 return 0;
468}
469
470static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
471 struct tree_mod_elem **tm_ret)
472{
473 struct tree_mod_elem *tm;
474 int seq;
475
476 if (tree_mod_dont_log(fs_info, NULL))
477 return 0;
478
479 tm = *tm_ret = kzalloc(sizeof(*tm), flags);
480 if (!tm)
481 return -ENOMEM;
482
483 tm->elem.flags = 0;
484 spin_lock(&fs_info->tree_mod_seq_lock);
485 if (list_empty(&fs_info->tree_mod_seq_list)) {
486 /*
487 * someone emptied the list while we were waiting for the lock.
488 * we must not add to the list, because no blocker exists. items
489 * are removed from the list only when the existing blocker is
490 * removed from the list.
491 */
492 kfree(tm);
493 seq = 0;
494 } else {
495 __get_tree_mod_seq(fs_info, &tm->elem);
496 seq = tm->elem.seq;
497 }
498 spin_unlock(&fs_info->tree_mod_seq_lock);
499
500 return seq;
501}
502
503static noinline int
504tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
505 struct extent_buffer *eb, int slot,
506 enum mod_log_op op, gfp_t flags)
507{
508 struct tree_mod_elem *tm;
509 int ret;
510
511 ret = tree_mod_alloc(fs_info, flags, &tm);
512 if (ret <= 0)
513 return ret;
514
515 tm->index = eb->start >> PAGE_CACHE_SHIFT;
516 if (op != MOD_LOG_KEY_ADD) {
517 btrfs_node_key(eb, &tm->key, slot);
518 tm->blockptr = btrfs_node_blockptr(eb, slot);
519 }
520 tm->op = op;
521 tm->slot = slot;
522 tm->generation = btrfs_node_ptr_generation(eb, slot);
523
524 return __tree_mod_log_insert(fs_info, tm);
525}
526
527static noinline int
528tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
529 int slot, enum mod_log_op op)
530{
531 return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
532}
533
534static noinline int
535tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
536 struct extent_buffer *eb, int dst_slot, int src_slot,
537 int nr_items, gfp_t flags)
538{
539 struct tree_mod_elem *tm;
540 int ret;
541 int i;
542
543 if (tree_mod_dont_log(fs_info, eb))
544 return 0;
545
546 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
547 ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
548 MOD_LOG_KEY_REMOVE_WHILE_MOVING);
549 BUG_ON(ret < 0);
550 }
551
552 ret = tree_mod_alloc(fs_info, flags, &tm);
553 if (ret <= 0)
554 return ret;
555
556 tm->index = eb->start >> PAGE_CACHE_SHIFT;
557 tm->slot = src_slot;
558 tm->move.dst_slot = dst_slot;
559 tm->move.nr_items = nr_items;
560 tm->op = MOD_LOG_MOVE_KEYS;
561
562 return __tree_mod_log_insert(fs_info, tm);
563}
564
565static noinline int
566tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
567 struct extent_buffer *old_root,
568 struct extent_buffer *new_root, gfp_t flags)
569{
570 struct tree_mod_elem *tm;
571 int ret;
572
573 ret = tree_mod_alloc(fs_info, flags, &tm);
574 if (ret <= 0)
575 return ret;
576
577 tm->index = new_root->start >> PAGE_CACHE_SHIFT;
578 tm->old_root.logical = old_root->start;
579 tm->old_root.level = btrfs_header_level(old_root);
580 tm->generation = btrfs_header_generation(old_root);
581 tm->op = MOD_LOG_ROOT_REPLACE;
582
583 return __tree_mod_log_insert(fs_info, tm);
584}
585
586static struct tree_mod_elem *
587__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
588 int smallest)
589{
590 struct rb_root *tm_root;
591 struct rb_node *node;
592 struct tree_mod_elem *cur = NULL;
593 struct tree_mod_elem *found = NULL;
594 u64 index = start >> PAGE_CACHE_SHIFT;
595
596 read_lock(&fs_info->tree_mod_log_lock);
597 tm_root = &fs_info->tree_mod_log;
598 node = tm_root->rb_node;
599 while (node) {
600 cur = container_of(node, struct tree_mod_elem, node);
601 if (cur->index < index) {
602 node = node->rb_left;
603 } else if (cur->index > index) {
604 node = node->rb_right;
605 } else if (cur->elem.seq < min_seq) {
606 node = node->rb_left;
607 } else if (!smallest) {
608 /* we want the node with the highest seq */
609 if (found)
610 BUG_ON(found->elem.seq > cur->elem.seq);
611 found = cur;
612 node = node->rb_left;
613 } else if (cur->elem.seq > min_seq) {
614 /* we want the node with the smallest seq */
615 if (found)
616 BUG_ON(found->elem.seq < cur->elem.seq);
617 found = cur;
618 node = node->rb_right;
619 } else {
620 found = cur;
621 break;
622 }
623 }
624 read_unlock(&fs_info->tree_mod_log_lock);
625
626 return found;
627}
628
629/*
630 * this returns the element from the log with the smallest time sequence
631 * value that's in the log (the oldest log item). any element with a time
632 * sequence lower than min_seq will be ignored.
633 */
634static struct tree_mod_elem *
635tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
636 u64 min_seq)
637{
638 return __tree_mod_log_search(fs_info, start, min_seq, 1);
639}
640
641/*
642 * this returns the element from the log with the largest time sequence
643 * value that's in the log (the most recent log item). any element with
644 * a time sequence lower than min_seq will be ignored.
645 */
646static struct tree_mod_elem *
647tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
648{
649 return __tree_mod_log_search(fs_info, start, min_seq, 0);
650}
651
652static inline void
653tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
654 struct extent_buffer *src, unsigned long dst_offset,
655 unsigned long src_offset, int nr_items)
656{
657 int ret;
658 int i;
659
660 if (tree_mod_dont_log(fs_info, NULL))
661 return;
662
663 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
664 return;
665
666 /* speed this up by single seq for all operations? */
667 for (i = 0; i < nr_items; i++) {
668 ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
669 MOD_LOG_KEY_REMOVE);
670 BUG_ON(ret < 0);
671 ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
672 MOD_LOG_KEY_ADD);
673 BUG_ON(ret < 0);
674 }
675}
676
677static inline void
678tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
679 int dst_offset, int src_offset, int nr_items)
680{
681 int ret;
682 ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
683 nr_items, GFP_NOFS);
684 BUG_ON(ret < 0);
685}
686
687static inline void
688tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
689 struct extent_buffer *eb,
690 struct btrfs_disk_key *disk_key, int slot, int atomic)
691{
692 int ret;
693
694 ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
695 MOD_LOG_KEY_REPLACE,
696 atomic ? GFP_ATOMIC : GFP_NOFS);
697 BUG_ON(ret < 0);
698}
699
700static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
701 struct extent_buffer *eb)
702{
703 int i;
704 int ret;
705 u32 nritems;
706
707 if (tree_mod_dont_log(fs_info, eb))
708 return;
709
710 nritems = btrfs_header_nritems(eb);
711 for (i = nritems - 1; i >= 0; i--) {
712 ret = tree_mod_log_insert_key(fs_info, eb, i,
713 MOD_LOG_KEY_REMOVE_WHILE_FREEING);
714 BUG_ON(ret < 0);
715 }
716}
717
718static inline void
719tree_mod_log_set_root_pointer(struct btrfs_root *root,
720 struct extent_buffer *new_root_node)
721{
722 int ret;
723 tree_mod_log_free_eb(root->fs_info, root->node);
724 ret = tree_mod_log_insert_root(root->fs_info, root->node,
725 new_root_node, GFP_NOFS);
726 BUG_ON(ret < 0);
727}
728
291/* 729/*
292 * check if the tree block can be shared by multiple trees 730 * check if the tree block can be shared by multiple trees
293 */ 731 */
@@ -409,6 +847,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
409 ret = btrfs_dec_ref(trans, root, buf, 1, 1); 847 ret = btrfs_dec_ref(trans, root, buf, 1, 1);
410 BUG_ON(ret); /* -ENOMEM */ 848 BUG_ON(ret); /* -ENOMEM */
411 } 849 }
850 /*
851 * don't log freeing in case we're freeing the root node, this
852 * is done by tree_mod_log_set_root_pointer later
853 */
854 if (buf != root->node && btrfs_header_level(buf) != 0)
855 tree_mod_log_free_eb(root->fs_info, buf);
412 clean_tree_block(trans, root, buf); 856 clean_tree_block(trans, root, buf);
413 *last_ref = 1; 857 *last_ref = 1;
414 } 858 }
@@ -467,7 +911,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
467 911
468 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 912 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
469 root->root_key.objectid, &disk_key, 913 root->root_key.objectid, &disk_key,
470 level, search_start, empty_size, 1); 914 level, search_start, empty_size);
471 if (IS_ERR(cow)) 915 if (IS_ERR(cow))
472 return PTR_ERR(cow); 916 return PTR_ERR(cow);
473 917
@@ -506,10 +950,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
506 parent_start = 0; 950 parent_start = 0;
507 951
508 extent_buffer_get(cow); 952 extent_buffer_get(cow);
953 tree_mod_log_set_root_pointer(root, cow);
509 rcu_assign_pointer(root->node, cow); 954 rcu_assign_pointer(root->node, cow);
510 955
511 btrfs_free_tree_block(trans, root, buf, parent_start, 956 btrfs_free_tree_block(trans, root, buf, parent_start,
512 last_ref, 1); 957 last_ref);
513 free_extent_buffer(buf); 958 free_extent_buffer(buf);
514 add_root_to_dirty_list(root); 959 add_root_to_dirty_list(root);
515 } else { 960 } else {
@@ -519,13 +964,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
519 parent_start = 0; 964 parent_start = 0;
520 965
521 WARN_ON(trans->transid != btrfs_header_generation(parent)); 966 WARN_ON(trans->transid != btrfs_header_generation(parent));
967 tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
968 MOD_LOG_KEY_REPLACE);
522 btrfs_set_node_blockptr(parent, parent_slot, 969 btrfs_set_node_blockptr(parent, parent_slot,
523 cow->start); 970 cow->start);
524 btrfs_set_node_ptr_generation(parent, parent_slot, 971 btrfs_set_node_ptr_generation(parent, parent_slot,
525 trans->transid); 972 trans->transid);
526 btrfs_mark_buffer_dirty(parent); 973 btrfs_mark_buffer_dirty(parent);
527 btrfs_free_tree_block(trans, root, buf, parent_start, 974 btrfs_free_tree_block(trans, root, buf, parent_start,
528 last_ref, 1); 975 last_ref);
529 } 976 }
530 if (unlock_orig) 977 if (unlock_orig)
531 btrfs_tree_unlock(buf); 978 btrfs_tree_unlock(buf);
@@ -535,6 +982,210 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
535 return 0; 982 return 0;
536} 983}
537 984
985/*
986 * returns the logical address of the oldest predecessor of the given root.
987 * entries older than time_seq are ignored.
988 */
989static struct tree_mod_elem *
990__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
991 struct btrfs_root *root, u64 time_seq)
992{
993 struct tree_mod_elem *tm;
994 struct tree_mod_elem *found = NULL;
995 u64 root_logical = root->node->start;
996 int looped = 0;
997
998 if (!time_seq)
999 return 0;
1000
1001 /*
1002 * the very last operation that's logged for a root is the replacement
1003 * operation (if it is replaced at all). this has the index of the *new*
1004 * root, making it the very first operation that's logged for this root.
1005 */
1006 while (1) {
1007 tm = tree_mod_log_search_oldest(fs_info, root_logical,
1008 time_seq);
1009 if (!looped && !tm)
1010 return 0;
1011 /*
1012 * we must have key remove operations in the log before the
1013 * replace operation.
1014 */
1015 BUG_ON(!tm);
1016
1017 if (tm->op != MOD_LOG_ROOT_REPLACE)
1018 break;
1019
1020 found = tm;
1021 root_logical = tm->old_root.logical;
1022 BUG_ON(root_logical == root->node->start);
1023 looped = 1;
1024 }
1025
1026 return found;
1027}
1028
1029/*
1030 * tm is a pointer to the first operation to rewind within eb. then, all
1031 * previous operations will be rewinded (until we reach something older than
1032 * time_seq).
1033 */
1034static void
1035__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1036 struct tree_mod_elem *first_tm)
1037{
1038 u32 n;
1039 struct rb_node *next;
1040 struct tree_mod_elem *tm = first_tm;
1041 unsigned long o_dst;
1042 unsigned long o_src;
1043 unsigned long p_size = sizeof(struct btrfs_key_ptr);
1044
1045 n = btrfs_header_nritems(eb);
1046 while (tm && tm->elem.seq >= time_seq) {
1047 /*
1048 * all the operations are recorded with the operator used for
1049 * the modification. as we're going backwards, we do the
1050 * opposite of each operation here.
1051 */
1052 switch (tm->op) {
1053 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1054 BUG_ON(tm->slot < n);
1055 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1056 case MOD_LOG_KEY_REMOVE:
1057 btrfs_set_node_key(eb, &tm->key, tm->slot);
1058 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1059 btrfs_set_node_ptr_generation(eb, tm->slot,
1060 tm->generation);
1061 n++;
1062 break;
1063 case MOD_LOG_KEY_REPLACE:
1064 BUG_ON(tm->slot >= n);
1065 btrfs_set_node_key(eb, &tm->key, tm->slot);
1066 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1067 btrfs_set_node_ptr_generation(eb, tm->slot,
1068 tm->generation);
1069 break;
1070 case MOD_LOG_KEY_ADD:
1071 if (tm->slot != n - 1) {
1072 o_dst = btrfs_node_key_ptr_offset(tm->slot);
1073 o_src = btrfs_node_key_ptr_offset(tm->slot + 1);
1074 memmove_extent_buffer(eb, o_dst, o_src, p_size);
1075 }
1076 n--;
1077 break;
1078 case MOD_LOG_MOVE_KEYS:
1079 o_dst = btrfs_node_key_ptr_offset(tm->slot);
1080 o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1081 memmove_extent_buffer(eb, o_dst, o_src,
1082 tm->move.nr_items * p_size);
1083 break;
1084 case MOD_LOG_ROOT_REPLACE:
1085 /*
1086 * this operation is special. for roots, this must be
1087 * handled explicitly before rewinding.
1088 * for non-roots, this operation may exist if the node
1089 * was a root: root A -> child B; then A gets empty and
1090 * B is promoted to the new root. in the mod log, we'll
1091 * have a root-replace operation for B, a tree block
1092 * that is no root. we simply ignore that operation.
1093 */
1094 break;
1095 }
1096 next = rb_next(&tm->node);
1097 if (!next)
1098 break;
1099 tm = container_of(next, struct tree_mod_elem, node);
1100 if (tm->index != first_tm->index)
1101 break;
1102 }
1103 btrfs_set_header_nritems(eb, n);
1104}
1105
1106static struct extent_buffer *
1107tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1108 u64 time_seq)
1109{
1110 struct extent_buffer *eb_rewin;
1111 struct tree_mod_elem *tm;
1112
1113 if (!time_seq)
1114 return eb;
1115
1116 if (btrfs_header_level(eb) == 0)
1117 return eb;
1118
1119 tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1120 if (!tm)
1121 return eb;
1122
1123 if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1124 BUG_ON(tm->slot != 0);
1125 eb_rewin = alloc_dummy_extent_buffer(eb->start,
1126 fs_info->tree_root->nodesize);
1127 BUG_ON(!eb_rewin);
1128 btrfs_set_header_bytenr(eb_rewin, eb->start);
1129 btrfs_set_header_backref_rev(eb_rewin,
1130 btrfs_header_backref_rev(eb));
1131 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1132 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1133 } else {
1134 eb_rewin = btrfs_clone_extent_buffer(eb);
1135 BUG_ON(!eb_rewin);
1136 }
1137
1138 extent_buffer_get(eb_rewin);
1139 free_extent_buffer(eb);
1140
1141 __tree_mod_log_rewind(eb_rewin, time_seq, tm);
1142
1143 return eb_rewin;
1144}
1145
1146static inline struct extent_buffer *
1147get_old_root(struct btrfs_root *root, u64 time_seq)
1148{
1149 struct tree_mod_elem *tm;
1150 struct extent_buffer *eb;
1151 struct tree_mod_root *old_root;
1152 u64 old_generation;
1153
1154 tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
1155 if (!tm)
1156 return root->node;
1157
1158 old_root = &tm->old_root;
1159 old_generation = tm->generation;
1160
1161 tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq);
1162 /*
1163 * there was an item in the log when __tree_mod_log_oldest_root
1164 * returned. this one must not go away, because the time_seq passed to
1165 * us must be blocking its removal.
1166 */
1167 BUG_ON(!tm);
1168
1169 if (old_root->logical == root->node->start) {
1170 /* there are logged operations for the current root */
1171 eb = btrfs_clone_extent_buffer(root->node);
1172 } else {
1173 /* there's a root replace operation for the current root */
1174 eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT,
1175 root->nodesize);
1176 btrfs_set_header_bytenr(eb, eb->start);
1177 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1178 btrfs_set_header_owner(eb, root->root_key.objectid);
1179 }
1180 if (!eb)
1181 return NULL;
1182 btrfs_set_header_level(eb, old_root->level);
1183 btrfs_set_header_generation(eb, old_generation);
1184 __tree_mod_log_rewind(eb, time_seq, tm);
1185
1186 return eb;
1187}
1188
538static inline int should_cow_block(struct btrfs_trans_handle *trans, 1189static inline int should_cow_block(struct btrfs_trans_handle *trans,
539 struct btrfs_root *root, 1190 struct btrfs_root *root,
540 struct extent_buffer *buf) 1191 struct extent_buffer *buf)
@@ -739,7 +1390,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
739 if (!cur) 1390 if (!cur)
740 return -EIO; 1391 return -EIO;
741 } else if (!uptodate) { 1392 } else if (!uptodate) {
742 btrfs_read_buffer(cur, gen); 1393 err = btrfs_read_buffer(cur, gen);
1394 if (err) {
1395 free_extent_buffer(cur);
1396 return err;
1397 }
743 } 1398 }
744 } 1399 }
745 if (search_start == 0) 1400 if (search_start == 0)
@@ -854,20 +1509,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
854static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, 1509static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
855 int level, int *slot) 1510 int level, int *slot)
856{ 1511{
857 if (level == 0) { 1512 if (level == 0)
858 return generic_bin_search(eb, 1513 return generic_bin_search(eb,
859 offsetof(struct btrfs_leaf, items), 1514 offsetof(struct btrfs_leaf, items),
860 sizeof(struct btrfs_item), 1515 sizeof(struct btrfs_item),
861 key, btrfs_header_nritems(eb), 1516 key, btrfs_header_nritems(eb),
862 slot); 1517 slot);
863 } else { 1518 else
864 return generic_bin_search(eb, 1519 return generic_bin_search(eb,
865 offsetof(struct btrfs_node, ptrs), 1520 offsetof(struct btrfs_node, ptrs),
866 sizeof(struct btrfs_key_ptr), 1521 sizeof(struct btrfs_key_ptr),
867 key, btrfs_header_nritems(eb), 1522 key, btrfs_header_nritems(eb),
868 slot); 1523 slot);
869 }
870 return -1;
871} 1524}
872 1525
873int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 1526int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
@@ -974,6 +1627,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
974 goto enospc; 1627 goto enospc;
975 } 1628 }
976 1629
1630 tree_mod_log_set_root_pointer(root, child);
977 rcu_assign_pointer(root->node, child); 1631 rcu_assign_pointer(root->node, child);
978 1632
979 add_root_to_dirty_list(root); 1633 add_root_to_dirty_list(root);
@@ -987,7 +1641,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
987 free_extent_buffer(mid); 1641 free_extent_buffer(mid);
988 1642
989 root_sub_used(root, mid->len); 1643 root_sub_used(root, mid->len);
990 btrfs_free_tree_block(trans, root, mid, 0, 1, 0); 1644 btrfs_free_tree_block(trans, root, mid, 0, 1);
991 /* once for the root ptr */ 1645 /* once for the root ptr */
992 free_extent_buffer_stale(mid); 1646 free_extent_buffer_stale(mid);
993 return 0; 1647 return 0;
@@ -1040,14 +1694,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1040 if (btrfs_header_nritems(right) == 0) { 1694 if (btrfs_header_nritems(right) == 0) {
1041 clean_tree_block(trans, root, right); 1695 clean_tree_block(trans, root, right);
1042 btrfs_tree_unlock(right); 1696 btrfs_tree_unlock(right);
1043 del_ptr(trans, root, path, level + 1, pslot + 1); 1697 del_ptr(trans, root, path, level + 1, pslot + 1, 1);
1044 root_sub_used(root, right->len); 1698 root_sub_used(root, right->len);
1045 btrfs_free_tree_block(trans, root, right, 0, 1, 0); 1699 btrfs_free_tree_block(trans, root, right, 0, 1);
1046 free_extent_buffer_stale(right); 1700 free_extent_buffer_stale(right);
1047 right = NULL; 1701 right = NULL;
1048 } else { 1702 } else {
1049 struct btrfs_disk_key right_key; 1703 struct btrfs_disk_key right_key;
1050 btrfs_node_key(right, &right_key, 0); 1704 btrfs_node_key(right, &right_key, 0);
1705 tree_mod_log_set_node_key(root->fs_info, parent,
1706 &right_key, pslot + 1, 0);
1051 btrfs_set_node_key(parent, &right_key, pslot + 1); 1707 btrfs_set_node_key(parent, &right_key, pslot + 1);
1052 btrfs_mark_buffer_dirty(parent); 1708 btrfs_mark_buffer_dirty(parent);
1053 } 1709 }
@@ -1082,15 +1738,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1082 if (btrfs_header_nritems(mid) == 0) { 1738 if (btrfs_header_nritems(mid) == 0) {
1083 clean_tree_block(trans, root, mid); 1739 clean_tree_block(trans, root, mid);
1084 btrfs_tree_unlock(mid); 1740 btrfs_tree_unlock(mid);
1085 del_ptr(trans, root, path, level + 1, pslot); 1741 del_ptr(trans, root, path, level + 1, pslot, 1);
1086 root_sub_used(root, mid->len); 1742 root_sub_used(root, mid->len);
1087 btrfs_free_tree_block(trans, root, mid, 0, 1, 0); 1743 btrfs_free_tree_block(trans, root, mid, 0, 1);
1088 free_extent_buffer_stale(mid); 1744 free_extent_buffer_stale(mid);
1089 mid = NULL; 1745 mid = NULL;
1090 } else { 1746 } else {
1091 /* update the parent key to reflect our changes */ 1747 /* update the parent key to reflect our changes */
1092 struct btrfs_disk_key mid_key; 1748 struct btrfs_disk_key mid_key;
1093 btrfs_node_key(mid, &mid_key, 0); 1749 btrfs_node_key(mid, &mid_key, 0);
1750 tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
1751 pslot, 0);
1094 btrfs_set_node_key(parent, &mid_key, pslot); 1752 btrfs_set_node_key(parent, &mid_key, pslot);
1095 btrfs_mark_buffer_dirty(parent); 1753 btrfs_mark_buffer_dirty(parent);
1096 } 1754 }
@@ -1188,6 +1846,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1188 struct btrfs_disk_key disk_key; 1846 struct btrfs_disk_key disk_key;
1189 orig_slot += left_nr; 1847 orig_slot += left_nr;
1190 btrfs_node_key(mid, &disk_key, 0); 1848 btrfs_node_key(mid, &disk_key, 0);
1849 tree_mod_log_set_node_key(root->fs_info, parent,
1850 &disk_key, pslot, 0);
1191 btrfs_set_node_key(parent, &disk_key, pslot); 1851 btrfs_set_node_key(parent, &disk_key, pslot);
1192 btrfs_mark_buffer_dirty(parent); 1852 btrfs_mark_buffer_dirty(parent);
1193 if (btrfs_header_nritems(left) > orig_slot) { 1853 if (btrfs_header_nritems(left) > orig_slot) {
@@ -1239,6 +1899,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1239 struct btrfs_disk_key disk_key; 1899 struct btrfs_disk_key disk_key;
1240 1900
1241 btrfs_node_key(right, &disk_key, 0); 1901 btrfs_node_key(right, &disk_key, 0);
1902 tree_mod_log_set_node_key(root->fs_info, parent,
1903 &disk_key, pslot + 1, 0);
1242 btrfs_set_node_key(parent, &disk_key, pslot + 1); 1904 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1243 btrfs_mark_buffer_dirty(parent); 1905 btrfs_mark_buffer_dirty(parent);
1244 1906
@@ -1496,7 +2158,7 @@ static int
1496read_block_for_search(struct btrfs_trans_handle *trans, 2158read_block_for_search(struct btrfs_trans_handle *trans,
1497 struct btrfs_root *root, struct btrfs_path *p, 2159 struct btrfs_root *root, struct btrfs_path *p,
1498 struct extent_buffer **eb_ret, int level, int slot, 2160 struct extent_buffer **eb_ret, int level, int slot,
1499 struct btrfs_key *key) 2161 struct btrfs_key *key, u64 time_seq)
1500{ 2162{
1501 u64 blocknr; 2163 u64 blocknr;
1502 u64 gen; 2164 u64 gen;
@@ -1850,7 +2512,7 @@ cow_done:
1850 } 2512 }
1851 2513
1852 err = read_block_for_search(trans, root, p, 2514 err = read_block_for_search(trans, root, p,
1853 &b, level, slot, key); 2515 &b, level, slot, key, 0);
1854 if (err == -EAGAIN) 2516 if (err == -EAGAIN)
1855 goto again; 2517 goto again;
1856 if (err) { 2518 if (err) {
@@ -1922,6 +2584,115 @@ done:
1922} 2584}
1923 2585
1924/* 2586/*
2587 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2588 * current state of the tree together with the operations recorded in the tree
2589 * modification log to search for the key in a previous version of this tree, as
2590 * denoted by the time_seq parameter.
2591 *
2592 * Naturally, there is no support for insert, delete or cow operations.
2593 *
2594 * The resulting path and return value will be set up as if we called
2595 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2596 */
2597int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2598 struct btrfs_path *p, u64 time_seq)
2599{
2600 struct extent_buffer *b;
2601 int slot;
2602 int ret;
2603 int err;
2604 int level;
2605 int lowest_unlock = 1;
2606 u8 lowest_level = 0;
2607
2608 lowest_level = p->lowest_level;
2609 WARN_ON(p->nodes[0] != NULL);
2610
2611 if (p->search_commit_root) {
2612 BUG_ON(time_seq);
2613 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2614 }
2615
2616again:
2617 b = get_old_root(root, time_seq);
2618 extent_buffer_get(b);
2619 level = btrfs_header_level(b);
2620 btrfs_tree_read_lock(b);
2621 p->locks[level] = BTRFS_READ_LOCK;
2622
2623 while (b) {
2624 level = btrfs_header_level(b);
2625 p->nodes[level] = b;
2626 btrfs_clear_path_blocking(p, NULL, 0);
2627
2628 /*
2629 * we have a lock on b and as long as we aren't changing
2630 * the tree, there is no way to for the items in b to change.
2631 * It is safe to drop the lock on our parent before we
2632 * go through the expensive btree search on b.
2633 */
2634 btrfs_unlock_up_safe(p, level + 1);
2635
2636 ret = bin_search(b, key, level, &slot);
2637
2638 if (level != 0) {
2639 int dec = 0;
2640 if (ret && slot > 0) {
2641 dec = 1;
2642 slot -= 1;
2643 }
2644 p->slots[level] = slot;
2645 unlock_up(p, level, lowest_unlock, 0, NULL);
2646
2647 if (level == lowest_level) {
2648 if (dec)
2649 p->slots[level]++;
2650 goto done;
2651 }
2652
2653 err = read_block_for_search(NULL, root, p, &b, level,
2654 slot, key, time_seq);
2655 if (err == -EAGAIN)
2656 goto again;
2657 if (err) {
2658 ret = err;
2659 goto done;
2660 }
2661
2662 level = btrfs_header_level(b);
2663 err = btrfs_try_tree_read_lock(b);
2664 if (!err) {
2665 btrfs_set_path_blocking(p);
2666 btrfs_tree_read_lock(b);
2667 btrfs_clear_path_blocking(p, b,
2668 BTRFS_READ_LOCK);
2669 }
2670 p->locks[level] = BTRFS_READ_LOCK;
2671 p->nodes[level] = b;
2672 b = tree_mod_log_rewind(root->fs_info, b, time_seq);
2673 if (b != p->nodes[level]) {
2674 btrfs_tree_unlock_rw(p->nodes[level],
2675 p->locks[level]);
2676 p->locks[level] = 0;
2677 p->nodes[level] = b;
2678 }
2679 } else {
2680 p->slots[level] = slot;
2681 unlock_up(p, level, lowest_unlock, 0, NULL);
2682 goto done;
2683 }
2684 }
2685 ret = 1;
2686done:
2687 if (!p->leave_spinning)
2688 btrfs_set_path_blocking(p);
2689 if (ret < 0)
2690 btrfs_release_path(p);
2691
2692 return ret;
2693}
2694
2695/*
1925 * adjust the pointers going up the tree, starting at level 2696 * adjust the pointers going up the tree, starting at level
1926 * making sure the right key of each node is points to 'key'. 2697 * making sure the right key of each node is points to 'key'.
1927 * This is used after shifting pointers to the left, so it stops 2698 * This is used after shifting pointers to the left, so it stops
@@ -1941,6 +2712,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
1941 if (!path->nodes[i]) 2712 if (!path->nodes[i])
1942 break; 2713 break;
1943 t = path->nodes[i]; 2714 t = path->nodes[i];
2715 tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
1944 btrfs_set_node_key(t, key, tslot); 2716 btrfs_set_node_key(t, key, tslot);
1945 btrfs_mark_buffer_dirty(path->nodes[i]); 2717 btrfs_mark_buffer_dirty(path->nodes[i]);
1946 if (tslot != 0) 2718 if (tslot != 0)
@@ -2023,12 +2795,16 @@ static int push_node_left(struct btrfs_trans_handle *trans,
2023 } else 2795 } else
2024 push_items = min(src_nritems - 8, push_items); 2796 push_items = min(src_nritems - 8, push_items);
2025 2797
2798 tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
2799 push_items);
2026 copy_extent_buffer(dst, src, 2800 copy_extent_buffer(dst, src,
2027 btrfs_node_key_ptr_offset(dst_nritems), 2801 btrfs_node_key_ptr_offset(dst_nritems),
2028 btrfs_node_key_ptr_offset(0), 2802 btrfs_node_key_ptr_offset(0),
2029 push_items * sizeof(struct btrfs_key_ptr)); 2803 push_items * sizeof(struct btrfs_key_ptr));
2030 2804
2031 if (push_items < src_nritems) { 2805 if (push_items < src_nritems) {
2806 tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
2807 src_nritems - push_items);
2032 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), 2808 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2033 btrfs_node_key_ptr_offset(push_items), 2809 btrfs_node_key_ptr_offset(push_items),
2034 (src_nritems - push_items) * 2810 (src_nritems - push_items) *
@@ -2082,11 +2858,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
2082 if (max_push < push_items) 2858 if (max_push < push_items)
2083 push_items = max_push; 2859 push_items = max_push;
2084 2860
2861 tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
2085 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), 2862 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2086 btrfs_node_key_ptr_offset(0), 2863 btrfs_node_key_ptr_offset(0),
2087 (dst_nritems) * 2864 (dst_nritems) *
2088 sizeof(struct btrfs_key_ptr)); 2865 sizeof(struct btrfs_key_ptr));
2089 2866
2867 tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
2868 src_nritems - push_items, push_items);
2090 copy_extent_buffer(dst, src, 2869 copy_extent_buffer(dst, src,
2091 btrfs_node_key_ptr_offset(0), 2870 btrfs_node_key_ptr_offset(0),
2092 btrfs_node_key_ptr_offset(src_nritems - push_items), 2871 btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -2129,7 +2908,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2129 2908
2130 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2909 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2131 root->root_key.objectid, &lower_key, 2910 root->root_key.objectid, &lower_key,
2132 level, root->node->start, 0, 0); 2911 level, root->node->start, 0);
2133 if (IS_ERR(c)) 2912 if (IS_ERR(c))
2134 return PTR_ERR(c); 2913 return PTR_ERR(c);
2135 2914
@@ -2161,6 +2940,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2161 btrfs_mark_buffer_dirty(c); 2940 btrfs_mark_buffer_dirty(c);
2162 2941
2163 old = root->node; 2942 old = root->node;
2943 tree_mod_log_set_root_pointer(root, c);
2164 rcu_assign_pointer(root->node, c); 2944 rcu_assign_pointer(root->node, c);
2165 2945
2166 /* the super has an extra ref to root->node */ 2946 /* the super has an extra ref to root->node */
@@ -2184,10 +2964,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2184static void insert_ptr(struct btrfs_trans_handle *trans, 2964static void insert_ptr(struct btrfs_trans_handle *trans,
2185 struct btrfs_root *root, struct btrfs_path *path, 2965 struct btrfs_root *root, struct btrfs_path *path,
2186 struct btrfs_disk_key *key, u64 bytenr, 2966 struct btrfs_disk_key *key, u64 bytenr,
2187 int slot, int level) 2967 int slot, int level, int tree_mod_log)
2188{ 2968{
2189 struct extent_buffer *lower; 2969 struct extent_buffer *lower;
2190 int nritems; 2970 int nritems;
2971 int ret;
2191 2972
2192 BUG_ON(!path->nodes[level]); 2973 BUG_ON(!path->nodes[level]);
2193 btrfs_assert_tree_locked(path->nodes[level]); 2974 btrfs_assert_tree_locked(path->nodes[level]);
@@ -2196,11 +2977,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
2196 BUG_ON(slot > nritems); 2977 BUG_ON(slot > nritems);
2197 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); 2978 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
2198 if (slot != nritems) { 2979 if (slot != nritems) {
2980 if (tree_mod_log && level)
2981 tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
2982 slot, nritems - slot);
2199 memmove_extent_buffer(lower, 2983 memmove_extent_buffer(lower,
2200 btrfs_node_key_ptr_offset(slot + 1), 2984 btrfs_node_key_ptr_offset(slot + 1),
2201 btrfs_node_key_ptr_offset(slot), 2985 btrfs_node_key_ptr_offset(slot),
2202 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2986 (nritems - slot) * sizeof(struct btrfs_key_ptr));
2203 } 2987 }
2988 if (tree_mod_log && level) {
2989 ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
2990 MOD_LOG_KEY_ADD);
2991 BUG_ON(ret < 0);
2992 }
2204 btrfs_set_node_key(lower, key, slot); 2993 btrfs_set_node_key(lower, key, slot);
2205 btrfs_set_node_blockptr(lower, slot, bytenr); 2994 btrfs_set_node_blockptr(lower, slot, bytenr);
2206 WARN_ON(trans->transid == 0); 2995 WARN_ON(trans->transid == 0);
@@ -2252,7 +3041,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2252 3041
2253 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3042 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2254 root->root_key.objectid, 3043 root->root_key.objectid,
2255 &disk_key, level, c->start, 0, 0); 3044 &disk_key, level, c->start, 0);
2256 if (IS_ERR(split)) 3045 if (IS_ERR(split))
2257 return PTR_ERR(split); 3046 return PTR_ERR(split);
2258 3047
@@ -2271,7 +3060,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2271 (unsigned long)btrfs_header_chunk_tree_uuid(split), 3060 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2272 BTRFS_UUID_SIZE); 3061 BTRFS_UUID_SIZE);
2273 3062
2274 3063 tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
2275 copy_extent_buffer(split, c, 3064 copy_extent_buffer(split, c,
2276 btrfs_node_key_ptr_offset(0), 3065 btrfs_node_key_ptr_offset(0),
2277 btrfs_node_key_ptr_offset(mid), 3066 btrfs_node_key_ptr_offset(mid),
@@ -2284,7 +3073,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2284 btrfs_mark_buffer_dirty(split); 3073 btrfs_mark_buffer_dirty(split);
2285 3074
2286 insert_ptr(trans, root, path, &disk_key, split->start, 3075 insert_ptr(trans, root, path, &disk_key, split->start,
2287 path->slots[level + 1] + 1, level + 1); 3076 path->slots[level + 1] + 1, level + 1, 1);
2288 3077
2289 if (path->slots[level] >= mid) { 3078 if (path->slots[level] >= mid) {
2290 path->slots[level] -= mid; 3079 path->slots[level] -= mid;
@@ -2821,7 +3610,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
2821 btrfs_set_header_nritems(l, mid); 3610 btrfs_set_header_nritems(l, mid);
2822 btrfs_item_key(right, &disk_key, 0); 3611 btrfs_item_key(right, &disk_key, 0);
2823 insert_ptr(trans, root, path, &disk_key, right->start, 3612 insert_ptr(trans, root, path, &disk_key, right->start,
2824 path->slots[1] + 1, 1); 3613 path->slots[1] + 1, 1, 0);
2825 3614
2826 btrfs_mark_buffer_dirty(right); 3615 btrfs_mark_buffer_dirty(right);
2827 btrfs_mark_buffer_dirty(l); 3616 btrfs_mark_buffer_dirty(l);
@@ -3004,7 +3793,7 @@ again:
3004 3793
3005 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 3794 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
3006 root->root_key.objectid, 3795 root->root_key.objectid,
3007 &disk_key, 0, l->start, 0, 0); 3796 &disk_key, 0, l->start, 0);
3008 if (IS_ERR(right)) 3797 if (IS_ERR(right))
3009 return PTR_ERR(right); 3798 return PTR_ERR(right);
3010 3799
@@ -3028,7 +3817,7 @@ again:
3028 if (mid <= slot) { 3817 if (mid <= slot) {
3029 btrfs_set_header_nritems(right, 0); 3818 btrfs_set_header_nritems(right, 0);
3030 insert_ptr(trans, root, path, &disk_key, right->start, 3819 insert_ptr(trans, root, path, &disk_key, right->start,
3031 path->slots[1] + 1, 1); 3820 path->slots[1] + 1, 1, 0);
3032 btrfs_tree_unlock(path->nodes[0]); 3821 btrfs_tree_unlock(path->nodes[0]);
3033 free_extent_buffer(path->nodes[0]); 3822 free_extent_buffer(path->nodes[0]);
3034 path->nodes[0] = right; 3823 path->nodes[0] = right;
@@ -3037,7 +3826,7 @@ again:
3037 } else { 3826 } else {
3038 btrfs_set_header_nritems(right, 0); 3827 btrfs_set_header_nritems(right, 0);
3039 insert_ptr(trans, root, path, &disk_key, right->start, 3828 insert_ptr(trans, root, path, &disk_key, right->start,
3040 path->slots[1], 1); 3829 path->slots[1], 1, 0);
3041 btrfs_tree_unlock(path->nodes[0]); 3830 btrfs_tree_unlock(path->nodes[0]);
3042 free_extent_buffer(path->nodes[0]); 3831 free_extent_buffer(path->nodes[0]);
3043 path->nodes[0] = right; 3832 path->nodes[0] = right;
@@ -3749,19 +4538,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3749 * empty a node. 4538 * empty a node.
3750 */ 4539 */
3751static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4540static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3752 struct btrfs_path *path, int level, int slot) 4541 struct btrfs_path *path, int level, int slot,
4542 int tree_mod_log)
3753{ 4543{
3754 struct extent_buffer *parent = path->nodes[level]; 4544 struct extent_buffer *parent = path->nodes[level];
3755 u32 nritems; 4545 u32 nritems;
4546 int ret;
3756 4547
3757 nritems = btrfs_header_nritems(parent); 4548 nritems = btrfs_header_nritems(parent);
3758 if (slot != nritems - 1) { 4549 if (slot != nritems - 1) {
4550 if (tree_mod_log && level)
4551 tree_mod_log_eb_move(root->fs_info, parent, slot,
4552 slot + 1, nritems - slot - 1);
3759 memmove_extent_buffer(parent, 4553 memmove_extent_buffer(parent,
3760 btrfs_node_key_ptr_offset(slot), 4554 btrfs_node_key_ptr_offset(slot),
3761 btrfs_node_key_ptr_offset(slot + 1), 4555 btrfs_node_key_ptr_offset(slot + 1),
3762 sizeof(struct btrfs_key_ptr) * 4556 sizeof(struct btrfs_key_ptr) *
3763 (nritems - slot - 1)); 4557 (nritems - slot - 1));
4558 } else if (tree_mod_log && level) {
4559 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4560 MOD_LOG_KEY_REMOVE);
4561 BUG_ON(ret < 0);
3764 } 4562 }
4563
3765 nritems--; 4564 nritems--;
3766 btrfs_set_header_nritems(parent, nritems); 4565 btrfs_set_header_nritems(parent, nritems);
3767 if (nritems == 0 && parent == root->node) { 4566 if (nritems == 0 && parent == root->node) {
@@ -3793,7 +4592,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
3793 struct extent_buffer *leaf) 4592 struct extent_buffer *leaf)
3794{ 4593{
3795 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4594 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
3796 del_ptr(trans, root, path, 1, path->slots[1]); 4595 del_ptr(trans, root, path, 1, path->slots[1], 1);
3797 4596
3798 /* 4597 /*
3799 * btrfs_free_extent is expensive, we want to make sure we 4598 * btrfs_free_extent is expensive, we want to make sure we
@@ -3804,7 +4603,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
3804 root_sub_used(root, leaf->len); 4603 root_sub_used(root, leaf->len);
3805 4604
3806 extent_buffer_get(leaf); 4605 extent_buffer_get(leaf);
3807 btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); 4606 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3808 free_extent_buffer_stale(leaf); 4607 free_extent_buffer_stale(leaf);
3809} 4608}
3810/* 4609/*
@@ -4271,7 +5070,7 @@ again:
4271 next = c; 5070 next = c;
4272 next_rw_lock = path->locks[level]; 5071 next_rw_lock = path->locks[level];
4273 ret = read_block_for_search(NULL, root, path, &next, level, 5072 ret = read_block_for_search(NULL, root, path, &next, level,
4274 slot, &key); 5073 slot, &key, 0);
4275 if (ret == -EAGAIN) 5074 if (ret == -EAGAIN)
4276 goto again; 5075 goto again;
4277 5076
@@ -4308,7 +5107,7 @@ again:
4308 break; 5107 break;
4309 5108
4310 ret = read_block_for_search(NULL, root, path, &next, level, 5109 ret = read_block_for_search(NULL, root, path, &next, level,
4311 0, &key); 5110 0, &key, 0);
4312 if (ret == -EAGAIN) 5111 if (ret == -EAGAIN)
4313 goto again; 5112 goto again;
4314 5113
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fd72331d600..0151ca1ac657 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -173,6 +173,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
173#define BTRFS_FT_XATTR 8 173#define BTRFS_FT_XATTR 8
174#define BTRFS_FT_MAX 9 174#define BTRFS_FT_MAX 9
175 175
176/* ioprio of readahead is set to idle */
177#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
178
176/* 179/*
177 * The key defines the order in the tree, and so it also defines (optimal) 180 * The key defines the order in the tree, and so it also defines (optimal)
178 * block layout. 181 * block layout.
@@ -823,6 +826,14 @@ struct btrfs_csum_item {
823 u8 csum; 826 u8 csum;
824} __attribute__ ((__packed__)); 827} __attribute__ ((__packed__));
825 828
829struct btrfs_dev_stats_item {
830 /*
831 * grow this item struct at the end for future enhancements and keep
832 * the existing values unchanged
833 */
834 __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
835} __attribute__ ((__packed__));
836
826/* different types of block groups (and chunks) */ 837/* different types of block groups (and chunks) */
827#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) 838#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
828#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) 839#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -1129,6 +1140,15 @@ struct btrfs_fs_info {
1129 spinlock_t delayed_iput_lock; 1140 spinlock_t delayed_iput_lock;
1130 struct list_head delayed_iputs; 1141 struct list_head delayed_iputs;
1131 1142
1143 /* this protects tree_mod_seq_list */
1144 spinlock_t tree_mod_seq_lock;
1145 atomic_t tree_mod_seq;
1146 struct list_head tree_mod_seq_list;
1147
1148 /* this protects tree_mod_log */
1149 rwlock_t tree_mod_log_lock;
1150 struct rb_root tree_mod_log;
1151
1132 atomic_t nr_async_submits; 1152 atomic_t nr_async_submits;
1133 atomic_t async_submit_draining; 1153 atomic_t async_submit_draining;
1134 atomic_t nr_async_bios; 1154 atomic_t nr_async_bios;
@@ -1375,7 +1395,7 @@ struct btrfs_root {
1375 struct list_head root_list; 1395 struct list_head root_list;
1376 1396
1377 spinlock_t orphan_lock; 1397 spinlock_t orphan_lock;
1378 struct list_head orphan_list; 1398 atomic_t orphan_inodes;
1379 struct btrfs_block_rsv *orphan_block_rsv; 1399 struct btrfs_block_rsv *orphan_block_rsv;
1380 int orphan_item_inserted; 1400 int orphan_item_inserted;
1381 int orphan_cleanup_state; 1401 int orphan_cleanup_state;
@@ -1508,6 +1528,12 @@ struct btrfs_ioctl_defrag_range_args {
1508#define BTRFS_BALANCE_ITEM_KEY 248 1528#define BTRFS_BALANCE_ITEM_KEY 248
1509 1529
1510/* 1530/*
1531 * Persistantly stores the io stats in the device tree.
1532 * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
1533 */
1534#define BTRFS_DEV_STATS_KEY 249
1535
1536/*
1511 * string items are for debugging. They just store a short string of 1537 * string items are for debugging. They just store a short string of
1512 * data in the FS 1538 * data in the FS
1513 */ 1539 */
@@ -2415,6 +2441,30 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
2415 return btrfs_item_size(eb, e) - offset; 2441 return btrfs_item_size(eb, e) - offset;
2416} 2442}
2417 2443
2444/* btrfs_dev_stats_item */
2445static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
2446 struct btrfs_dev_stats_item *ptr,
2447 int index)
2448{
2449 u64 val;
2450
2451 read_extent_buffer(eb, &val,
2452 offsetof(struct btrfs_dev_stats_item, values) +
2453 ((unsigned long)ptr) + (index * sizeof(u64)),
2454 sizeof(val));
2455 return val;
2456}
2457
2458static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
2459 struct btrfs_dev_stats_item *ptr,
2460 int index, u64 val)
2461{
2462 write_extent_buffer(eb, &val,
2463 offsetof(struct btrfs_dev_stats_item, values) +
2464 ((unsigned long)ptr) + (index * sizeof(u64)),
2465 sizeof(val));
2466}
2467
2418static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2468static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2419{ 2469{
2420 return sb->s_fs_info; 2470 return sb->s_fs_info;
@@ -2496,11 +2546,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2496 struct btrfs_root *root, u32 blocksize, 2546 struct btrfs_root *root, u32 blocksize,
2497 u64 parent, u64 root_objectid, 2547 u64 parent, u64 root_objectid,
2498 struct btrfs_disk_key *key, int level, 2548 struct btrfs_disk_key *key, int level,
2499 u64 hint, u64 empty_size, int for_cow); 2549 u64 hint, u64 empty_size);
2500void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2550void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
2501 struct btrfs_root *root, 2551 struct btrfs_root *root,
2502 struct extent_buffer *buf, 2552 struct extent_buffer *buf,
2503 u64 parent, int last_ref, int for_cow); 2553 u64 parent, int last_ref);
2504struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2554struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2505 struct btrfs_root *root, 2555 struct btrfs_root *root,
2506 u64 bytenr, u32 blocksize, 2556 u64 bytenr, u32 blocksize,
@@ -2659,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
2659int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root 2709int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2660 *root, struct btrfs_key *key, struct btrfs_path *p, int 2710 *root, struct btrfs_key *key, struct btrfs_path *p, int
2661 ins_len, int cow); 2711 ins_len, int cow);
2712int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2713 struct btrfs_path *p, u64 time_seq);
2662int btrfs_realloc_node(struct btrfs_trans_handle *trans, 2714int btrfs_realloc_node(struct btrfs_trans_handle *trans,
2663 struct btrfs_root *root, struct extent_buffer *parent, 2715 struct btrfs_root *root, struct extent_buffer *parent,
2664 int start_slot, int cache_only, u64 *last_ret, 2716 int start_slot, int cache_only, u64 *last_ret,
@@ -3098,4 +3150,23 @@ void btrfs_reada_detach(void *handle);
3098int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 3150int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
3099 u64 start, int err); 3151 u64 start, int err);
3100 3152
3153/* delayed seq elem */
3154struct seq_list {
3155 struct list_head list;
3156 u64 seq;
3157 u32 flags;
3158};
3159
3160void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
3161 struct seq_list *elem);
3162void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
3163 struct seq_list *elem);
3164
3165static inline int is_fstree(u64 rootid)
3166{
3167 if (rootid == BTRFS_FS_TREE_OBJECTID ||
3168 (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
3169 return 1;
3170 return 0;
3171}
3101#endif 3172#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 03e3748d84d0..c18d0442ae6d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -669,8 +669,8 @@ static int btrfs_delayed_inode_reserve_metadata(
669 return ret; 669 return ret;
670 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 670 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
671 spin_lock(&BTRFS_I(inode)->lock); 671 spin_lock(&BTRFS_I(inode)->lock);
672 if (BTRFS_I(inode)->delalloc_meta_reserved) { 672 if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
673 BTRFS_I(inode)->delalloc_meta_reserved = 0; 673 &BTRFS_I(inode)->runtime_flags)) {
674 spin_unlock(&BTRFS_I(inode)->lock); 674 spin_unlock(&BTRFS_I(inode)->lock);
675 release = true; 675 release = true;
676 goto migrate; 676 goto migrate;
@@ -1706,7 +1706,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
1706 btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); 1706 btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
1707 btrfs_set_stack_inode_generation(inode_item, 1707 btrfs_set_stack_inode_generation(inode_item,
1708 BTRFS_I(inode)->generation); 1708 BTRFS_I(inode)->generation);
1709 btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence); 1709 btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
1710 btrfs_set_stack_inode_transid(inode_item, trans->transid); 1710 btrfs_set_stack_inode_transid(inode_item, trans->transid);
1711 btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); 1711 btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
1712 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); 1712 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
@@ -1754,7 +1754,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1754 set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); 1754 set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
1755 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); 1755 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
1756 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); 1756 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
1757 BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); 1757 inode->i_version = btrfs_stack_inode_sequence(inode_item);
1758 inode->i_rdev = 0; 1758 inode->i_rdev = 0;
1759 *rdev = btrfs_stack_inode_rdev(inode_item); 1759 *rdev = btrfs_stack_inode_rdev(inode_item);
1760 BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); 1760 BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 69f22e3ab3bc..13ae7b04790e 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
525 ref->is_head = 0; 525 ref->is_head = 0;
526 ref->in_tree = 1; 526 ref->in_tree = 1;
527 527
528 if (need_ref_seq(for_cow, ref_root)) 528 if (is_fstree(ref_root))
529 seq = inc_delayed_seq(delayed_refs); 529 seq = inc_delayed_seq(delayed_refs);
530 ref->seq = seq; 530 ref->seq = seq;
531 531
@@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
584 ref->is_head = 0; 584 ref->is_head = 0;
585 ref->in_tree = 1; 585 ref->in_tree = 1;
586 586
587 if (need_ref_seq(for_cow, ref_root)) 587 if (is_fstree(ref_root))
588 seq = inc_delayed_seq(delayed_refs); 588 seq = inc_delayed_seq(delayed_refs);
589 ref->seq = seq; 589 ref->seq = seq;
590 590
@@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
658 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, 658 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
659 num_bytes, parent, ref_root, level, action, 659 num_bytes, parent, ref_root, level, action,
660 for_cow); 660 for_cow);
661 if (!need_ref_seq(for_cow, ref_root) && 661 if (!is_fstree(ref_root) &&
662 waitqueue_active(&delayed_refs->seq_wait)) 662 waitqueue_active(&delayed_refs->seq_wait))
663 wake_up(&delayed_refs->seq_wait); 663 wake_up(&delayed_refs->seq_wait);
664 spin_unlock(&delayed_refs->lock); 664 spin_unlock(&delayed_refs->lock);
665
665 return 0; 666 return 0;
666} 667}
667 668
@@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
706 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, 707 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
707 num_bytes, parent, ref_root, owner, offset, 708 num_bytes, parent, ref_root, owner, offset,
708 action, for_cow); 709 action, for_cow);
709 if (!need_ref_seq(for_cow, ref_root) && 710 if (!is_fstree(ref_root) &&
710 waitqueue_active(&delayed_refs->seq_wait)) 711 waitqueue_active(&delayed_refs->seq_wait))
711 wake_up(&delayed_refs->seq_wait); 712 wake_up(&delayed_refs->seq_wait);
712 spin_unlock(&delayed_refs->lock); 713 spin_unlock(&delayed_refs->lock);
714
713 return 0; 715 return 0;
714} 716}
715 717
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d8f244d94925..413927fb9957 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
196 struct list_head *cluster, u64 search_start); 196 struct list_head *cluster, u64 search_start);
197 197
198struct seq_list {
199 struct list_head list;
200 u64 seq;
201};
202
203static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) 198static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
204{ 199{
205 assert_spin_locked(&delayed_refs->lock); 200 assert_spin_locked(&delayed_refs->lock);
@@ -230,25 +225,6 @@ int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
230 u64 seq); 225 u64 seq);
231 226
232/* 227/*
233 * delayed refs with a ref_seq > 0 must be held back during backref walking.
234 * this only applies to items in one of the fs-trees. for_cow items never need
235 * to be held back, so they won't get a ref_seq number.
236 */
237static inline int need_ref_seq(int for_cow, u64 rootid)
238{
239 if (for_cow)
240 return 0;
241
242 if (rootid == BTRFS_FS_TREE_OBJECTID)
243 return 1;
244
245 if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
246 return 1;
247
248 return 0;
249}
250
251/*
252 * a node might live in a head or a regular ref, this lets you 228 * a node might live in a head or a regular ref, this lets you
253 * test for the proper type to use. 229 * test for the proper type to use.
254 */ 230 */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e1fe74a2ce16..7ae51decf6d3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1153 root->orphan_block_rsv = NULL; 1153 root->orphan_block_rsv = NULL;
1154 1154
1155 INIT_LIST_HEAD(&root->dirty_list); 1155 INIT_LIST_HEAD(&root->dirty_list);
1156 INIT_LIST_HEAD(&root->orphan_list);
1157 INIT_LIST_HEAD(&root->root_list); 1156 INIT_LIST_HEAD(&root->root_list);
1158 spin_lock_init(&root->orphan_lock); 1157 spin_lock_init(&root->orphan_lock);
1159 spin_lock_init(&root->inode_lock); 1158 spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1166 atomic_set(&root->log_commit[0], 0); 1165 atomic_set(&root->log_commit[0], 0);
1167 atomic_set(&root->log_commit[1], 0); 1166 atomic_set(&root->log_commit[1], 0);
1168 atomic_set(&root->log_writers, 0); 1167 atomic_set(&root->log_writers, 0);
1168 atomic_set(&root->orphan_inodes, 0);
1169 root->log_batch = 0; 1169 root->log_batch = 0;
1170 root->log_transid = 0; 1170 root->log_transid = 0;
1171 root->last_log_commit = 0; 1171 root->last_log_commit = 0;
@@ -1252,7 +1252,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1252 1252
1253 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1253 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1254 BTRFS_TREE_LOG_OBJECTID, NULL, 1254 BTRFS_TREE_LOG_OBJECTID, NULL,
1255 0, 0, 0, 0); 1255 0, 0, 0);
1256 if (IS_ERR(leaf)) { 1256 if (IS_ERR(leaf)) {
1257 kfree(root); 1257 kfree(root);
1258 return ERR_CAST(leaf); 1258 return ERR_CAST(leaf);
@@ -1914,11 +1914,14 @@ int open_ctree(struct super_block *sb,
1914 spin_lock_init(&fs_info->delayed_iput_lock); 1914 spin_lock_init(&fs_info->delayed_iput_lock);
1915 spin_lock_init(&fs_info->defrag_inodes_lock); 1915 spin_lock_init(&fs_info->defrag_inodes_lock);
1916 spin_lock_init(&fs_info->free_chunk_lock); 1916 spin_lock_init(&fs_info->free_chunk_lock);
1917 spin_lock_init(&fs_info->tree_mod_seq_lock);
1918 rwlock_init(&fs_info->tree_mod_log_lock);
1917 mutex_init(&fs_info->reloc_mutex); 1919 mutex_init(&fs_info->reloc_mutex);
1918 1920
1919 init_completion(&fs_info->kobj_unregister); 1921 init_completion(&fs_info->kobj_unregister);
1920 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1922 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1921 INIT_LIST_HEAD(&fs_info->space_info); 1923 INIT_LIST_HEAD(&fs_info->space_info);
1924 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
1922 btrfs_mapping_init(&fs_info->mapping_tree); 1925 btrfs_mapping_init(&fs_info->mapping_tree);
1923 btrfs_init_block_rsv(&fs_info->global_block_rsv); 1926 btrfs_init_block_rsv(&fs_info->global_block_rsv);
1924 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); 1927 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
@@ -1931,12 +1934,14 @@ int open_ctree(struct super_block *sb,
1931 atomic_set(&fs_info->async_submit_draining, 0); 1934 atomic_set(&fs_info->async_submit_draining, 0);
1932 atomic_set(&fs_info->nr_async_bios, 0); 1935 atomic_set(&fs_info->nr_async_bios, 0);
1933 atomic_set(&fs_info->defrag_running, 0); 1936 atomic_set(&fs_info->defrag_running, 0);
1937 atomic_set(&fs_info->tree_mod_seq, 0);
1934 fs_info->sb = sb; 1938 fs_info->sb = sb;
1935 fs_info->max_inline = 8192 * 1024; 1939 fs_info->max_inline = 8192 * 1024;
1936 fs_info->metadata_ratio = 0; 1940 fs_info->metadata_ratio = 0;
1937 fs_info->defrag_inodes = RB_ROOT; 1941 fs_info->defrag_inodes = RB_ROOT;
1938 fs_info->trans_no_join = 0; 1942 fs_info->trans_no_join = 0;
1939 fs_info->free_chunk_space = 0; 1943 fs_info->free_chunk_space = 0;
1944 fs_info->tree_mod_log = RB_ROOT;
1940 1945
1941 /* readahead state */ 1946 /* readahead state */
1942 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); 1947 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
@@ -2001,7 +2006,8 @@ int open_ctree(struct super_block *sb,
2001 BTRFS_I(fs_info->btree_inode)->root = tree_root; 2006 BTRFS_I(fs_info->btree_inode)->root = tree_root;
2002 memset(&BTRFS_I(fs_info->btree_inode)->location, 0, 2007 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
2003 sizeof(struct btrfs_key)); 2008 sizeof(struct btrfs_key));
2004 BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; 2009 set_bit(BTRFS_INODE_DUMMY,
2010 &BTRFS_I(fs_info->btree_inode)->runtime_flags);
2005 insert_inode_hash(fs_info->btree_inode); 2011 insert_inode_hash(fs_info->btree_inode);
2006 2012
2007 spin_lock_init(&fs_info->block_group_cache_lock); 2013 spin_lock_init(&fs_info->block_group_cache_lock);
@@ -2353,6 +2359,13 @@ retry_root_backup:
2353 fs_info->generation = generation; 2359 fs_info->generation = generation;
2354 fs_info->last_trans_committed = generation; 2360 fs_info->last_trans_committed = generation;
2355 2361
2362 ret = btrfs_init_dev_stats(fs_info);
2363 if (ret) {
2364 printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
2365 ret);
2366 goto fail_block_groups;
2367 }
2368
2356 ret = btrfs_init_space_info(fs_info); 2369 ret = btrfs_init_space_info(fs_info);
2357 if (ret) { 2370 if (ret) {
2358 printk(KERN_ERR "Failed to initial space info: %d\n", ret); 2371 printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2556,18 +2569,19 @@ recovery_tree_root:
2556 2569
2557static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2570static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2558{ 2571{
2559 char b[BDEVNAME_SIZE];
2560
2561 if (uptodate) { 2572 if (uptodate) {
2562 set_buffer_uptodate(bh); 2573 set_buffer_uptodate(bh);
2563 } else { 2574 } else {
2575 struct btrfs_device *device = (struct btrfs_device *)
2576 bh->b_private;
2577
2564 printk_ratelimited(KERN_WARNING "lost page write due to " 2578 printk_ratelimited(KERN_WARNING "lost page write due to "
2565 "I/O error on %s\n", 2579 "I/O error on %s\n", device->name);
2566 bdevname(bh->b_bdev, b));
2567 /* note, we dont' set_buffer_write_io_error because we have 2580 /* note, we dont' set_buffer_write_io_error because we have
2568 * our own ways of dealing with the IO errors 2581 * our own ways of dealing with the IO errors
2569 */ 2582 */
2570 clear_buffer_uptodate(bh); 2583 clear_buffer_uptodate(bh);
2584 btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
2571 } 2585 }
2572 unlock_buffer(bh); 2586 unlock_buffer(bh);
2573 put_bh(bh); 2587 put_bh(bh);
@@ -2682,6 +2696,7 @@ static int write_dev_supers(struct btrfs_device *device,
2682 set_buffer_uptodate(bh); 2696 set_buffer_uptodate(bh);
2683 lock_buffer(bh); 2697 lock_buffer(bh);
2684 bh->b_end_io = btrfs_end_buffer_write_sync; 2698 bh->b_end_io = btrfs_end_buffer_write_sync;
2699 bh->b_private = device;
2685 } 2700 }
2686 2701
2687 /* 2702 /*
@@ -2740,6 +2755,9 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2740 } 2755 }
2741 if (!bio_flagged(bio, BIO_UPTODATE)) { 2756 if (!bio_flagged(bio, BIO_UPTODATE)) {
2742 ret = -EIO; 2757 ret = -EIO;
2758 if (!bio_flagged(bio, BIO_EOPNOTSUPP))
2759 btrfs_dev_stat_inc_and_print(device,
2760 BTRFS_DEV_STAT_FLUSH_ERRS);
2743 } 2761 }
2744 2762
2745 /* drop the reference from the wait == 0 run */ 2763 /* drop the reference from the wait == 0 run */
@@ -2902,19 +2920,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
2902 return ret; 2920 return ret;
2903} 2921}
2904 2922
2905/* Kill all outstanding I/O */
2906void btrfs_abort_devices(struct btrfs_root *root)
2907{
2908 struct list_head *head;
2909 struct btrfs_device *dev;
2910 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2911 head = &root->fs_info->fs_devices->devices;
2912 list_for_each_entry_rcu(dev, head, dev_list) {
2913 blk_abort_queue(dev->bdev->bd_disk->queue);
2914 }
2915 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2916}
2917
2918void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 2923void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2919{ 2924{
2920 spin_lock(&fs_info->fs_roots_radix_lock); 2925 spin_lock(&fs_info->fs_roots_radix_lock);
@@ -3671,17 +3676,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
3671 return 0; 3676 return 0;
3672} 3677}
3673 3678
3674static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page,
3675 u64 start, u64 end,
3676 struct extent_state *state)
3677{
3678 struct super_block *sb = page->mapping->host->i_sb;
3679 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
3680 btrfs_error(fs_info, -EIO,
3681 "Error occured while writing out btree at %llu", start);
3682 return -EIO;
3683}
3684
3685static struct extent_io_ops btree_extent_io_ops = { 3679static struct extent_io_ops btree_extent_io_ops = {
3686 .write_cache_pages_lock_hook = btree_lock_page_hook, 3680 .write_cache_pages_lock_hook = btree_lock_page_hook,
3687 .readpage_end_io_hook = btree_readpage_end_io_hook, 3681 .readpage_end_io_hook = btree_readpage_end_io_hook,
@@ -3689,5 +3683,4 @@ static struct extent_io_ops btree_extent_io_ops = {
3689 .submit_bio_hook = btree_submit_bio_hook, 3683 .submit_bio_hook = btree_submit_bio_hook,
3690 /* note we're sharing with inode.c for the merge bio hook */ 3684 /* note we're sharing with inode.c for the merge bio hook */
3691 .merge_bio_hook = btrfs_merge_bio_hook, 3685 .merge_bio_hook = btrfs_merge_bio_hook,
3692 .writepage_io_failed_hook = btree_writepage_io_failed_hook,
3693}; 3686};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index ab1830aaf0ed..05b3fab39f7e 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -89,7 +89,6 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
89int btrfs_cleanup_transaction(struct btrfs_root *root); 89int btrfs_cleanup_transaction(struct btrfs_root *root);
90void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, 90void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
91 struct btrfs_root *root); 91 struct btrfs_root *root);
92void btrfs_abort_devices(struct btrfs_root *root);
93 92
94#ifdef CONFIG_DEBUG_LOCK_ALLOC 93#ifdef CONFIG_DEBUG_LOCK_ALLOC
95void btrfs_init_lockdep(void); 94void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 49fd7b66d57b..4b5a1e1bdefb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3578,7 +3578,7 @@ again:
3578 space_info->chunk_alloc = 0; 3578 space_info->chunk_alloc = 0;
3579 spin_unlock(&space_info->lock); 3579 spin_unlock(&space_info->lock);
3580out: 3580out:
3581 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3581 mutex_unlock(&fs_info->chunk_mutex);
3582 return ret; 3582 return ret;
3583} 3583}
3584 3584
@@ -4355,10 +4355,9 @@ static unsigned drop_outstanding_extent(struct inode *inode)
4355 BTRFS_I(inode)->outstanding_extents--; 4355 BTRFS_I(inode)->outstanding_extents--;
4356 4356
4357 if (BTRFS_I(inode)->outstanding_extents == 0 && 4357 if (BTRFS_I(inode)->outstanding_extents == 0 &&
4358 BTRFS_I(inode)->delalloc_meta_reserved) { 4358 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4359 &BTRFS_I(inode)->runtime_flags))
4359 drop_inode_space = 1; 4360 drop_inode_space = 1;
4360 BTRFS_I(inode)->delalloc_meta_reserved = 0;
4361 }
4362 4361
4363 /* 4362 /*
4364 * If we have at least as many outstanding extents as we have 4363
@@ -4465,7 +4464,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4465 * Add an item to reserve for updating the inode when we complete the 4464 * Add an item to reserve for updating the inode when we complete the
4466 * delalloc io. 4465 * delalloc io.
4467 */ 4466 */
4468 if (!BTRFS_I(inode)->delalloc_meta_reserved) { 4467 if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4468 &BTRFS_I(inode)->runtime_flags)) {
4469 nr_extents++; 4469 nr_extents++;
4470 extra_reserve = 1; 4470 extra_reserve = 1;
4471 } 4471 }
@@ -4511,7 +4511,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4511 4511
4512 spin_lock(&BTRFS_I(inode)->lock); 4512 spin_lock(&BTRFS_I(inode)->lock);
4513 if (extra_reserve) { 4513 if (extra_reserve) {
4514 BTRFS_I(inode)->delalloc_meta_reserved = 1; 4514 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4515 &BTRFS_I(inode)->runtime_flags);
4515 nr_extents--; 4516 nr_extents--;
4516 } 4517 }
4517 BTRFS_I(inode)->reserved_extents += nr_extents; 4518 BTRFS_I(inode)->reserved_extents += nr_extents;
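
These hunks replace standalone boolean fields (delalloc_meta_reserved and friends) with bits in a single runtime_flags word, so the check-and-update sequences above collapse into one atomic operation instead of a flag read plus write under a spinlock. A minimal sketch of the same idiom, with C11 atomics standing in for the kernel's test_and_set_bit/test_and_clear_bit:

    #include <stdatomic.h>
    #include <stdio.h>

    enum { INODE_DELALLOC_META_RESERVED = 0, INODE_IN_DEFRAG = 1 };

    /* emulate test_and_set_bit/test_and_clear_bit with C11 atomics */
    static int test_and_set_bit(int nr, atomic_ulong *flags)
    {
        unsigned long old = atomic_fetch_or(flags, 1UL << nr);
        return (old >> nr) & 1;
    }

    static int test_and_clear_bit(int nr, atomic_ulong *flags)
    {
        unsigned long old = atomic_fetch_and(flags, ~(1UL << nr));
        return (old >> nr) & 1;
    }

    int main(void)
    {
        atomic_ulong runtime_flags = 0;

        /* reserve once: only the first caller sees 0 and reserves */
        if (!test_and_set_bit(INODE_DELALLOC_META_RESERVED, &runtime_flags))
            printf("reserved inode update item\n");

        /* release: read-and-clear in one step, no lock around a bool */
        if (test_and_clear_bit(INODE_DELALLOC_META_RESERVED, &runtime_flags))
            printf("dropped inode space reservation\n");
        return 0;
    }
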
@@ -5217,7 +5218,7 @@ out:
5217void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5218void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5218 struct btrfs_root *root, 5219 struct btrfs_root *root,
5219 struct extent_buffer *buf, 5220 struct extent_buffer *buf,
5220 u64 parent, int last_ref, int for_cow) 5221 u64 parent, int last_ref)
5221{ 5222{
5222 struct btrfs_block_group_cache *cache = NULL; 5223 struct btrfs_block_group_cache *cache = NULL;
5223 int ret; 5224 int ret;
@@ -5227,7 +5228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5227 buf->start, buf->len, 5228 buf->start, buf->len,
5228 parent, root->root_key.objectid, 5229 parent, root->root_key.objectid,
5229 btrfs_header_level(buf), 5230 btrfs_header_level(buf),
5230 BTRFS_DROP_DELAYED_REF, NULL, for_cow); 5231 BTRFS_DROP_DELAYED_REF, NULL, 0);
5231 BUG_ON(ret); /* -ENOMEM */ 5232 BUG_ON(ret); /* -ENOMEM */
5232 } 5233 }
5233 5234
@@ -6249,7 +6250,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6249 struct btrfs_root *root, u32 blocksize, 6250 struct btrfs_root *root, u32 blocksize,
6250 u64 parent, u64 root_objectid, 6251 u64 parent, u64 root_objectid,
6251 struct btrfs_disk_key *key, int level, 6252 struct btrfs_disk_key *key, int level,
6252 u64 hint, u64 empty_size, int for_cow) 6253 u64 hint, u64 empty_size)
6253{ 6254{
6254 struct btrfs_key ins; 6255 struct btrfs_key ins;
6255 struct btrfs_block_rsv *block_rsv; 6256 struct btrfs_block_rsv *block_rsv;
@@ -6297,7 +6298,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6297 ins.objectid, 6298 ins.objectid,
6298 ins.offset, parent, root_objectid, 6299 ins.offset, parent, root_objectid,
6299 level, BTRFS_ADD_DELAYED_EXTENT, 6300 level, BTRFS_ADD_DELAYED_EXTENT,
6300 extent_op, for_cow); 6301 extent_op, 0);
6301 BUG_ON(ret); /* -ENOMEM */ 6302 BUG_ON(ret); /* -ENOMEM */
6302 } 6303 }
6303 return buf; 6304 return buf;
@@ -6715,7 +6716,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6715 btrfs_header_owner(path->nodes[level + 1])); 6716 btrfs_header_owner(path->nodes[level + 1]));
6716 } 6717 }
6717 6718
6718 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); 6719 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6719out: 6720out:
6720 wc->refs[level] = 0; 6721 wc->refs[level] = 0;
6721 wc->flags[level] = 0; 6722 wc->flags[level] = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c9018a05036e..2c8f7b204617 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -186,7 +186,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
186 return parent; 186 return parent;
187 } 187 }
188 188
189 entry = rb_entry(node, struct tree_entry, rb_node);
190 rb_link_node(node, parent, p); 189 rb_link_node(node, parent, p);
191 rb_insert_color(node, root); 190 rb_insert_color(node, root);
192 return NULL; 191 return NULL;
@@ -413,7 +412,7 @@ static struct extent_state *next_state(struct extent_state *state)
413 412
414/* 413/*
415 * utility function to clear some bits in an extent state struct. 414 * utility function to clear some bits in an extent state struct.
416 * it will optionally wake up anyone waiting on this state (wake == 1) 415 * it will optionally wake up anyone waiting on this state (wake == 1).
417 * 416 *
418 * If no bits are set on the state struct after clearing things, the 417 * If no bits are set on the state struct after clearing things, the
419 * struct is freed and removed from the tree 418 * struct is freed and removed from the tree
@@ -570,10 +569,8 @@ hit_next:
570 if (err) 569 if (err)
571 goto out; 570 goto out;
572 if (state->end <= end) { 571 if (state->end <= end) {
573 clear_state_bit(tree, state, &bits, wake); 572 state = clear_state_bit(tree, state, &bits, wake);
574 if (last_end == (u64)-1) 573 goto next;
575 goto out;
576 start = last_end + 1;
577 } 574 }
578 goto search_again; 575 goto search_again;
579 } 576 }
@@ -781,7 +778,6 @@ hit_next:
781 * Just lock what we found and keep going 778 * Just lock what we found and keep going
782 */ 779 */
783 if (state->start == start && state->end <= end) { 780 if (state->start == start && state->end <= end) {
784 struct rb_node *next_node;
785 if (state->state & exclusive_bits) { 781 if (state->state & exclusive_bits) {
786 *failed_start = state->start; 782 *failed_start = state->start;
787 err = -EEXIST; 783 err = -EEXIST;
@@ -789,20 +785,15 @@ hit_next:
789 } 785 }
790 786
791 set_state_bits(tree, state, &bits); 787 set_state_bits(tree, state, &bits);
792
793 cache_state(state, cached_state); 788 cache_state(state, cached_state);
794 merge_state(tree, state); 789 merge_state(tree, state);
795 if (last_end == (u64)-1) 790 if (last_end == (u64)-1)
796 goto out; 791 goto out;
797
798 start = last_end + 1; 792 start = last_end + 1;
799 next_node = rb_next(&state->rb_node); 793 state = next_state(state);
800 if (next_node && start < end && prealloc && !need_resched()) { 794 if (start < end && state && state->start == start &&
801 state = rb_entry(next_node, struct extent_state, 795 !need_resched())
802 rb_node); 796 goto hit_next;
803 if (state->start == start)
804 goto hit_next;
805 }
806 goto search_again; 797 goto search_again;
807 } 798 }
808 799
@@ -845,6 +836,10 @@ hit_next:
845 if (last_end == (u64)-1) 836 if (last_end == (u64)-1)
846 goto out; 837 goto out;
847 start = last_end + 1; 838 start = last_end + 1;
839 state = next_state(state);
840 if (start < end && state && state->start == start &&
841 !need_resched())
842 goto hit_next;
848 } 843 }
849 goto search_again; 844 goto search_again;
850 } 845 }
@@ -994,21 +989,14 @@ hit_next:
994 * Just lock what we found and keep going 989 * Just lock what we found and keep going
995 */ 990 */
996 if (state->start == start && state->end <= end) { 991 if (state->start == start && state->end <= end) {
997 struct rb_node *next_node;
998
999 set_state_bits(tree, state, &bits); 992 set_state_bits(tree, state, &bits);
1000 clear_state_bit(tree, state, &clear_bits, 0); 993 state = clear_state_bit(tree, state, &clear_bits, 0);
1001 if (last_end == (u64)-1) 994 if (last_end == (u64)-1)
1002 goto out; 995 goto out;
1003
1004 start = last_end + 1; 996 start = last_end + 1;
1005 next_node = rb_next(&state->rb_node); 997 if (start < end && state && state->start == start &&
1006 if (next_node && start < end && prealloc && !need_resched()) { 998 !need_resched())
1007 state = rb_entry(next_node, struct extent_state, 999 goto hit_next;
1008 rb_node);
1009 if (state->start == start)
1010 goto hit_next;
1011 }
1012 goto search_again; 1000 goto search_again;
1013 } 1001 }
1014 1002
@@ -1042,10 +1030,13 @@ hit_next:
1042 goto out; 1030 goto out;
1043 if (state->end <= end) { 1031 if (state->end <= end) {
1044 set_state_bits(tree, state, &bits); 1032 set_state_bits(tree, state, &bits);
1045 clear_state_bit(tree, state, &clear_bits, 0); 1033 state = clear_state_bit(tree, state, &clear_bits, 0);
1046 if (last_end == (u64)-1) 1034 if (last_end == (u64)-1)
1047 goto out; 1035 goto out;
1048 start = last_end + 1; 1036 start = last_end + 1;
1037 if (start < end && state && state->start == start &&
1038 !need_resched())
1039 goto hit_next;
1049 } 1040 }
1050 goto search_again; 1041 goto search_again;
1051 } 1042 }
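
The recurring change in these extent_io.c hunks is the next_state() fast path: instead of an rb_next() lookup for every extent state, clear_state_bit and friends hand back the following state, and the loop jumps straight to hit_next while states stay contiguous. A toy sketch of that control flow on a sorted singly linked list — the kernel walks an rbtree, this only shows the shape of the fast path:

    #include <stdio.h>

    /* illustrative stand-in for struct extent_state: [start, end] */
    struct state {
        unsigned long start, end;
        struct state *next;   /* plays the role of next_state() */
    };

    static void walk_range(struct state *s, unsigned long start,
                           unsigned long end)
    {
    hit_next:
        if (!s || s->start > end)
            return;                       /* past the range, or a gap */
        printf("visit [%lu, %lu]\n", s->start, s->end);
        start = s->end + 1;
        s = s->next;
        /* fast path from the diff: the following state begins exactly
         * where we left off, so continue without re-searching */
        if (start < end && s && s->start == start)
            goto hit_next;
        /* otherwise the kernel code would goto search_again */
    }

    int main(void)
    {
        struct state c = { 20, 29, NULL };
        struct state b = { 10, 19, &c };
        struct state a = {  0,  9, &b };
        walk_range(&a, 0, 29);
        return 0;
    }
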
@@ -1173,9 +1164,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1173 cached_state, mask); 1164 cached_state, mask);
1174} 1165}
1175 1166
1176static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 1167int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1177 u64 end, struct extent_state **cached_state, 1168 struct extent_state **cached_state, gfp_t mask)
1178 gfp_t mask)
1179{ 1169{
1180 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 1170 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1181 cached_state, mask); 1171 cached_state, mask);
@@ -1293,7 +1283,7 @@ out:
1293 * returned if we find something, and *start_ret and *end_ret are 1283 * returned if we find something, and *start_ret and *end_ret are
1294 * set to reflect the state struct that was found. 1284 * set to reflect the state struct that was found.
1295 * 1285 *
1296 * If nothing was found, 1 is returned, < 0 on error 1286 * If nothing was found, 1 is returned; if something was found, 0 is returned.
1297 */ 1287 */
1298int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1288int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1299 u64 *start_ret, u64 *end_ret, int bits) 1289 u64 *start_ret, u64 *end_ret, int bits)
@@ -1923,6 +1913,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1923 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 1913 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1924 /* try to remap that extent elsewhere? */ 1914 /* try to remap that extent elsewhere? */
1925 bio_put(bio); 1915 bio_put(bio);
1916 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
1926 return -EIO; 1917 return -EIO;
1927 } 1918 }
1928 1919
@@ -2222,17 +2213,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2222 uptodate = 0; 2213 uptodate = 0;
2223 } 2214 }
2224 2215
2225 if (!uptodate && tree->ops &&
2226 tree->ops->writepage_io_failed_hook) {
2227 ret = tree->ops->writepage_io_failed_hook(NULL, page,
2228 start, end, NULL);
2229 /* Writeback already completed */
2230 if (ret == 0)
2231 return 1;
2232 }
2233
2234 if (!uptodate) { 2216 if (!uptodate) {
2235 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
2236 ClearPageUptodate(page); 2217 ClearPageUptodate(page);
2237 SetPageError(page); 2218 SetPageError(page);
2238 } 2219 }
@@ -2347,10 +2328,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2347 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 2328 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2348 ret = tree->ops->readpage_end_io_hook(page, start, end, 2329 ret = tree->ops->readpage_end_io_hook(page, start, end,
2349 state, mirror); 2330 state, mirror);
2350 if (ret) 2331 if (ret) {
2332 /* no IO error was indicated, but software detected
2333 * errors in the block: either checksum errors or
2334 * issues with the contents */
2335 struct btrfs_root *root =
2336 BTRFS_I(page->mapping->host)->root;
2337 struct btrfs_device *device;
2338
2351 uptodate = 0; 2339 uptodate = 0;
2352 else 2340 device = btrfs_find_device_for_logical(
2341 root, start, mirror);
2342 if (device)
2343 btrfs_dev_stat_inc_and_print(device,
2344 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2345 } else {
2353 clean_io_failure(start, page); 2346 clean_io_failure(start, page);
2347 }
2354 } 2348 }
2355 2349
2356 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { 2350 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
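
When readpage_end_io_hook reports a failure here, the I/O itself completed, so the block's content must have been wrong: the diff charges that to the device as a corruption error, feeding the per-device counters from the merge description. An illustrative userspace counter in the spirit of btrfs_dev_stat_inc_and_print — the enum values and struct are stand-ins, not the kernel definitions:

    #include <stdio.h>
    #include <stdatomic.h>

    /* illustrative counters in the spirit of BTRFS_DEV_STAT_* */
    enum dev_stat { STAT_WRITE_ERRS, STAT_READ_ERRS, STAT_FLUSH_ERRS,
                    STAT_CORRUPTION_ERRS, STAT_VALUES_MAX };

    struct device {
        const char *name;
        atomic_ulong stats[STAT_VALUES_MAX];
    };

    /* bump a counter and log it so an admin or tool can spot bad drives */
    static void dev_stat_inc_and_print(struct device *dev, enum dev_stat s)
    {
        unsigned long v = atomic_fetch_add(&dev->stats[s], 1) + 1;
        fprintf(stderr, "dev %s: stat %d is now %lu\n", dev->name, s, v);
    }

    int main(void)
    {
        struct device d = { .name = "sda" };
        /* read completed without an I/O error, but the checksum failed */
        dev_stat_inc_and_print(&d, STAT_CORRUPTION_ERRS);
        return 0;
    }
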
@@ -3164,7 +3158,7 @@ static int write_one_eb(struct extent_buffer *eb,
3164 u64 offset = eb->start; 3158 u64 offset = eb->start;
3165 unsigned long i, num_pages; 3159 unsigned long i, num_pages;
3166 int rw = (epd->sync_io ? WRITE_SYNC : WRITE); 3160 int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
3167 int ret; 3161 int ret = 0;
3168 3162
3169 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3163 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3170 num_pages = num_extent_pages(eb->start, eb->len); 3164 num_pages = num_extent_pages(eb->start, eb->len);
@@ -3930,6 +3924,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3930 eb->start = start; 3924 eb->start = start;
3931 eb->len = len; 3925 eb->len = len;
3932 eb->tree = tree; 3926 eb->tree = tree;
3927 eb->bflags = 0;
3933 rwlock_init(&eb->lock); 3928 rwlock_init(&eb->lock);
3934 atomic_set(&eb->write_locks, 0); 3929 atomic_set(&eb->write_locks, 0);
3935 atomic_set(&eb->read_locks, 0); 3930 atomic_set(&eb->read_locks, 0);
@@ -3967,6 +3962,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3967 return eb; 3962 return eb;
3968} 3963}
3969 3964
3965struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
3966{
3967 unsigned long i;
3968 struct page *p;
3969 struct extent_buffer *new;
3970 unsigned long num_pages = num_extent_pages(src->start, src->len);
3971
3972 new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
3973 if (new == NULL)
3974 return NULL;
3975
3976 for (i = 0; i < num_pages; i++) {
3977 p = alloc_page(GFP_ATOMIC);
3978 BUG_ON(!p);
3979 attach_extent_buffer_page(new, p);
3980 WARN_ON(PageDirty(p));
3981 SetPageUptodate(p);
3982 new->pages[i] = p;
3983 }
3984
3985 copy_extent_buffer(new, src, 0, 0, src->len);
3986 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
3987 set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
3988
3989 return new;
3990}
3991
3992struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
3993{
3994 struct extent_buffer *eb;
3995 unsigned long num_pages = num_extent_pages(0, len);
3996 unsigned long i;
3997
3998 eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
3999 if (!eb)
4000 return NULL;
4001
4002 for (i = 0; i < num_pages; i++) {
4003 eb->pages[i] = alloc_page(GFP_ATOMIC);
4004 if (!eb->pages[i])
4005 goto err;
4006 }
4007 set_extent_buffer_uptodate(eb);
4008 btrfs_set_header_nritems(eb, 0);
4009 set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4010
4011 return eb;
4012err:
4013 for (i--; i > 0; i--)
4014 __free_page(eb->pages[i]);
4015 __free_extent_buffer(eb);
4016 return NULL;
4017}
4018
3970static int extent_buffer_under_io(struct extent_buffer *eb) 4019static int extent_buffer_under_io(struct extent_buffer *eb)
3971{ 4020{
3972 return (atomic_read(&eb->io_pages) || 4021 return (atomic_read(&eb->io_pages) ||
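
btrfs_clone_extent_buffer builds a private, page-by-page snapshot of a tree block for the backref-walking work mentioned in the merge description, and both it and alloc_dummy_extent_buffer tag the result EXTENT_BUFFER_DUMMY so release skips the radix tree and page->mapping bookkeeping. A rough userspace analogue of the page-wise copy, with struct ebuf as a stand-in for struct extent_buffer:

    #include <stdlib.h>
    #include <string.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096

    /* illustrative buffer backed by discrete pages */
    struct ebuf {
        size_t len;
        size_t num_pages;
        unsigned char **pages;
    };

    static struct ebuf *clone_ebuf(const struct ebuf *src)
    {
        struct ebuf *new = calloc(1, sizeof(*new));
        if (!new)
            return NULL;
        new->len = src->len;
        new->num_pages = src->num_pages;
        new->pages = calloc(src->num_pages, sizeof(*new->pages));
        if (!new->pages)
            goto err;
        /* copy page by page, like copy_extent_buffer over the pages */
        for (size_t i = 0; i < src->num_pages; i++) {
            new->pages[i] = malloc(PAGE_SIZE);
            if (!new->pages[i])
                goto err;
            memcpy(new->pages[i], src->pages[i], PAGE_SIZE);
        }
        return new;
    err:
        if (new->pages)
            for (size_t i = 0; i < new->num_pages; i++)
                free(new->pages[i]);
        free(new->pages);
        free(new);
        return NULL;
    }

    int main(void)
    {
        unsigned char p0[PAGE_SIZE] = "hello";
        unsigned char *pages[] = { p0 };
        struct ebuf src = { sizeof(p0), 1, pages };
        struct ebuf *copy = clone_ebuf(&src);
        printf("%s\n", copy ? (char *)copy->pages[0] : "alloc failed");
        return 0;
    }
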
@@ -3981,18 +4030,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3981 unsigned long start_idx) 4030 unsigned long start_idx)
3982{ 4031{
3983 unsigned long index; 4032 unsigned long index;
4033 unsigned long num_pages;
3984 struct page *page; 4034 struct page *page;
4035 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
3985 4036
3986 BUG_ON(extent_buffer_under_io(eb)); 4037 BUG_ON(extent_buffer_under_io(eb));
3987 4038
3988 index = num_extent_pages(eb->start, eb->len); 4039 num_pages = num_extent_pages(eb->start, eb->len);
4040 index = start_idx + num_pages;
3989 if (start_idx >= index) 4041 if (start_idx >= index)
3990 return; 4042 return;
3991 4043
3992 do { 4044 do {
3993 index--; 4045 index--;
3994 page = extent_buffer_page(eb, index); 4046 page = extent_buffer_page(eb, index);
3995 if (page) { 4047 if (page && mapped) {
3996 spin_lock(&page->mapping->private_lock); 4048 spin_lock(&page->mapping->private_lock);
3997 /* 4049 /*
3998 * We do this since we'll remove the pages after we've 4050 * We do this since we'll remove the pages after we've
@@ -4017,6 +4069,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4017 } 4069 }
4018 spin_unlock(&page->mapping->private_lock); 4070 spin_unlock(&page->mapping->private_lock);
4019 4071
4072 }
4073 if (page) {
4020 /* One for when we alloced the page */ 4074 /* One for when we alloced the page */
4021 page_cache_release(page); 4075 page_cache_release(page);
4022 } 4076 }
@@ -4235,14 +4289,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4235{ 4289{
4236 WARN_ON(atomic_read(&eb->refs) == 0); 4290 WARN_ON(atomic_read(&eb->refs) == 0);
4237 if (atomic_dec_and_test(&eb->refs)) { 4291 if (atomic_dec_and_test(&eb->refs)) {
4238 struct extent_io_tree *tree = eb->tree; 4292 if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
4293 spin_unlock(&eb->refs_lock);
4294 } else {
4295 struct extent_io_tree *tree = eb->tree;
4239 4296
4240 spin_unlock(&eb->refs_lock); 4297 spin_unlock(&eb->refs_lock);
4241 4298
4242 spin_lock(&tree->buffer_lock); 4299 spin_lock(&tree->buffer_lock);
4243 radix_tree_delete(&tree->buffer, 4300 radix_tree_delete(&tree->buffer,
4244 eb->start >> PAGE_CACHE_SHIFT); 4301 eb->start >> PAGE_CACHE_SHIFT);
4245 spin_unlock(&tree->buffer_lock); 4302 spin_unlock(&tree->buffer_lock);
4303 }
4246 4304
4247 /* Should be safe to release our pages at this point */ 4305 /* Should be safe to release our pages at this point */
4248 btrfs_release_extent_buffer_page(eb, 0); 4306 btrfs_release_extent_buffer_page(eb, 0);
@@ -4260,6 +4318,10 @@ void free_extent_buffer(struct extent_buffer *eb)
4260 4318
4261 spin_lock(&eb->refs_lock); 4319 spin_lock(&eb->refs_lock);
4262 if (atomic_read(&eb->refs) == 2 && 4320 if (atomic_read(&eb->refs) == 2 &&
4321 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
4322 atomic_dec(&eb->refs);
4323
4324 if (atomic_read(&eb->refs) == 2 &&
4263 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 4325 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
4264 !extent_buffer_under_io(eb) && 4326 !extent_buffer_under_io(eb) &&
4265 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4327 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b516c3b8dec6..25900af5b15d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -39,6 +39,7 @@
39#define EXTENT_BUFFER_STALE 6 39#define EXTENT_BUFFER_STALE 6
40#define EXTENT_BUFFER_WRITEBACK 7 40#define EXTENT_BUFFER_WRITEBACK 7
41#define EXTENT_BUFFER_IOERR 8 41#define EXTENT_BUFFER_IOERR 8
42#define EXTENT_BUFFER_DUMMY 9
42 43
43/* these are flags for extent_clear_unlock_delalloc */ 44/* these are flags for extent_clear_unlock_delalloc */
44#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 45#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -75,9 +76,6 @@ struct extent_io_ops {
75 unsigned long bio_flags); 76 unsigned long bio_flags);
76 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 77 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
77 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); 78 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
78 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
79 u64 start, u64 end,
80 struct extent_state *state);
81 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, 79 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
82 struct extent_state *state, int mirror); 80 struct extent_state *state, int mirror);
83 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 81 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
@@ -225,6 +223,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
225 struct extent_state **cached_state, gfp_t mask); 223 struct extent_state **cached_state, gfp_t mask);
226int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 224int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
227 struct extent_state **cached_state, gfp_t mask); 225 struct extent_state **cached_state, gfp_t mask);
226int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
227 struct extent_state **cached_state, gfp_t mask);
228int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 228int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
229 gfp_t mask); 229 gfp_t mask);
230int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 230int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -265,6 +265,8 @@ void set_page_extent_mapped(struct page *page);
265 265
266struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 266struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
267 u64 start, unsigned long len); 267 u64 start, unsigned long len);
268struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
269struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
268struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 270struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
269 u64 start, unsigned long len); 271 u64 start, unsigned long len);
270void free_extent_buffer(struct extent_buffer *eb); 272void free_extent_buffer(struct extent_buffer *eb);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 53bf2d764bbc..876cddd6b2f0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -65,6 +65,21 @@ struct inode_defrag {
65 int cycled; 65 int cycled;
66}; 66};
67 67
68static int __compare_inode_defrag(struct inode_defrag *defrag1,
69 struct inode_defrag *defrag2)
70{
71 if (defrag1->root > defrag2->root)
72 return 1;
73 else if (defrag1->root < defrag2->root)
74 return -1;
75 else if (defrag1->ino > defrag2->ino)
76 return 1;
77 else if (defrag1->ino < defrag2->ino)
78 return -1;
79 else
80 return 0;
81}
82
68/* pop a record for an inode into the defrag tree. The lock 83/* pop a record for an inode into the defrag tree. The lock
69 * must be held already 84 * must be held already
70 * 85 *
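
Keying defrag records by inode number alone collides once several subvolumes are involved; __compare_inode_defrag orders records by (root, ino) instead, which is the "same inode id" fix from the shortlog. The comparator drops into userspace essentially unchanged, shown here (hypothetically) driving qsort:

    #include <stdio.h>
    #include <stdlib.h>

    struct inode_defrag { unsigned long long root, ino; };

    /* order by root first, then by inode number, as in the diff */
    static int compare_inode_defrag(const void *a, const void *b)
    {
        const struct inode_defrag *d1 = a, *d2 = b;

        if (d1->root > d2->root)
            return 1;
        if (d1->root < d2->root)
            return -1;
        if (d1->ino > d2->ino)
            return 1;
        if (d1->ino < d2->ino)
            return -1;
        return 0;
    }

    int main(void)
    {
        struct inode_defrag v[] = { {5, 260}, {5, 258}, {7, 256} };
        qsort(v, 3, sizeof(v[0]), compare_inode_defrag);
        for (int i = 0; i < 3; i++)
            printf("root %llu ino %llu\n", v[i].root, v[i].ino);
        return 0;
    }
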
@@ -81,15 +96,17 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
81 struct inode_defrag *entry; 96 struct inode_defrag *entry;
82 struct rb_node **p; 97 struct rb_node **p;
83 struct rb_node *parent = NULL; 98 struct rb_node *parent = NULL;
99 int ret;
84 100
85 p = &root->fs_info->defrag_inodes.rb_node; 101 p = &root->fs_info->defrag_inodes.rb_node;
86 while (*p) { 102 while (*p) {
87 parent = *p; 103 parent = *p;
88 entry = rb_entry(parent, struct inode_defrag, rb_node); 104 entry = rb_entry(parent, struct inode_defrag, rb_node);
89 105
90 if (defrag->ino < entry->ino) 106 ret = __compare_inode_defrag(defrag, entry);
107 if (ret < 0)
91 p = &parent->rb_left; 108 p = &parent->rb_left;
92 else if (defrag->ino > entry->ino) 109 else if (ret > 0)
93 p = &parent->rb_right; 110 p = &parent->rb_right;
94 else { 111 else {
95 /* if we're reinserting an entry for 112 /* if we're reinserting an entry for
@@ -103,7 +120,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
103 goto exists; 120 goto exists;
104 } 121 }
105 } 122 }
106 BTRFS_I(inode)->in_defrag = 1; 123 set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
107 rb_link_node(&defrag->rb_node, parent, p); 124 rb_link_node(&defrag->rb_node, parent, p);
108 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 125 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
109 return; 126 return;
@@ -131,7 +148,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
131 if (btrfs_fs_closing(root->fs_info)) 148 if (btrfs_fs_closing(root->fs_info))
132 return 0; 149 return 0;
133 150
134 if (BTRFS_I(inode)->in_defrag) 151 if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
135 return 0; 152 return 0;
136 153
137 if (trans) 154 if (trans)
@@ -148,7 +165,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
148 defrag->root = root->root_key.objectid; 165 defrag->root = root->root_key.objectid;
149 166
150 spin_lock(&root->fs_info->defrag_inodes_lock); 167 spin_lock(&root->fs_info->defrag_inodes_lock);
151 if (!BTRFS_I(inode)->in_defrag) 168 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
152 __btrfs_add_inode_defrag(inode, defrag); 169 __btrfs_add_inode_defrag(inode, defrag);
153 else 170 else
154 kfree(defrag); 171 kfree(defrag);
@@ -159,28 +176,35 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
159/* 176/*
160 * must be called with the defrag_inodes lock held 177 * must be called with the defrag_inodes lock held
161 */ 178 */
162struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino, 179struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
180 u64 root, u64 ino,
163 struct rb_node **next) 181 struct rb_node **next)
164{ 182{
165 struct inode_defrag *entry = NULL; 183 struct inode_defrag *entry = NULL;
184 struct inode_defrag tmp;
166 struct rb_node *p; 185 struct rb_node *p;
167 struct rb_node *parent = NULL; 186 struct rb_node *parent = NULL;
187 int ret;
188
189 tmp.ino = ino;
190 tmp.root = root;
168 191
169 p = info->defrag_inodes.rb_node; 192 p = info->defrag_inodes.rb_node;
170 while (p) { 193 while (p) {
171 parent = p; 194 parent = p;
172 entry = rb_entry(parent, struct inode_defrag, rb_node); 195 entry = rb_entry(parent, struct inode_defrag, rb_node);
173 196
174 if (ino < entry->ino) 197 ret = __compare_inode_defrag(&tmp, entry);
198 if (ret < 0)
175 p = parent->rb_left; 199 p = parent->rb_left;
176 else if (ino > entry->ino) 200 else if (ret > 0)
177 p = parent->rb_right; 201 p = parent->rb_right;
178 else 202 else
179 return entry; 203 return entry;
180 } 204 }
181 205
182 if (next) { 206 if (next) {
183 while (parent && ino > entry->ino) { 207 while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
184 parent = rb_next(parent); 208 parent = rb_next(parent);
185 entry = rb_entry(parent, struct inode_defrag, rb_node); 209 entry = rb_entry(parent, struct inode_defrag, rb_node);
186 } 210 }
@@ -202,6 +226,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
202 struct btrfs_key key; 226 struct btrfs_key key;
203 struct btrfs_ioctl_defrag_range_args range; 227 struct btrfs_ioctl_defrag_range_args range;
204 u64 first_ino = 0; 228 u64 first_ino = 0;
229 u64 root_objectid = 0;
205 int num_defrag; 230 int num_defrag;
206 int defrag_batch = 1024; 231 int defrag_batch = 1024;
207 232
@@ -214,11 +239,14 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
214 n = NULL; 239 n = NULL;
215 240
216 /* find an inode to defrag */ 241 /* find an inode to defrag */
217 defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n); 242 defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
243 first_ino, &n);
218 if (!defrag) { 244 if (!defrag) {
219 if (n) 245 if (n) {
220 defrag = rb_entry(n, struct inode_defrag, rb_node); 246 defrag = rb_entry(n, struct inode_defrag,
221 else if (first_ino) { 247 rb_node);
248 } else if (root_objectid || first_ino) {
249 root_objectid = 0;
222 first_ino = 0; 250 first_ino = 0;
223 continue; 251 continue;
224 } else { 252 } else {
@@ -228,6 +256,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
228 256
229 /* remove it from the rbtree */ 257 /* remove it from the rbtree */
230 first_ino = defrag->ino + 1; 258 first_ino = defrag->ino + 1;
259 root_objectid = defrag->root;
231 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); 260 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
232 261
233 if (btrfs_fs_closing(fs_info)) 262 if (btrfs_fs_closing(fs_info))
@@ -252,7 +281,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
252 goto next; 281 goto next;
253 282
254 /* do a chunk of defrag */ 283 /* do a chunk of defrag */
255 BTRFS_I(inode)->in_defrag = 0; 284 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
256 range.start = defrag->last_offset; 285 range.start = defrag->last_offset;
257 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 286 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
258 defrag_batch); 287 defrag_batch);
@@ -1409,7 +1438,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1409 mutex_unlock(&inode->i_mutex); 1438 mutex_unlock(&inode->i_mutex);
1410 goto out; 1439 goto out;
1411 } 1440 }
1412 BTRFS_I(inode)->sequence++;
1413 1441
1414 start_pos = round_down(pos, root->sectorsize); 1442 start_pos = round_down(pos, root->sectorsize);
1415 if (start_pos > i_size_read(inode)) { 1443 if (start_pos > i_size_read(inode)) {
@@ -1466,8 +1494,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1466 * flush down new bytes that may have been written if the 1494 * flush down new bytes that may have been written if the
1467 * application were using truncate to replace a file in place. 1495 * application were using truncate to replace a file in place.
1468 */ 1496 */
1469 if (BTRFS_I(inode)->ordered_data_close) { 1497 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1470 BTRFS_I(inode)->ordered_data_close = 0; 1498 &BTRFS_I(inode)->runtime_flags)) {
1471 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); 1499 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1472 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 1500 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1473 filemap_flush(inode->i_mapping); 1501 filemap_flush(inode->i_mapping);
@@ -1498,14 +1526,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1498 1526
1499 trace_btrfs_sync_file(file, datasync); 1527 trace_btrfs_sync_file(file, datasync);
1500 1528
1501 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1502 if (ret)
1503 return ret;
1504 mutex_lock(&inode->i_mutex); 1529 mutex_lock(&inode->i_mutex);
1505 1530
1506 /* we wait first, since the writeback may change the inode */ 1531 /*
1532 * we wait first, since the writeback may change the inode; also,
1533 * waiting on the ordered range does a filemap_write_and_wait_range,
1534 * which is why we don't do it above like other file systems.
1535 */
1507 root->log_batch++; 1536 root->log_batch++;
1508 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1537 btrfs_wait_ordered_range(inode, start, end);
1509 root->log_batch++; 1538 root->log_batch++;
1510 1539
1511 /* 1540 /*
@@ -1523,7 +1552,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1523 * syncing 1552 * syncing
1524 */ 1553 */
1525 smp_mb(); 1554 smp_mb();
1526 if (BTRFS_I(inode)->last_trans <= 1555 if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
1556 BTRFS_I(inode)->last_trans <=
1527 root->fs_info->last_trans_committed) { 1557 root->fs_info->last_trans_committed) {
1528 BTRFS_I(inode)->last_trans = 0; 1558 BTRFS_I(inode)->last_trans = 0;
1529 mutex_unlock(&inode->i_mutex); 1559 mutex_unlock(&inode->i_mutex);
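
The fsync change folds the writeback wait under i_mutex and adds a fast path: if the inode is already in the current log, or nothing changed since the last committed transaction, there is nothing to sync. A sketch of that generation check, under the assumption that btrfs_inode_in_log compares the inode's logged transaction against the current generation:

    #include <stdio.h>
    #include <stdbool.h>

    /* illustrative generation numbers, not the kernel structures */
    struct fsync_state {
        unsigned long long last_trans;           /* inode's last change */
        unsigned long long last_trans_committed; /* fs-wide commit point */
        unsigned long long logged_trans;         /* last log the inode hit */
        unsigned long long generation;           /* current transaction */
    };

    /* true when fsync has nothing to do for this inode */
    static bool fsync_can_skip(const struct fsync_state *s)
    {
        /* assumed semantics of btrfs_inode_in_log */
        bool in_log = s->logged_trans == s->generation;
        return in_log || s->last_trans <= s->last_trans_committed;
    }

    int main(void)
    {
        struct fsync_state s = { .last_trans = 10,
                                 .last_trans_committed = 10,
                                 .logged_trans = 0, .generation = 11 };
        printf("skip sync: %s\n", fsync_can_skip(&s) ? "yes" : "no");
        return 0;
    }
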
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 202008ec367d..19a0d85b451c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -33,6 +33,8 @@
33 33
34static int link_free_space(struct btrfs_free_space_ctl *ctl, 34static int link_free_space(struct btrfs_free_space_ctl *ctl,
35 struct btrfs_free_space *info); 35 struct btrfs_free_space *info);
36static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
37 struct btrfs_free_space *info);
36 38
37static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 39static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
38 struct btrfs_path *path, 40 struct btrfs_path *path,
@@ -584,6 +586,44 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
584 return 0; 586 return 0;
585} 587}
586 588
589/*
590 * Since we attach pinned extents after the fact we can have contiguous sections
591 * of free space that are split up into multiple entries. This poses a problem
592 * for the tree logging code, because it could have allocated across what appears
593 * to be 2 entries, since we would have merged those entries when adding the
594 * pinned extents back to the free space cache. So run through the space cache
595 * that we just loaded and merge contiguous entries. This keeps the log replay
596 * code from blowing up and makes for nicer allocator behavior.
597 */
598static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
599{
600 struct btrfs_free_space *e, *prev = NULL;
601 struct rb_node *n;
602
603again:
604 spin_lock(&ctl->tree_lock);
605 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
606 e = rb_entry(n, struct btrfs_free_space, offset_index);
607 if (!prev)
608 goto next;
609 if (e->bitmap || prev->bitmap)
610 goto next;
611 if (prev->offset + prev->bytes == e->offset) {
612 unlink_free_space(ctl, prev);
613 unlink_free_space(ctl, e);
614 prev->bytes += e->bytes;
615 kmem_cache_free(btrfs_free_space_cachep, e);
616 link_free_space(ctl, prev);
617 prev = NULL;
618 spin_unlock(&ctl->tree_lock);
619 goto again;
620 }
621next:
622 prev = e;
623 }
624 spin_unlock(&ctl->tree_lock);
625}
626
587int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 627int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
588 struct btrfs_free_space_ctl *ctl, 628 struct btrfs_free_space_ctl *ctl,
589 struct btrfs_path *path, u64 offset) 629 struct btrfs_path *path, u64 offset)
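
merge_space_tree() coalesces extent entries that became adjacent once pinned extents were re-added, restarting the scan after every merge because the lock is dropped and the tree has changed underneath. A list-based sketch of the same restart-after-merge loop — the real code also skips bitmap entries and holds ctl->tree_lock:

    #include <stdio.h>
    #include <stdlib.h>

    struct space { unsigned long long offset, bytes; struct space *next; };

    /* coalesce entries whose ranges touch; restart after each merge,
     * mirroring the drop-lock-and-rescan structure of merge_space_tree */
    static void merge_space_list(struct space **head)
    {
    again:
        for (struct space *e = *head; e && e->next; e = e->next) {
            struct space *n = e->next;
            if (e->offset + e->bytes == n->offset) {
                e->bytes += n->bytes;
                e->next = n->next;
                free(n);
                goto again;
            }
        }
    }

    int main(void)
    {
        struct space *c = malloc(sizeof(*c));
        *c = (struct space){ 8192, 4096, NULL };
        struct space *b = malloc(sizeof(*b));
        *b = (struct space){ 4096, 4096, c };
        struct space *a = malloc(sizeof(*a));
        *a = (struct space){ 0, 4096, b };
        merge_space_list(&a);
        for (struct space *e = a; e; e = e->next)
            printf("[%llu, +%llu]\n", e->offset, e->bytes);
        return 0;
    }
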
@@ -726,6 +766,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
726 } 766 }
727 767
728 io_ctl_drop_pages(&io_ctl); 768 io_ctl_drop_pages(&io_ctl);
769 merge_space_tree(ctl);
729 ret = 1; 770 ret = 1;
730out: 771out:
731 io_ctl_free(&io_ctl); 772 io_ctl_free(&io_ctl);
@@ -972,9 +1013,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
972 goto out; 1013 goto out;
973 1014
974 1015
975 ret = filemap_write_and_wait(inode->i_mapping); 1016 btrfs_wait_ordered_range(inode, 0, (u64)-1);
976 if (ret)
977 goto out;
978 1017
979 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1018 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
980 key.offset = offset; 1019 key.offset = offset;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ceb7b9c9edcc..e9991adc0960 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -89,7 +89,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
89 89
90static int btrfs_setsize(struct inode *inode, loff_t newsize); 90static int btrfs_setsize(struct inode *inode, loff_t newsize);
91static int btrfs_truncate(struct inode *inode); 91static int btrfs_truncate(struct inode *inode);
92static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); 92static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
93static noinline int cow_file_range(struct inode *inode, 93static noinline int cow_file_range(struct inode *inode,
94 struct page *locked_page, 94 struct page *locked_page,
95 u64 start, u64 end, int *page_started, 95 u64 start, u64 end, int *page_started,
@@ -257,10 +257,13 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
257 ret = insert_inline_extent(trans, root, inode, start, 257 ret = insert_inline_extent(trans, root, inode, start,
258 inline_len, compressed_size, 258 inline_len, compressed_size,
259 compress_type, compressed_pages); 259 compress_type, compressed_pages);
260 if (ret) { 260 if (ret && ret != -ENOSPC) {
261 btrfs_abort_transaction(trans, root, ret); 261 btrfs_abort_transaction(trans, root, ret);
262 return ret; 262 return ret;
263 } else if (ret == -ENOSPC) {
264 return 1;
263 } 265 }
266
264 btrfs_delalloc_release_metadata(inode, end + 1 - start); 267 btrfs_delalloc_release_metadata(inode, end + 1 - start);
265 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 268 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
266 return 0; 269 return 0;
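
cow_file_range_inline now treats -ENOSPC from insert_inline_extent as "fall back to a regular extent" rather than a transaction-aborting failure, giving the caller a three-way return. A compact sketch of that convention:

    #include <stdio.h>
    #include <errno.h>

    /* illustrative: 0 = stored inline, 1 = caller should fall back to
     * a regular extent, <0 = hard error that aborts the transaction */
    static int try_inline(int insert_ret)
    {
        if (insert_ret && insert_ret != -ENOSPC)
            return insert_ret;   /* real failure: abort */
        if (insert_ret == -ENOSPC)
            return 1;            /* no room inline: fall back, not fatal */
        return 0;                /* inlined successfully */
    }

    int main(void)
    {
        printf("%d %d %d\n", try_inline(0), try_inline(-ENOSPC),
               try_inline(-EIO));
        return 0;
    }
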
@@ -1572,11 +1575,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1572 if (btrfs_is_free_space_inode(root, inode)) 1575 if (btrfs_is_free_space_inode(root, inode))
1573 metadata = 2; 1576 metadata = 2;
1574 1577
1575 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1576 if (ret)
1577 return ret;
1578
1579 if (!(rw & REQ_WRITE)) { 1578 if (!(rw & REQ_WRITE)) {
1579 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1580 if (ret)
1581 return ret;
1582
1580 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1583 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1581 return btrfs_submit_compressed_read(inode, bio, 1584 return btrfs_submit_compressed_read(inode, bio,
1582 mirror_num, bio_flags); 1585 mirror_num, bio_flags);
@@ -1815,25 +1818,24 @@ out:
1815 * an ordered extent if the range of bytes in the file it covers is 1818
1816 * fully written. 1819 * fully written.
1817 */ 1820 */
1818static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1821static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1819{ 1822{
1823 struct inode *inode = ordered_extent->inode;
1820 struct btrfs_root *root = BTRFS_I(inode)->root; 1824 struct btrfs_root *root = BTRFS_I(inode)->root;
1821 struct btrfs_trans_handle *trans = NULL; 1825 struct btrfs_trans_handle *trans = NULL;
1822 struct btrfs_ordered_extent *ordered_extent = NULL;
1823 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1826 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1824 struct extent_state *cached_state = NULL; 1827 struct extent_state *cached_state = NULL;
1825 int compress_type = 0; 1828 int compress_type = 0;
1826 int ret; 1829 int ret;
1827 bool nolock; 1830 bool nolock;
1828 1831
1829 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1830 end - start + 1);
1831 if (!ret)
1832 return 0;
1833 BUG_ON(!ordered_extent); /* Logic error */
1834
1835 nolock = btrfs_is_free_space_inode(root, inode); 1832 nolock = btrfs_is_free_space_inode(root, inode);
1836 1833
1834 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
1835 ret = -EIO;
1836 goto out;
1837 }
1838
1837 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1839 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1838 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1840 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
1839 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1841 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
@@ -1889,12 +1891,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1889 ordered_extent->file_offset, 1891 ordered_extent->file_offset,
1890 ordered_extent->len); 1892 ordered_extent->len);
1891 } 1893 }
1892 unlock_extent_cached(io_tree, ordered_extent->file_offset, 1894
1893 ordered_extent->file_offset +
1894 ordered_extent->len - 1, &cached_state, GFP_NOFS);
1895 if (ret < 0) { 1895 if (ret < 0) {
1896 btrfs_abort_transaction(trans, root, ret); 1896 btrfs_abort_transaction(trans, root, ret);
1897 goto out; 1897 goto out_unlock;
1898 } 1898 }
1899 1899
1900 add_pending_csums(trans, inode, ordered_extent->file_offset, 1900 add_pending_csums(trans, inode, ordered_extent->file_offset,
@@ -1905,10 +1905,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1905 ret = btrfs_update_inode_fallback(trans, root, inode); 1905 ret = btrfs_update_inode_fallback(trans, root, inode);
1906 if (ret) { /* -ENOMEM or corruption */ 1906 if (ret) { /* -ENOMEM or corruption */
1907 btrfs_abort_transaction(trans, root, ret); 1907 btrfs_abort_transaction(trans, root, ret);
1908 goto out; 1908 goto out_unlock;
1909 } 1909 }
1910 } 1910 }
1911 ret = 0; 1911 ret = 0;
1912out_unlock:
1913 unlock_extent_cached(io_tree, ordered_extent->file_offset,
1914 ordered_extent->file_offset +
1915 ordered_extent->len - 1, &cached_state, GFP_NOFS);
1912out: 1916out:
1913 if (root != root->fs_info->tree_root) 1917 if (root != root->fs_info->tree_root)
1914 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1918 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
@@ -1919,26 +1923,57 @@ out:
1919 btrfs_end_transaction(trans, root); 1923 btrfs_end_transaction(trans, root);
1920 } 1924 }
1921 1925
1926 if (ret)
1927 clear_extent_uptodate(io_tree, ordered_extent->file_offset,
1928 ordered_extent->file_offset +
1929 ordered_extent->len - 1, NULL, GFP_NOFS);
1930
1931 /*
1932 * This needs to be dont to make sure anybody waiting knows we are done
1933 * upating everything for this ordered extent.
1934 */
1935 btrfs_remove_ordered_extent(inode, ordered_extent);
1936
1922 /* once for us */ 1937 /* once for us */
1923 btrfs_put_ordered_extent(ordered_extent); 1938 btrfs_put_ordered_extent(ordered_extent);
1924 /* once for the tree */ 1939 /* once for the tree */
1925 btrfs_put_ordered_extent(ordered_extent); 1940 btrfs_put_ordered_extent(ordered_extent);
1926 1941
1927 return 0; 1942 return ret;
1928out_unlock: 1943}
1929 unlock_extent_cached(io_tree, ordered_extent->file_offset, 1944
1930 ordered_extent->file_offset + 1945static void finish_ordered_fn(struct btrfs_work *work)
1931 ordered_extent->len - 1, &cached_state, GFP_NOFS); 1946{
1932 goto out; 1947 struct btrfs_ordered_extent *ordered_extent;
1948 ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
1949 btrfs_finish_ordered_io(ordered_extent);
1933} 1950}
1934 1951
1935static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1952static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1936 struct extent_state *state, int uptodate) 1953 struct extent_state *state, int uptodate)
1937{ 1954{
1955 struct inode *inode = page->mapping->host;
1956 struct btrfs_root *root = BTRFS_I(inode)->root;
1957 struct btrfs_ordered_extent *ordered_extent = NULL;
1958 struct btrfs_workers *workers;
1959
1938 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 1960 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
1939 1961
1940 ClearPagePrivate2(page); 1962 ClearPagePrivate2(page);
1941 return btrfs_finish_ordered_io(page->mapping->host, start, end); 1963 if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1964 end - start + 1, uptodate))
1965 return 0;
1966
1967 ordered_extent->work.func = finish_ordered_fn;
1968 ordered_extent->work.flags = 0;
1969
1970 if (btrfs_is_free_space_inode(root, inode))
1971 workers = &root->fs_info->endio_freespace_worker;
1972 else
1973 workers = &root->fs_info->endio_write_workers;
1974 btrfs_queue_worker(workers, &ordered_extent->work);
1975
1976 return 0;
1942} 1977}
1943 1978
1944/* 1979/*
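
This is the latency fix from the merge description: the writepage end-io hook no longer runs btrfs_finish_ordered_io inline, it queues the ordered extent on endio_write_workers (or the free-space worker), and finish_ordered_fn recovers the ordered extent from the embedded work item. A userspace sketch of that container_of pattern — struct work here is a stand-in for struct btrfs_work:

    #include <stddef.h>
    #include <stdio.h>

    /* the usual container_of: recover the enclosing struct from a member */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct work { void (*func)(struct work *); };

    struct ordered_extent {
        unsigned long long file_offset, len;
        struct work work;   /* embedded so the queue only sees struct work */
    };

    static void finish_ordered_fn(struct work *w)
    {
        struct ordered_extent *oe =
            container_of(w, struct ordered_extent, work);
        printf("finishing ordered extent at %llu len %llu\n",
               oe->file_offset, oe->len);
    }

    int main(void)
    {
        struct ordered_extent oe = { 4096, 8192, { finish_ordered_fn } };
        /* a real worker thread would pick this up; call directly here */
        oe.work.func(&oe.work);
        return 0;
    }
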
@@ -2072,12 +2107,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2072 struct btrfs_block_rsv *block_rsv; 2107 struct btrfs_block_rsv *block_rsv;
2073 int ret; 2108 int ret;
2074 2109
2075 if (!list_empty(&root->orphan_list) || 2110 if (atomic_read(&root->orphan_inodes) ||
2076 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 2111 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2077 return; 2112 return;
2078 2113
2079 spin_lock(&root->orphan_lock); 2114 spin_lock(&root->orphan_lock);
2080 if (!list_empty(&root->orphan_list)) { 2115 if (atomic_read(&root->orphan_inodes)) {
2081 spin_unlock(&root->orphan_lock); 2116 spin_unlock(&root->orphan_lock);
2082 return; 2117 return;
2083 } 2118 }
@@ -2134,8 +2169,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2134 block_rsv = NULL; 2169 block_rsv = NULL;
2135 } 2170 }
2136 2171
2137 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2172 if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2138 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2173 &BTRFS_I(inode)->runtime_flags)) {
2139#if 0 2174#if 0
2140 /* 2175 /*
2141 * For proper ENOSPC handling, we should do orphan 2176 * For proper ENOSPC handling, we should do orphan
@@ -2148,12 +2183,12 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2148 insert = 1; 2183 insert = 1;
2149#endif 2184#endif
2150 insert = 1; 2185 insert = 1;
2186 atomic_dec(&root->orphan_inodes);
2151 } 2187 }
2152 2188
2153 if (!BTRFS_I(inode)->orphan_meta_reserved) { 2189 if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2154 BTRFS_I(inode)->orphan_meta_reserved = 1; 2190 &BTRFS_I(inode)->runtime_flags))
2155 reserve = 1; 2191 reserve = 1;
2156 }
2157 spin_unlock(&root->orphan_lock); 2192 spin_unlock(&root->orphan_lock);
2158 2193
2159 /* grab metadata reservation from transaction handle */ 2194 /* grab metadata reservation from transaction handle */
@@ -2166,6 +2201,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2166 if (insert >= 1) { 2201 if (insert >= 1) {
2167 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2202 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2168 if (ret && ret != -EEXIST) { 2203 if (ret && ret != -EEXIST) {
2204 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2205 &BTRFS_I(inode)->runtime_flags);
2169 btrfs_abort_transaction(trans, root, ret); 2206 btrfs_abort_transaction(trans, root, ret);
2170 return ret; 2207 return ret;
2171 } 2208 }
@@ -2196,15 +2233,13 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2196 int ret = 0; 2233 int ret = 0;
2197 2234
2198 spin_lock(&root->orphan_lock); 2235 spin_lock(&root->orphan_lock);
2199 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2236 if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2200 list_del_init(&BTRFS_I(inode)->i_orphan); 2237 &BTRFS_I(inode)->runtime_flags))
2201 delete_item = 1; 2238 delete_item = 1;
2202 }
2203 2239
2204 if (BTRFS_I(inode)->orphan_meta_reserved) { 2240 if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2205 BTRFS_I(inode)->orphan_meta_reserved = 0; 2241 &BTRFS_I(inode)->runtime_flags))
2206 release_rsv = 1; 2242 release_rsv = 1;
2207 }
2208 spin_unlock(&root->orphan_lock); 2243 spin_unlock(&root->orphan_lock);
2209 2244
2210 if (trans && delete_item) { 2245 if (trans && delete_item) {
@@ -2212,8 +2247,10 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2212 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2247 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
2213 } 2248 }
2214 2249
2215 if (release_rsv) 2250 if (release_rsv) {
2216 btrfs_orphan_release_metadata(inode); 2251 btrfs_orphan_release_metadata(inode);
2252 atomic_dec(&root->orphan_inodes);
2253 }
2217 2254
2218 return 0; 2255 return 0;
2219} 2256}
@@ -2341,6 +2378,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2341 ret = PTR_ERR(trans); 2378 ret = PTR_ERR(trans);
2342 goto out; 2379 goto out;
2343 } 2380 }
2381 printk(KERN_ERR "auto deleting %Lu\n",
2382 found_key.objectid);
2344 ret = btrfs_del_orphan_item(trans, root, 2383 ret = btrfs_del_orphan_item(trans, root,
2345 found_key.objectid); 2384 found_key.objectid);
2346 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2385 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
@@ -2352,9 +2391,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2352 * add this inode to the orphan list so btrfs_orphan_del does 2391 * add this inode to the orphan list so btrfs_orphan_del does
2353 * the proper thing when we hit it 2392 * the proper thing when we hit it
2354 */ 2393 */
2355 spin_lock(&root->orphan_lock); 2394 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2356 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2395 &BTRFS_I(inode)->runtime_flags);
2357 spin_unlock(&root->orphan_lock);
2358 2396
2359 /* if we have links, this was a truncate, lets do that */ 2397 /* if we have links, this was a truncate, lets do that */
2360 if (inode->i_nlink) { 2398 if (inode->i_nlink) {
@@ -2510,7 +2548,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
2510 2548
2511 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2549 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2512 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2550 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2513 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); 2551 inode->i_version = btrfs_inode_sequence(leaf, inode_item);
2514 inode->i_generation = BTRFS_I(inode)->generation; 2552 inode->i_generation = BTRFS_I(inode)->generation;
2515 inode->i_rdev = 0; 2553 inode->i_rdev = 0;
2516 rdev = btrfs_inode_rdev(leaf, inode_item); 2554 rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -2594,7 +2632,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2594 2632
2595 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2633 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2596 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 2634 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2597 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); 2635 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2598 btrfs_set_inode_transid(leaf, item, trans->transid); 2636 btrfs_set_inode_transid(leaf, item, trans->transid);
2599 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2637 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2600 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2638 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
@@ -2752,6 +2790,8 @@ err:
2752 goto out; 2790 goto out;
2753 2791
2754 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2792 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2793 inode_inc_iversion(inode);
2794 inode_inc_iversion(dir);
2755 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2795 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2756 btrfs_update_inode(trans, root, dir); 2796 btrfs_update_inode(trans, root, dir);
2757out: 2797out:
@@ -3089,6 +3129,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3089 } 3129 }
3090 3130
3091 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3131 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3132 inode_inc_iversion(dir);
3092 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3133 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3093 ret = btrfs_update_inode(trans, root, dir); 3134 ret = btrfs_update_inode(trans, root, dir);
3094 if (ret) 3135 if (ret)
@@ -3607,7 +3648,8 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3607 * any new writes get down to disk quickly. 3648 * any new writes get down to disk quickly.
3608 */ 3649 */
3609 if (newsize == 0) 3650 if (newsize == 0)
3610 BTRFS_I(inode)->ordered_data_close = 1; 3651 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
3652 &BTRFS_I(inode)->runtime_flags);
3611 3653
3612 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3654 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3613 truncate_setsize(inode, newsize); 3655 truncate_setsize(inode, newsize);
@@ -3638,6 +3680,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3638 3680
3639 if (attr->ia_valid) { 3681 if (attr->ia_valid) {
3640 setattr_copy(inode, attr); 3682 setattr_copy(inode, attr);
3683 inode_inc_iversion(inode);
3641 err = btrfs_dirty_inode(inode); 3684 err = btrfs_dirty_inode(inode);
3642 3685
3643 if (!err && attr->ia_valid & ATTR_MODE) 3686 if (!err && attr->ia_valid & ATTR_MODE)
@@ -3671,7 +3714,8 @@ void btrfs_evict_inode(struct inode *inode)
3671 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3714 btrfs_wait_ordered_range(inode, 0, (u64)-1);
3672 3715
3673 if (root->fs_info->log_root_recovering) { 3716 if (root->fs_info->log_root_recovering) {
3674 BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); 3717 BUG_ON(!test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3718 &BTRFS_I(inode)->runtime_flags));
3675 goto no_delete; 3719 goto no_delete;
3676 } 3720 }
3677 3721
@@ -4066,7 +4110,7 @@ static struct inode *new_simple_dir(struct super_block *s,
4066 4110
4067 BTRFS_I(inode)->root = root; 4111 BTRFS_I(inode)->root = root;
4068 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4112 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
4069 BTRFS_I(inode)->dummy_inode = 1; 4113 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
4070 4114
4071 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 4115 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
4072 inode->i_op = &btrfs_dir_ro_inode_operations; 4116 inode->i_op = &btrfs_dir_ro_inode_operations;
@@ -4370,7 +4414,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4370 int ret = 0; 4414 int ret = 0;
4371 bool nolock = false; 4415 bool nolock = false;
4372 4416
4373 if (BTRFS_I(inode)->dummy_inode) 4417 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
4374 return 0; 4418 return 0;
4375 4419
4376 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) 4420 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
@@ -4403,7 +4447,7 @@ int btrfs_dirty_inode(struct inode *inode)
4403 struct btrfs_trans_handle *trans; 4447 struct btrfs_trans_handle *trans;
4404 int ret; 4448 int ret;
4405 4449
4406 if (BTRFS_I(inode)->dummy_inode) 4450 if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
4407 return 0; 4451 return 0;
4408 4452
4409 trans = btrfs_join_transaction(root); 4453 trans = btrfs_join_transaction(root);
@@ -4730,6 +4774,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4730 4774
4731 btrfs_i_size_write(parent_inode, parent_inode->i_size + 4775 btrfs_i_size_write(parent_inode, parent_inode->i_size +
4732 name_len * 2); 4776 name_len * 2);
4777 inode_inc_iversion(parent_inode);
4733 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 4778 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
4734 ret = btrfs_update_inode(trans, root, parent_inode); 4779 ret = btrfs_update_inode(trans, root, parent_inode);
4735 if (ret) 4780 if (ret)
@@ -4937,6 +4982,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4937 } 4982 }
4938 4983
4939 btrfs_inc_nlink(inode); 4984 btrfs_inc_nlink(inode);
4985 inode_inc_iversion(inode);
4940 inode->i_ctime = CURRENT_TIME; 4986 inode->i_ctime = CURRENT_TIME;
4941 ihold(inode); 4987 ihold(inode);
4942 4988
@@ -5903,9 +5949,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5903 struct btrfs_dio_private *dip = bio->bi_private; 5949 struct btrfs_dio_private *dip = bio->bi_private;
5904 struct inode *inode = dip->inode; 5950 struct inode *inode = dip->inode;
5905 struct btrfs_root *root = BTRFS_I(inode)->root; 5951 struct btrfs_root *root = BTRFS_I(inode)->root;
5906 struct btrfs_trans_handle *trans;
5907 struct btrfs_ordered_extent *ordered = NULL; 5952 struct btrfs_ordered_extent *ordered = NULL;
5908 struct extent_state *cached_state = NULL;
5909 u64 ordered_offset = dip->logical_offset; 5953 u64 ordered_offset = dip->logical_offset;
5910 u64 ordered_bytes = dip->bytes; 5954 u64 ordered_bytes = dip->bytes;
5911 int ret; 5955 int ret;
@@ -5915,73 +5959,14 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5915again: 5959again:
5916 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, 5960 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
5917 &ordered_offset, 5961 &ordered_offset,
5918 ordered_bytes); 5962 ordered_bytes, !err);
5919 if (!ret) 5963 if (!ret)
5920 goto out_test; 5964 goto out_test;
5921 5965
5922 BUG_ON(!ordered); 5966 ordered->work.func = finish_ordered_fn;
5923 5967 ordered->work.flags = 0;
5924 trans = btrfs_join_transaction(root); 5968 btrfs_queue_worker(&root->fs_info->endio_write_workers,
5925 if (IS_ERR(trans)) { 5969 &ordered->work);
5926 err = -ENOMEM;
5927 goto out;
5928 }
5929 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5930
5931 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5932 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5933 if (!ret)
5934 err = btrfs_update_inode_fallback(trans, root, inode);
5935 goto out;
5936 }
5937
5938 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5939 ordered->file_offset + ordered->len - 1, 0,
5940 &cached_state);
5941
5942 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5943 ret = btrfs_mark_extent_written(trans, inode,
5944 ordered->file_offset,
5945 ordered->file_offset +
5946 ordered->len);
5947 if (ret) {
5948 err = ret;
5949 goto out_unlock;
5950 }
5951 } else {
5952 ret = insert_reserved_file_extent(trans, inode,
5953 ordered->file_offset,
5954 ordered->start,
5955 ordered->disk_len,
5956 ordered->len,
5957 ordered->len,
5958 0, 0, 0,
5959 BTRFS_FILE_EXTENT_REG);
5960 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5961 ordered->file_offset, ordered->len);
5962 if (ret) {
5963 err = ret;
5964 WARN_ON(1);
5965 goto out_unlock;
5966 }
5967 }
5968
5969 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5970 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5971 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5972 btrfs_update_inode_fallback(trans, root, inode);
5973 ret = 0;
5974out_unlock:
5975 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5976 ordered->file_offset + ordered->len - 1,
5977 &cached_state, GFP_NOFS);
5978out:
5979 btrfs_delalloc_release_metadata(inode, ordered->len);
5980 btrfs_end_transaction(trans, root);
5981 ordered_offset = ordered->file_offset + ordered->len;
5982 btrfs_put_ordered_extent(ordered);
5983 btrfs_put_ordered_extent(ordered);
5984
5985out_test: 5970out_test:
5986 /* 5971 /*
5987 * our bio might span multiple ordered extents. If we haven't 5972 * our bio might span multiple ordered extents. If we haven't
@@ -5990,12 +5975,12 @@ out_test:
5990 if (ordered_offset < dip->logical_offset + dip->bytes) { 5975 if (ordered_offset < dip->logical_offset + dip->bytes) {
5991 ordered_bytes = dip->logical_offset + dip->bytes - 5976 ordered_bytes = dip->logical_offset + dip->bytes -
5992 ordered_offset; 5977 ordered_offset;
5978 ordered = NULL;
5993 goto again; 5979 goto again;
5994 } 5980 }
5995out_done: 5981out_done:
5996 bio->bi_private = dip->private; 5982 bio->bi_private = dip->private;
5997 5983
5998 kfree(dip->csums);
5999 kfree(dip); 5984 kfree(dip);
6000 5985
6001 /* If we had an error make sure to clear the uptodate flag */ 5986 /* If we had an error make sure to clear the uptodate flag */
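
The rewritten completion path above no longer joins a transaction or walks the extent tree from bio end-io context; it only decrements the pending byte count and, once an ordered extent is fully written, queues it to the endio_write_workers pool through the work member added to struct btrfs_ordered_extent below. The worker callback itself is outside these hunks; a minimal sketch of what finish_ordered_fn presumably looks like, given the one-argument btrfs_finish_ordered_io() call visible in the invalidatepage hunk further down:

static void finish_ordered_fn(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered_extent;

	/* recover the ordered extent that embeds this work item */
	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
	/* process context: joining a transaction is safe from here */
	btrfs_finish_ordered_io(ordered_extent);
}
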
@@ -6063,9 +6048,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6063 int ret; 6048 int ret;
6064 6049
6065 bio_get(bio); 6050 bio_get(bio);
6066 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 6051
6067 if (ret) 6052 if (!write) {
6068 goto err; 6053 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
6054 if (ret)
6055 goto err;
6056 }
6069 6057
6070 if (skip_sum) 6058 if (skip_sum)
6071 goto map; 6059 goto map;
@@ -6485,13 +6473,13 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
6485 6473
6486static void btrfs_invalidatepage(struct page *page, unsigned long offset) 6474static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6487{ 6475{
6476 struct inode *inode = page->mapping->host;
6488 struct extent_io_tree *tree; 6477 struct extent_io_tree *tree;
6489 struct btrfs_ordered_extent *ordered; 6478 struct btrfs_ordered_extent *ordered;
6490 struct extent_state *cached_state = NULL; 6479 struct extent_state *cached_state = NULL;
6491 u64 page_start = page_offset(page); 6480 u64 page_start = page_offset(page);
6492 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 6481 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
6493 6482
6494
6495 /* 6483 /*
6496 * we have the page locked, so new writeback can't start, 6484 * we have the page locked, so new writeback can't start,
6497 * and the dirty bit won't be cleared while we are here. 6485 * and the dirty bit won't be cleared while we are here.
@@ -6501,13 +6489,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6501 */ 6489 */
6502 wait_on_page_writeback(page); 6490 wait_on_page_writeback(page);
6503 6491
6504 tree = &BTRFS_I(page->mapping->host)->io_tree; 6492 tree = &BTRFS_I(inode)->io_tree;
6505 if (offset) { 6493 if (offset) {
6506 btrfs_releasepage(page, GFP_NOFS); 6494 btrfs_releasepage(page, GFP_NOFS);
6507 return; 6495 return;
6508 } 6496 }
6509 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 6497 lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
6510 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 6498 ordered = btrfs_lookup_ordered_extent(inode,
6511 page_offset(page)); 6499 page_offset(page));
6512 if (ordered) { 6500 if (ordered) {
6513 /* 6501 /*
@@ -6522,9 +6510,10 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6522 * whoever cleared the private bit is responsible 6510 * whoever cleared the private bit is responsible
6523 * for the finish_ordered_io 6511 * for the finish_ordered_io
6524 */ 6512 */
6525 if (TestClearPagePrivate2(page)) { 6513 if (TestClearPagePrivate2(page) &&
6526 btrfs_finish_ordered_io(page->mapping->host, 6514 btrfs_dec_test_ordered_pending(inode, &ordered, page_start,
6527 page_start, page_end); 6515 PAGE_CACHE_SIZE, 1)) {
6516 btrfs_finish_ordered_io(ordered);
6528 } 6517 }
6529 btrfs_put_ordered_extent(ordered); 6518 btrfs_put_ordered_extent(ordered);
6530 cached_state = NULL; 6519 cached_state = NULL;
@@ -6771,7 +6760,8 @@ static int btrfs_truncate(struct inode *inode)
6771 * using truncate to replace the contents of the file will 6760 * using truncate to replace the contents of the file will
6772 * end up with a zero length file after a crash. 6761 * end up with a zero length file after a crash.
6773 */ 6762 */
6774 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) 6763 if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
6764 &BTRFS_I(inode)->runtime_flags))
6775 btrfs_add_ordered_operation(trans, root, inode); 6765 btrfs_add_ordered_operation(trans, root, inode);
6776 6766
6777 while (1) { 6767 while (1) {
@@ -6894,7 +6884,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6894 ei->root = NULL; 6884 ei->root = NULL;
6895 ei->space_info = NULL; 6885 ei->space_info = NULL;
6896 ei->generation = 0; 6886 ei->generation = 0;
6897 ei->sequence = 0;
6898 ei->last_trans = 0; 6887 ei->last_trans = 0;
6899 ei->last_sub_trans = 0; 6888 ei->last_sub_trans = 0;
6900 ei->logged_trans = 0; 6889 ei->logged_trans = 0;
@@ -6909,11 +6898,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6909 ei->outstanding_extents = 0; 6898 ei->outstanding_extents = 0;
6910 ei->reserved_extents = 0; 6899 ei->reserved_extents = 0;
6911 6900
6912 ei->ordered_data_close = 0; 6901 ei->runtime_flags = 0;
6913 ei->orphan_meta_reserved = 0;
6914 ei->dummy_inode = 0;
6915 ei->in_defrag = 0;
6916 ei->delalloc_meta_reserved = 0;
6917 ei->force_compress = BTRFS_COMPRESS_NONE; 6902 ei->force_compress = BTRFS_COMPRESS_NONE;
6918 6903
6919 ei->delayed_node = NULL; 6904 ei->delayed_node = NULL;
@@ -6927,7 +6912,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6927 mutex_init(&ei->log_mutex); 6912 mutex_init(&ei->log_mutex);
6928 mutex_init(&ei->delalloc_mutex); 6913 mutex_init(&ei->delalloc_mutex);
6929 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6914 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
6930 INIT_LIST_HEAD(&ei->i_orphan);
6931 INIT_LIST_HEAD(&ei->delalloc_inodes); 6915 INIT_LIST_HEAD(&ei->delalloc_inodes);
6932 INIT_LIST_HEAD(&ei->ordered_operations); 6916 INIT_LIST_HEAD(&ei->ordered_operations);
6933 RB_CLEAR_NODE(&ei->rb_node); 6917 RB_CLEAR_NODE(&ei->rb_node);
@@ -6972,13 +6956,12 @@ void btrfs_destroy_inode(struct inode *inode)
6972 spin_unlock(&root->fs_info->ordered_extent_lock); 6956 spin_unlock(&root->fs_info->ordered_extent_lock);
6973 } 6957 }
6974 6958
6975 spin_lock(&root->orphan_lock); 6959 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
6976 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6960 &BTRFS_I(inode)->runtime_flags)) {
6977 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", 6961 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
6978 (unsigned long long)btrfs_ino(inode)); 6962 (unsigned long long)btrfs_ino(inode));
6979 list_del_init(&BTRFS_I(inode)->i_orphan); 6963 atomic_dec(&root->orphan_inodes);
6980 } 6964 }
6981 spin_unlock(&root->orphan_lock);
6982 6965
6983 while (1) { 6966 while (1) {
6984 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 6967 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -7193,6 +7176,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7193 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) 7176 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
7194 btrfs_add_ordered_operation(trans, root, old_inode); 7177 btrfs_add_ordered_operation(trans, root, old_inode);
7195 7178
7179 inode_inc_iversion(old_dir);
7180 inode_inc_iversion(new_dir);
7181 inode_inc_iversion(old_inode);
7196 old_dir->i_ctime = old_dir->i_mtime = ctime; 7182 old_dir->i_ctime = old_dir->i_mtime = ctime;
7197 new_dir->i_ctime = new_dir->i_mtime = ctime; 7183 new_dir->i_ctime = new_dir->i_mtime = ctime;
7198 old_inode->i_ctime = ctime; 7184 old_inode->i_ctime = ctime;
@@ -7219,6 +7205,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7219 } 7205 }
7220 7206
7221 if (new_inode) { 7207 if (new_inode) {
7208 inode_inc_iversion(new_inode);
7222 new_inode->i_ctime = CURRENT_TIME; 7209 new_inode->i_ctime = CURRENT_TIME;
7223 if (unlikely(btrfs_ino(new_inode) == 7210 if (unlikely(btrfs_ino(new_inode) ==
7224 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 7211 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@ -7490,6 +7477,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7490 cur_offset += ins.offset; 7477 cur_offset += ins.offset;
7491 *alloc_hint = ins.objectid + ins.offset; 7478 *alloc_hint = ins.objectid + ins.offset;
7492 7479
7480 inode_inc_iversion(inode);
7493 inode->i_ctime = CURRENT_TIME; 7481 inode->i_ctime = CURRENT_TIME;
7494 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 7482 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
7495 if (!(mode & FALLOC_FL_KEEP_SIZE) && 7483 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 14f8e1faa46e..24b776c08d99 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -261,6 +261,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
261 } 261 }
262 262
263 btrfs_update_iflags(inode); 263 btrfs_update_iflags(inode);
264 inode_inc_iversion(inode);
264 inode->i_ctime = CURRENT_TIME; 265 inode->i_ctime = CURRENT_TIME;
265 ret = btrfs_update_inode(trans, root, inode); 266 ret = btrfs_update_inode(trans, root, inode);
266 267
@@ -367,7 +368,7 @@ static noinline int create_subvol(struct btrfs_root *root,
367 return PTR_ERR(trans); 368 return PTR_ERR(trans);
368 369
369 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 370 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
370 0, objectid, NULL, 0, 0, 0, 0); 371 0, objectid, NULL, 0, 0, 0);
371 if (IS_ERR(leaf)) { 372 if (IS_ERR(leaf)) {
372 ret = PTR_ERR(leaf); 373 ret = PTR_ERR(leaf);
373 goto fail; 374 goto fail;
@@ -2262,10 +2263,12 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2262 di_args->bytes_used = dev->bytes_used; 2263 di_args->bytes_used = dev->bytes_used;
2263 di_args->total_bytes = dev->total_bytes; 2264 di_args->total_bytes = dev->total_bytes;
2264 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); 2265 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
2265 if (dev->name) 2266 if (dev->name) {
2266 strncpy(di_args->path, dev->name, sizeof(di_args->path)); 2267 strncpy(di_args->path, dev->name, sizeof(di_args->path));
2267 else 2268 di_args->path[sizeof(di_args->path) - 1] = 0;
2269 } else {
2268 di_args->path[0] = '\0'; 2270 di_args->path[0] = '\0';
2271 }
2269 2272
2270out: 2273out:
2271 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) 2274 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
@@ -2622,6 +2625,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2622 btrfs_mark_buffer_dirty(leaf); 2625 btrfs_mark_buffer_dirty(leaf);
2623 btrfs_release_path(path); 2626 btrfs_release_path(path);
2624 2627
2628 inode_inc_iversion(inode);
2625 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2629 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2626 2630
2627 /* 2631 /*
@@ -2914,7 +2918,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2914 up_read(&info->groups_sem); 2918 up_read(&info->groups_sem);
2915 } 2919 }
2916 2920
2917 user_dest = (struct btrfs_ioctl_space_info *) 2921 user_dest = (struct btrfs_ioctl_space_info __user *)
2918 (arg + sizeof(struct btrfs_ioctl_space_args)); 2922 (arg + sizeof(struct btrfs_ioctl_space_args));
2919 2923
2920 if (copy_to_user(user_dest, dest_orig, alloc_size)) 2924 if (copy_to_user(user_dest, dest_orig, alloc_size))
@@ -3042,6 +3046,28 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
3042 return ret; 3046 return ret;
3043} 3047}
3044 3048
3049static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3050 void __user *arg, int reset_after_read)
3051{
3052 struct btrfs_ioctl_get_dev_stats *sa;
3053 int ret;
3054
3055 if (reset_after_read && !capable(CAP_SYS_ADMIN))
3056 return -EPERM;
3057
3058 sa = memdup_user(arg, sizeof(*sa));
3059 if (IS_ERR(sa))
3060 return PTR_ERR(sa);
3061
3062 ret = btrfs_get_dev_stats(root, sa, reset_after_read);
3063
3064 if (copy_to_user(arg, sa, sizeof(*sa)))
3065 ret = -EFAULT;
3066
3067 kfree(sa);
3068 return ret;
3069}
3070
3045static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 3071static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3046{ 3072{
3047 int ret = 0; 3073 int ret = 0;
@@ -3212,8 +3238,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
3212 } 3238 }
3213} 3239}
3214 3240
3215static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) 3241static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3216{ 3242{
3243 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3217 struct btrfs_fs_info *fs_info = root->fs_info; 3244 struct btrfs_fs_info *fs_info = root->fs_info;
3218 struct btrfs_ioctl_balance_args *bargs; 3245 struct btrfs_ioctl_balance_args *bargs;
3219 struct btrfs_balance_control *bctl; 3246 struct btrfs_balance_control *bctl;
@@ -3225,6 +3252,10 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
3225 if (fs_info->sb->s_flags & MS_RDONLY) 3252 if (fs_info->sb->s_flags & MS_RDONLY)
3226 return -EROFS; 3253 return -EROFS;
3227 3254
3255 ret = mnt_want_write(file->f_path.mnt);
3256 if (ret)
3257 return ret;
3258
3228 mutex_lock(&fs_info->volume_mutex); 3259 mutex_lock(&fs_info->volume_mutex);
3229 mutex_lock(&fs_info->balance_mutex); 3260 mutex_lock(&fs_info->balance_mutex);
3230 3261
@@ -3291,6 +3322,7 @@ out_bargs:
3291out: 3322out:
3292 mutex_unlock(&fs_info->balance_mutex); 3323 mutex_unlock(&fs_info->balance_mutex);
3293 mutex_unlock(&fs_info->volume_mutex); 3324 mutex_unlock(&fs_info->volume_mutex);
3325 mnt_drop_write(file->f_path.mnt);
3294 return ret; 3326 return ret;
3295} 3327}
3296 3328
@@ -3386,7 +3418,7 @@ long btrfs_ioctl(struct file *file, unsigned int
3386 case BTRFS_IOC_DEV_INFO: 3418 case BTRFS_IOC_DEV_INFO:
3387 return btrfs_ioctl_dev_info(root, argp); 3419 return btrfs_ioctl_dev_info(root, argp);
3388 case BTRFS_IOC_BALANCE: 3420 case BTRFS_IOC_BALANCE:
3389 return btrfs_ioctl_balance(root, NULL); 3421 return btrfs_ioctl_balance(file, NULL);
3390 case BTRFS_IOC_CLONE: 3422 case BTRFS_IOC_CLONE:
3391 return btrfs_ioctl_clone(file, arg, 0, 0, 0); 3423 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
3392 case BTRFS_IOC_CLONE_RANGE: 3424 case BTRFS_IOC_CLONE_RANGE:
@@ -3419,11 +3451,15 @@ long btrfs_ioctl(struct file *file, unsigned int
3419 case BTRFS_IOC_SCRUB_PROGRESS: 3451 case BTRFS_IOC_SCRUB_PROGRESS:
3420 return btrfs_ioctl_scrub_progress(root, argp); 3452 return btrfs_ioctl_scrub_progress(root, argp);
3421 case BTRFS_IOC_BALANCE_V2: 3453 case BTRFS_IOC_BALANCE_V2:
3422 return btrfs_ioctl_balance(root, argp); 3454 return btrfs_ioctl_balance(file, argp);
3423 case BTRFS_IOC_BALANCE_CTL: 3455 case BTRFS_IOC_BALANCE_CTL:
3424 return btrfs_ioctl_balance_ctl(root, arg); 3456 return btrfs_ioctl_balance_ctl(root, arg);
3425 case BTRFS_IOC_BALANCE_PROGRESS: 3457 case BTRFS_IOC_BALANCE_PROGRESS:
3426 return btrfs_ioctl_balance_progress(root, argp); 3458 return btrfs_ioctl_balance_progress(root, argp);
3459 case BTRFS_IOC_GET_DEV_STATS:
3460 return btrfs_ioctl_get_dev_stats(root, argp, 0);
3461 case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
3462 return btrfs_ioctl_get_dev_stats(root, argp, 1);
3427 } 3463 }
3428 3464
3429 return -ENOTTY; 3465 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 086e6bdae1c4..497c530724cf 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -266,6 +266,35 @@ struct btrfs_ioctl_logical_ino_args {
266 __u64 inodes; 266 __u64 inodes;
267}; 267};
268 268
269enum btrfs_dev_stat_values {
270 /* disk I/O failure stats */
271 BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */
272 BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */
273 BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */
274
275 /* stats for indirect indications for I/O failures */
276 BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or
277 * contents are illegal: this is an
278 * indication that the block was damaged
279 * during read or write, or written to
280 * wrong location or read from wrong
281 * location */
282 BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not
283 * been written */
284
285 BTRFS_DEV_STAT_VALUES_MAX
286};
287
288struct btrfs_ioctl_get_dev_stats {
289 __u64 devid; /* in */
290 __u64 nr_items; /* in/out */
291
292 /* out values: */
293 __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
294
295 __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
296};
297
269#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 298#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
270 struct btrfs_ioctl_vol_args) 299 struct btrfs_ioctl_vol_args)
271#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 300#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -330,5 +359,9 @@ struct btrfs_ioctl_logical_ino_args {
330 struct btrfs_ioctl_ino_path_args) 359 struct btrfs_ioctl_ino_path_args)
331#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ 360#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
332 struct btrfs_ioctl_ino_path_args) 361 struct btrfs_ioctl_ino_path_args)
362#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
363 struct btrfs_ioctl_get_dev_stats)
364#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
365 struct btrfs_ioctl_get_dev_stats)
333 366
334#endif 367#endif
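
Taken together, the enum, the struct and the two ioctl numbers above form the complete userspace interface for the new device statistics. A minimal sketch of how a tool might read the counters (hypothetical example: it assumes these definitions are visible to userspace and trims most error handling):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
/* BTRFS_IOC_GET_DEV_STATS, BTRFS_DEV_STAT_VALUES_MAX and
 * struct btrfs_ioctl_get_dev_stats come from the header patched above */

static int print_dev_stats(const char *mnt, __u64 devid)
{
	struct btrfs_ioctl_get_dev_stats sa;
	int fd, i;

	fd = open(mnt, O_RDONLY);	/* any fd on the filesystem will do */
	if (fd < 0)
		return -1;
	memset(&sa, 0, sizeof(sa));
	sa.devid = devid;
	sa.nr_items = BTRFS_DEV_STAT_VALUES_MAX;	/* in/out */
	if (ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &sa) < 0) {
		close(fd);
		return -1;
	}
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		printf("stat %d: %llu\n", i,
		       (unsigned long long)sa.values[i]);
	close(fd);
	return 0;
}

Using BTRFS_IOC_GET_AND_RESET_DEV_STATS instead zeroes the counters after reading, which is why that variant is gated on CAP_SYS_ADMIN in btrfs_ioctl_get_dev_stats() above.
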
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index bbf6d0d9aebe..9e138cdc36c5 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -196,7 +196,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
196 entry->len = len; 196 entry->len = len;
197 entry->disk_len = disk_len; 197 entry->disk_len = disk_len;
198 entry->bytes_left = len; 198 entry->bytes_left = len;
199 entry->inode = inode; 199 entry->inode = igrab(inode);
200 entry->compress_type = compress_type; 200 entry->compress_type = compress_type;
201 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 201 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
202 set_bit(type, &entry->flags); 202 set_bit(type, &entry->flags);
@@ -212,12 +212,12 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
212 212
213 trace_btrfs_ordered_extent_add(inode, entry); 213 trace_btrfs_ordered_extent_add(inode, entry);
214 214
215 spin_lock(&tree->lock); 215 spin_lock_irq(&tree->lock);
216 node = tree_insert(&tree->tree, file_offset, 216 node = tree_insert(&tree->tree, file_offset,
217 &entry->rb_node); 217 &entry->rb_node);
218 if (node) 218 if (node)
219 ordered_data_tree_panic(inode, -EEXIST, file_offset); 219 ordered_data_tree_panic(inode, -EEXIST, file_offset);
220 spin_unlock(&tree->lock); 220 spin_unlock_irq(&tree->lock);
221 221
222 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 222 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
223 list_add_tail(&entry->root_extent_list, 223 list_add_tail(&entry->root_extent_list,
@@ -264,9 +264,9 @@ void btrfs_add_ordered_sum(struct inode *inode,
264 struct btrfs_ordered_inode_tree *tree; 264 struct btrfs_ordered_inode_tree *tree;
265 265
266 tree = &BTRFS_I(inode)->ordered_tree; 266 tree = &BTRFS_I(inode)->ordered_tree;
267 spin_lock(&tree->lock); 267 spin_lock_irq(&tree->lock);
268 list_add_tail(&sum->list, &entry->list); 268 list_add_tail(&sum->list, &entry->list);
269 spin_unlock(&tree->lock); 269 spin_unlock_irq(&tree->lock);
270} 270}
271 271
272/* 272/*
@@ -283,18 +283,19 @@ void btrfs_add_ordered_sum(struct inode *inode,
283 */ 283 */
284int btrfs_dec_test_first_ordered_pending(struct inode *inode, 284int btrfs_dec_test_first_ordered_pending(struct inode *inode,
285 struct btrfs_ordered_extent **cached, 285 struct btrfs_ordered_extent **cached,
286 u64 *file_offset, u64 io_size) 286 u64 *file_offset, u64 io_size, int uptodate)
287{ 287{
288 struct btrfs_ordered_inode_tree *tree; 288 struct btrfs_ordered_inode_tree *tree;
289 struct rb_node *node; 289 struct rb_node *node;
290 struct btrfs_ordered_extent *entry = NULL; 290 struct btrfs_ordered_extent *entry = NULL;
291 int ret; 291 int ret;
292 unsigned long flags;
292 u64 dec_end; 293 u64 dec_end;
293 u64 dec_start; 294 u64 dec_start;
294 u64 to_dec; 295 u64 to_dec;
295 296
296 tree = &BTRFS_I(inode)->ordered_tree; 297 tree = &BTRFS_I(inode)->ordered_tree;
297 spin_lock(&tree->lock); 298 spin_lock_irqsave(&tree->lock, flags);
298 node = tree_search(tree, *file_offset); 299 node = tree_search(tree, *file_offset);
299 if (!node) { 300 if (!node) {
300 ret = 1; 301 ret = 1;
@@ -323,6 +324,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
323 (unsigned long long)to_dec); 324 (unsigned long long)to_dec);
324 } 325 }
325 entry->bytes_left -= to_dec; 326 entry->bytes_left -= to_dec;
327 if (!uptodate)
328 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
329
326 if (entry->bytes_left == 0) 330 if (entry->bytes_left == 0)
327 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 331 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
328 else 332 else
@@ -332,7 +336,7 @@ out:
332 *cached = entry; 336 *cached = entry;
333 atomic_inc(&entry->refs); 337 atomic_inc(&entry->refs);
334 } 338 }
335 spin_unlock(&tree->lock); 339 spin_unlock_irqrestore(&tree->lock, flags);
336 return ret == 0; 340 return ret == 0;
337} 341}
338 342
@@ -347,15 +351,21 @@ out:
347 */ 351 */
348int btrfs_dec_test_ordered_pending(struct inode *inode, 352int btrfs_dec_test_ordered_pending(struct inode *inode,
349 struct btrfs_ordered_extent **cached, 353 struct btrfs_ordered_extent **cached,
350 u64 file_offset, u64 io_size) 354 u64 file_offset, u64 io_size, int uptodate)
351{ 355{
352 struct btrfs_ordered_inode_tree *tree; 356 struct btrfs_ordered_inode_tree *tree;
353 struct rb_node *node; 357 struct rb_node *node;
354 struct btrfs_ordered_extent *entry = NULL; 358 struct btrfs_ordered_extent *entry = NULL;
359 unsigned long flags;
355 int ret; 360 int ret;
356 361
357 tree = &BTRFS_I(inode)->ordered_tree; 362 tree = &BTRFS_I(inode)->ordered_tree;
358 spin_lock(&tree->lock); 363 spin_lock_irqsave(&tree->lock, flags);
364 if (cached && *cached) {
365 entry = *cached;
366 goto have_entry;
367 }
368
359 node = tree_search(tree, file_offset); 369 node = tree_search(tree, file_offset);
360 if (!node) { 370 if (!node) {
361 ret = 1; 371 ret = 1;
@@ -363,6 +373,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
363 } 373 }
364 374
365 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 375 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
376have_entry:
366 if (!offset_in_entry(entry, file_offset)) { 377 if (!offset_in_entry(entry, file_offset)) {
367 ret = 1; 378 ret = 1;
368 goto out; 379 goto out;
@@ -374,6 +385,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
374 (unsigned long long)io_size); 385 (unsigned long long)io_size);
375 } 386 }
376 entry->bytes_left -= io_size; 387 entry->bytes_left -= io_size;
388 if (!uptodate)
389 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
390
377 if (entry->bytes_left == 0) 391 if (entry->bytes_left == 0)
378 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 392 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
379 else 393 else
@@ -383,7 +397,7 @@ out:
383 *cached = entry; 397 *cached = entry;
384 atomic_inc(&entry->refs); 398 atomic_inc(&entry->refs);
385 } 399 }
386 spin_unlock(&tree->lock); 400 spin_unlock_irqrestore(&tree->lock, flags);
387 return ret == 0; 401 return ret == 0;
388} 402}
389 403
@@ -399,6 +413,8 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
399 trace_btrfs_ordered_extent_put(entry->inode, entry); 413 trace_btrfs_ordered_extent_put(entry->inode, entry);
400 414
401 if (atomic_dec_and_test(&entry->refs)) { 415 if (atomic_dec_and_test(&entry->refs)) {
416 if (entry->inode)
417 btrfs_add_delayed_iput(entry->inode);
402 while (!list_empty(&entry->list)) { 418 while (!list_empty(&entry->list)) {
403 cur = entry->list.next; 419 cur = entry->list.next;
404 sum = list_entry(cur, struct btrfs_ordered_sum, list); 420 sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -411,21 +427,22 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
411 427
412/* 428/*
413 * remove an ordered extent from the tree. No references are dropped 429 * remove an ordered extent from the tree. No references are dropped
414 * and you must wake_up entry->wait. You must hold the tree lock 430 * and waiters are woken up.
415 * while you call this function.
416 */ 431 */
417static void __btrfs_remove_ordered_extent(struct inode *inode, 432void btrfs_remove_ordered_extent(struct inode *inode,
418 struct btrfs_ordered_extent *entry) 433 struct btrfs_ordered_extent *entry)
419{ 434{
420 struct btrfs_ordered_inode_tree *tree; 435 struct btrfs_ordered_inode_tree *tree;
421 struct btrfs_root *root = BTRFS_I(inode)->root; 436 struct btrfs_root *root = BTRFS_I(inode)->root;
422 struct rb_node *node; 437 struct rb_node *node;
423 438
424 tree = &BTRFS_I(inode)->ordered_tree; 439 tree = &BTRFS_I(inode)->ordered_tree;
440 spin_lock_irq(&tree->lock);
425 node = &entry->rb_node; 441 node = &entry->rb_node;
426 rb_erase(node, &tree->tree); 442 rb_erase(node, &tree->tree);
427 tree->last = NULL; 443 tree->last = NULL;
428 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 444 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
445 spin_unlock_irq(&tree->lock);
429 446
430 spin_lock(&root->fs_info->ordered_extent_lock); 447 spin_lock(&root->fs_info->ordered_extent_lock);
431 list_del_init(&entry->root_extent_list); 448 list_del_init(&entry->root_extent_list);
@@ -442,21 +459,6 @@ static void __btrfs_remove_ordered_extent(struct inode *inode,
442 list_del_init(&BTRFS_I(inode)->ordered_operations); 459 list_del_init(&BTRFS_I(inode)->ordered_operations);
443 } 460 }
444 spin_unlock(&root->fs_info->ordered_extent_lock); 461 spin_unlock(&root->fs_info->ordered_extent_lock);
445}
446
447/*
448 * remove an ordered extent from the tree. No references are dropped
449 * but any waiters are woken.
450 */
451void btrfs_remove_ordered_extent(struct inode *inode,
452 struct btrfs_ordered_extent *entry)
453{
454 struct btrfs_ordered_inode_tree *tree;
455
456 tree = &BTRFS_I(inode)->ordered_tree;
457 spin_lock(&tree->lock);
458 __btrfs_remove_ordered_extent(inode, entry);
459 spin_unlock(&tree->lock);
460 wake_up(&entry->wait); 462 wake_up(&entry->wait);
461} 463}
462 464
@@ -621,19 +623,11 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
621 if (orig_end > INT_LIMIT(loff_t)) 623 if (orig_end > INT_LIMIT(loff_t))
622 orig_end = INT_LIMIT(loff_t); 624 orig_end = INT_LIMIT(loff_t);
623 } 625 }
624again: 626
625 /* start IO across the range first to instantiate any delalloc 627 /* start IO across the range first to instantiate any delalloc
626 * extents 628 * extents
627 */ 629 */
628 filemap_fdatawrite_range(inode->i_mapping, start, orig_end); 630 filemap_write_and_wait_range(inode->i_mapping, start, orig_end);
629
630 /* The compression code will leave pages locked but return from
631 * writepage without setting the page writeback. Starting again
632 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
633 */
634 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
635
636 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
637 631
638 end = orig_end; 632 end = orig_end;
639 found = 0; 633 found = 0;
@@ -657,11 +651,6 @@ again:
657 break; 651 break;
658 end--; 652 end--;
659 } 653 }
660 if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
661 EXTENT_DELALLOC, 0, NULL)) {
662 schedule_timeout(1);
663 goto again;
664 }
665} 654}
666 655
667/* 656/*
@@ -676,7 +665,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
676 struct btrfs_ordered_extent *entry = NULL; 665 struct btrfs_ordered_extent *entry = NULL;
677 666
678 tree = &BTRFS_I(inode)->ordered_tree; 667 tree = &BTRFS_I(inode)->ordered_tree;
679 spin_lock(&tree->lock); 668 spin_lock_irq(&tree->lock);
680 node = tree_search(tree, file_offset); 669 node = tree_search(tree, file_offset);
681 if (!node) 670 if (!node)
682 goto out; 671 goto out;
@@ -687,7 +676,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
687 if (entry) 676 if (entry)
688 atomic_inc(&entry->refs); 677 atomic_inc(&entry->refs);
689out: 678out:
690 spin_unlock(&tree->lock); 679 spin_unlock_irq(&tree->lock);
691 return entry; 680 return entry;
692} 681}
693 682
@@ -703,7 +692,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
703 struct btrfs_ordered_extent *entry = NULL; 692 struct btrfs_ordered_extent *entry = NULL;
704 693
705 tree = &BTRFS_I(inode)->ordered_tree; 694 tree = &BTRFS_I(inode)->ordered_tree;
706 spin_lock(&tree->lock); 695 spin_lock_irq(&tree->lock);
707 node = tree_search(tree, file_offset); 696 node = tree_search(tree, file_offset);
708 if (!node) { 697 if (!node) {
709 node = tree_search(tree, file_offset + len); 698 node = tree_search(tree, file_offset + len);
@@ -728,7 +717,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
728out: 717out:
729 if (entry) 718 if (entry)
730 atomic_inc(&entry->refs); 719 atomic_inc(&entry->refs);
731 spin_unlock(&tree->lock); 720 spin_unlock_irq(&tree->lock);
732 return entry; 721 return entry;
733} 722}
734 723
@@ -744,7 +733,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
744 struct btrfs_ordered_extent *entry = NULL; 733 struct btrfs_ordered_extent *entry = NULL;
745 734
746 tree = &BTRFS_I(inode)->ordered_tree; 735 tree = &BTRFS_I(inode)->ordered_tree;
747 spin_lock(&tree->lock); 736 spin_lock_irq(&tree->lock);
748 node = tree_search(tree, file_offset); 737 node = tree_search(tree, file_offset);
749 if (!node) 738 if (!node)
750 goto out; 739 goto out;
@@ -752,7 +741,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
752 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 741 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
753 atomic_inc(&entry->refs); 742 atomic_inc(&entry->refs);
754out: 743out:
755 spin_unlock(&tree->lock); 744 spin_unlock_irq(&tree->lock);
756 return entry; 745 return entry;
757} 746}
758 747
@@ -764,7 +753,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
764 struct btrfs_ordered_extent *ordered) 753 struct btrfs_ordered_extent *ordered)
765{ 754{
766 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 755 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
767 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
768 u64 disk_i_size; 756 u64 disk_i_size;
769 u64 new_i_size; 757 u64 new_i_size;
770 u64 i_size_test; 758 u64 i_size_test;
@@ -779,7 +767,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
779 else 767 else
780 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); 768 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
781 769
782 spin_lock(&tree->lock); 770 spin_lock_irq(&tree->lock);
783 disk_i_size = BTRFS_I(inode)->disk_i_size; 771 disk_i_size = BTRFS_I(inode)->disk_i_size;
784 772
785 /* truncate file */ 773 /* truncate file */
@@ -798,14 +786,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
798 } 786 }
799 787
800 /* 788 /*
801 * we can't update the disk_isize if there are delalloc bytes
802 * between disk_i_size and this ordered extent
803 */
804 if (test_range_bit(io_tree, disk_i_size, offset - 1,
805 EXTENT_DELALLOC, 0, NULL)) {
806 goto out;
807 }
808 /*
809 * walk backward from this ordered extent to disk_i_size. 789 * walk backward from this ordered extent to disk_i_size.
810 * if we find an ordered extent then we can't update disk i_size 790 * if we find an ordered extent then we can't update disk i_size
811 * yet 791 * yet
@@ -825,15 +805,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
825 } 805 }
826 node = prev; 806 node = prev;
827 } 807 }
828 while (node) { 808 for (; node; node = rb_prev(node)) {
829 test = rb_entry(node, struct btrfs_ordered_extent, rb_node); 809 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
810
811 /* We treat this entry as if it doesn't exist */
812 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
813 continue;
830 if (test->file_offset + test->len <= disk_i_size) 814 if (test->file_offset + test->len <= disk_i_size)
831 break; 815 break;
832 if (test->file_offset >= i_size) 816 if (test->file_offset >= i_size)
833 break; 817 break;
834 if (test->file_offset >= disk_i_size) 818 if (test->file_offset >= disk_i_size)
835 goto out; 819 goto out;
836 node = rb_prev(node);
837 } 820 }
838 new_i_size = min_t(u64, offset, i_size); 821 new_i_size = min_t(u64, offset, i_size);
839 822
@@ -851,43 +834,49 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
851 else 834 else
852 node = rb_first(&tree->tree); 835 node = rb_first(&tree->tree);
853 } 836 }
854 i_size_test = 0; 837
855 if (node) { 838 /*
856 /* 839 * We are looking for an area between our current extent and the next
857 * do we have an area where IO might have finished 840 * ordered extent to update the i_size to. There are 3 cases here
858 * between our ordered extent and the next one. 841 *
859 */ 842 * 1) We don't actually have anything and we can update to i_size.
843 * 2) We have stuff but they already did their i_size update so again we
844 * can just update to i_size.
845 * 3) We have an outstanding ordered extent so the most we can update
846 * our disk_i_size to is the start of the next offset.
847 */
848 i_size_test = i_size;
849 for (; node; node = rb_next(node)) {
860 test = rb_entry(node, struct btrfs_ordered_extent, rb_node); 850 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
861 if (test->file_offset > offset) 851
852 if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
853 continue;
854 if (test->file_offset > offset) {
862 i_size_test = test->file_offset; 855 i_size_test = test->file_offset;
863 } else { 856 break;
864 i_size_test = i_size; 857 }
865 } 858 }
866 859
867 /* 860 /*
868 * i_size_test is the end of a region after this ordered 861 * i_size_test is the end of a region after this ordered
869 * extent where there are no ordered extents. As long as there 862 * extent where there are no ordered extents, we can safely set
870 * are no delalloc bytes in this area, it is safe to update 863 * disk_i_size to this.
871 * disk_i_size to the end of the region.
872 */ 864 */
873 if (i_size_test > offset && 865 if (i_size_test > offset)
874 !test_range_bit(io_tree, offset, i_size_test - 1,
875 EXTENT_DELALLOC, 0, NULL)) {
876 new_i_size = min_t(u64, i_size_test, i_size); 866 new_i_size = min_t(u64, i_size_test, i_size);
877 }
878 BTRFS_I(inode)->disk_i_size = new_i_size; 867 BTRFS_I(inode)->disk_i_size = new_i_size;
879 ret = 0; 868 ret = 0;
880out: 869out:
881 /* 870 /*
882 * we need to remove the ordered extent with the tree lock held 871 * We need to do this because we can't remove ordered extents until
883 * so that other people calling this function don't find our fully 872 * after the disk_i_size has been updated and then the inode has been
884 * processed ordered entry and skip updating the i_size 873 * updated to reflect the change, so we need to tell anybody who finds
874 * this ordered extent that we've already done all the real work, we
875 * just haven't completed all the other work.
885 */ 876 */
886 if (ordered) 877 if (ordered)
887 __btrfs_remove_ordered_extent(inode, ordered); 878 set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
888 spin_unlock(&tree->lock); 879 spin_unlock_irq(&tree->lock);
889 if (ordered)
890 wake_up(&ordered->wait);
891 return ret; 880 return ret;
892} 881}
893 882
@@ -912,7 +901,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
912 if (!ordered) 901 if (!ordered)
913 return 1; 902 return 1;
914 903
915 spin_lock(&tree->lock); 904 spin_lock_irq(&tree->lock);
916 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { 905 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
917 if (disk_bytenr >= ordered_sum->bytenr) { 906 if (disk_bytenr >= ordered_sum->bytenr) {
918 num_sectors = ordered_sum->len / sectorsize; 907 num_sectors = ordered_sum->len / sectorsize;
@@ -927,7 +916,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
927 } 916 }
928 } 917 }
929out: 918out:
930 spin_unlock(&tree->lock); 919 spin_unlock_irq(&tree->lock);
931 btrfs_put_ordered_extent(ordered); 920 btrfs_put_ordered_extent(ordered);
932 return ret; 921 return ret;
933} 922}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c355ad4dc1a6..e03c560d2997 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -74,6 +74,12 @@ struct btrfs_ordered_sum {
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ 75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76 76
77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
78
79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
80 * has done its due diligence in updating
81 * the isize. */
82
77struct btrfs_ordered_extent { 83struct btrfs_ordered_extent {
78 /* logical offset in the file */ 84 /* logical offset in the file */
79 u64 file_offset; 85 u64 file_offset;
@@ -113,6 +119,8 @@ struct btrfs_ordered_extent {
113 119
114 /* a per root list of all the pending ordered extents */ 120 /* a per root list of all the pending ordered extents */
115 struct list_head root_extent_list; 121 struct list_head root_extent_list;
122
123 struct btrfs_work work;
116}; 124};
117 125
118 126
@@ -143,10 +151,11 @@ void btrfs_remove_ordered_extent(struct inode *inode,
143 struct btrfs_ordered_extent *entry); 151 struct btrfs_ordered_extent *entry);
144int btrfs_dec_test_ordered_pending(struct inode *inode, 152int btrfs_dec_test_ordered_pending(struct inode *inode,
145 struct btrfs_ordered_extent **cached, 153 struct btrfs_ordered_extent **cached,
146 u64 file_offset, u64 io_size); 154 u64 file_offset, u64 io_size, int uptodate);
147int btrfs_dec_test_first_ordered_pending(struct inode *inode, 155int btrfs_dec_test_first_ordered_pending(struct inode *inode,
148 struct btrfs_ordered_extent **cached, 156 struct btrfs_ordered_extent **cached,
149 u64 *file_offset, u64 io_size); 157 u64 *file_offset, u64 io_size,
158 int uptodate);
150int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 159int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
151 u64 start, u64 len, u64 disk_len, int type); 160 u64 start, u64 len, u64 disk_len, int type);
152int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 161int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
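
Both decrement helpers now take an uptodate flag, so bio completion status propagates into the ordered extent: a failed write sets BTRFS_ORDERED_IOERR instead of letting the range complete as if it were clean. A sketch of the resulting caller pattern, modeled on the invalidatepage hunk in inode.c above:

	struct btrfs_ordered_extent *ordered = NULL;

	/* drop 'len' bytes from the pending count; 'uptodate' comes
	 * straight from the bio's error status */
	if (btrfs_dec_test_ordered_pending(inode, &ordered, file_offset,
					   len, uptodate))
		btrfs_finish_ordered_io(ordered);	/* last pending chunk */
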
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f38e452486b8..5e23684887eb 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -294,6 +294,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
294 btrfs_dev_extent_chunk_offset(l, dev_extent), 294 btrfs_dev_extent_chunk_offset(l, dev_extent),
295 (unsigned long long) 295 (unsigned long long)
296 btrfs_dev_extent_length(l, dev_extent)); 296 btrfs_dev_extent_length(l, dev_extent));
297 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n");
299 break;
297 }; 300 };
298 } 301 }
299} 302}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index ac5d01085884..48a4882d8ad5 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -718,13 +718,18 @@ static void reada_start_machine_worker(struct btrfs_work *work)
718{ 718{
719 struct reada_machine_work *rmw; 719 struct reada_machine_work *rmw;
720 struct btrfs_fs_info *fs_info; 720 struct btrfs_fs_info *fs_info;
721 int old_ioprio;
721 722
722 rmw = container_of(work, struct reada_machine_work, work); 723 rmw = container_of(work, struct reada_machine_work, work);
723 fs_info = rmw->fs_info; 724 fs_info = rmw->fs_info;
724 725
725 kfree(rmw); 726 kfree(rmw);
726 727
728 old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
729 task_nice_ioprio(current));
730 set_task_ioprio(current, BTRFS_IOPRIO_READA);
727 __reada_start_machine(fs_info); 731 __reada_start_machine(fs_info);
732 set_task_ioprio(current, old_ioprio);
728} 733}
729 734
730static void __reada_start_machine(struct btrfs_fs_info *fs_info) 735static void __reada_start_machine(struct btrfs_fs_info *fs_info)
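
BTRFS_IOPRIO_READA is not defined in this hunk; presumably (an assumption, since the ctree.h change is not shown here) it selects the idle I/O class so that readahead never competes with foreground I/O:

/* assumed definition, added elsewhere in this series:
 *   #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
 * The worker saves the caller's priority, runs the readahead state
 * machine at idle priority, then restores what it found.
 */
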
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2f3d6f917fb3..a38cfa4f251e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -50,7 +50,7 @@ struct scrub_dev;
50struct scrub_page { 50struct scrub_page {
51 struct scrub_block *sblock; 51 struct scrub_block *sblock;
52 struct page *page; 52 struct page *page;
53 struct block_device *bdev; 53 struct btrfs_device *dev;
54 u64 flags; /* extent flags */ 54 u64 flags; /* extent flags */
55 u64 generation; 55 u64 generation;
56 u64 logical; 56 u64 logical;
@@ -86,6 +86,7 @@ struct scrub_block {
86 unsigned int header_error:1; 86 unsigned int header_error:1;
87 unsigned int checksum_error:1; 87 unsigned int checksum_error:1;
88 unsigned int no_io_error_seen:1; 88 unsigned int no_io_error_seen:1;
89 unsigned int generation_error:1; /* also sets header_error */
89 }; 90 };
90}; 91};
91 92
@@ -675,6 +676,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
675 sdev->stat.read_errors++; 676 sdev->stat.read_errors++;
676 sdev->stat.uncorrectable_errors++; 677 sdev->stat.uncorrectable_errors++;
677 spin_unlock(&sdev->stat_lock); 678 spin_unlock(&sdev->stat_lock);
679 btrfs_dev_stat_inc_and_print(sdev->dev,
680 BTRFS_DEV_STAT_READ_ERRS);
678 goto out; 681 goto out;
679 } 682 }
680 683
@@ -686,6 +689,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
686 sdev->stat.read_errors++; 689 sdev->stat.read_errors++;
687 sdev->stat.uncorrectable_errors++; 690 sdev->stat.uncorrectable_errors++;
688 spin_unlock(&sdev->stat_lock); 691 spin_unlock(&sdev->stat_lock);
692 btrfs_dev_stat_inc_and_print(sdev->dev,
693 BTRFS_DEV_STAT_READ_ERRS);
689 goto out; 694 goto out;
690 } 695 }
691 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); 696 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
@@ -699,6 +704,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
699 sdev->stat.read_errors++; 704 sdev->stat.read_errors++;
700 sdev->stat.uncorrectable_errors++; 705 sdev->stat.uncorrectable_errors++;
701 spin_unlock(&sdev->stat_lock); 706 spin_unlock(&sdev->stat_lock);
707 btrfs_dev_stat_inc_and_print(sdev->dev,
708 BTRFS_DEV_STAT_READ_ERRS);
702 goto out; 709 goto out;
703 } 710 }
704 711
@@ -725,12 +732,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
725 spin_unlock(&sdev->stat_lock); 732 spin_unlock(&sdev->stat_lock);
726 if (__ratelimit(&_rs)) 733 if (__ratelimit(&_rs))
727 scrub_print_warning("i/o error", sblock_to_check); 734 scrub_print_warning("i/o error", sblock_to_check);
735 btrfs_dev_stat_inc_and_print(sdev->dev,
736 BTRFS_DEV_STAT_READ_ERRS);
728 } else if (sblock_bad->checksum_error) { 737 } else if (sblock_bad->checksum_error) {
729 spin_lock(&sdev->stat_lock); 738 spin_lock(&sdev->stat_lock);
730 sdev->stat.csum_errors++; 739 sdev->stat.csum_errors++;
731 spin_unlock(&sdev->stat_lock); 740 spin_unlock(&sdev->stat_lock);
732 if (__ratelimit(&_rs)) 741 if (__ratelimit(&_rs))
733 scrub_print_warning("checksum error", sblock_to_check); 742 scrub_print_warning("checksum error", sblock_to_check);
743 btrfs_dev_stat_inc_and_print(sdev->dev,
744 BTRFS_DEV_STAT_CORRUPTION_ERRS);
734 } else if (sblock_bad->header_error) { 745 } else if (sblock_bad->header_error) {
735 spin_lock(&sdev->stat_lock); 746 spin_lock(&sdev->stat_lock);
736 sdev->stat.verify_errors++; 747 sdev->stat.verify_errors++;
@@ -738,6 +749,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
738 if (__ratelimit(&_rs)) 749 if (__ratelimit(&_rs))
739 scrub_print_warning("checksum/header error", 750 scrub_print_warning("checksum/header error",
740 sblock_to_check); 751 sblock_to_check);
752 if (sblock_bad->generation_error)
753 btrfs_dev_stat_inc_and_print(sdev->dev,
754 BTRFS_DEV_STAT_GENERATION_ERRS);
755 else
756 btrfs_dev_stat_inc_and_print(sdev->dev,
757 BTRFS_DEV_STAT_CORRUPTION_ERRS);
741 } 758 }
742 759
743 if (sdev->readonly) 760 if (sdev->readonly)
@@ -998,8 +1015,8 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
998 page = sblock->pagev + page_index; 1015 page = sblock->pagev + page_index;
999 page->logical = logical; 1016 page->logical = logical;
1000 page->physical = bbio->stripes[mirror_index].physical; 1017 page->physical = bbio->stripes[mirror_index].physical;
1001 /* for missing devices, bdev is NULL */ 1018 /* for missing devices, dev->bdev is NULL */
1002 page->bdev = bbio->stripes[mirror_index].dev->bdev; 1019 page->dev = bbio->stripes[mirror_index].dev;
1003 page->mirror_num = mirror_index + 1; 1020 page->mirror_num = mirror_index + 1;
1004 page->page = alloc_page(GFP_NOFS); 1021 page->page = alloc_page(GFP_NOFS);
1005 if (!page->page) { 1022 if (!page->page) {
@@ -1043,7 +1060,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1043 struct scrub_page *page = sblock->pagev + page_num; 1060 struct scrub_page *page = sblock->pagev + page_num;
1044 DECLARE_COMPLETION_ONSTACK(complete); 1061 DECLARE_COMPLETION_ONSTACK(complete);
1045 1062
1046 if (page->bdev == NULL) { 1063 if (page->dev->bdev == NULL) {
1047 page->io_error = 1; 1064 page->io_error = 1;
1048 sblock->no_io_error_seen = 0; 1065 sblock->no_io_error_seen = 0;
1049 continue; 1066 continue;
@@ -1053,7 +1070,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1053 bio = bio_alloc(GFP_NOFS, 1); 1070 bio = bio_alloc(GFP_NOFS, 1);
1054 if (!bio) 1071 if (!bio)
1055 return -EIO; 1072 return -EIO;
1056 bio->bi_bdev = page->bdev; 1073 bio->bi_bdev = page->dev->bdev;
1057 bio->bi_sector = page->physical >> 9; 1074 bio->bi_sector = page->physical >> 9;
1058 bio->bi_end_io = scrub_complete_bio_end_io; 1075 bio->bi_end_io = scrub_complete_bio_end_io;
1059 bio->bi_private = &complete; 1076 bio->bi_private = &complete;
@@ -1102,11 +1119,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1102 h = (struct btrfs_header *)mapped_buffer; 1119 h = (struct btrfs_header *)mapped_buffer;
1103 1120
1104 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1121 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
1105 generation != le64_to_cpu(h->generation) ||
1106 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1122 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1107 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1123 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1108 BTRFS_UUID_SIZE)) 1124 BTRFS_UUID_SIZE)) {
1109 sblock->header_error = 1; 1125 sblock->header_error = 1;
1126 } else if (generation != le64_to_cpu(h->generation)) {
1127 sblock->header_error = 1;
1128 sblock->generation_error = 1;
1129 }
1110 csum = h->csum; 1130 csum = h->csum;
1111 } else { 1131 } else {
1112 if (!have_csum) 1132 if (!have_csum)
@@ -1182,7 +1202,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1182 bio = bio_alloc(GFP_NOFS, 1); 1202 bio = bio_alloc(GFP_NOFS, 1);
1183 if (!bio) 1203 if (!bio)
1184 return -EIO; 1204 return -EIO;
1185 bio->bi_bdev = page_bad->bdev; 1205 bio->bi_bdev = page_bad->dev->bdev;
1186 bio->bi_sector = page_bad->physical >> 9; 1206 bio->bi_sector = page_bad->physical >> 9;
1187 bio->bi_end_io = scrub_complete_bio_end_io; 1207 bio->bi_end_io = scrub_complete_bio_end_io;
1188 bio->bi_private = &complete; 1208 bio->bi_private = &complete;
@@ -1196,6 +1216,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1196 1216
1197 /* this will also unplug the queue */ 1217 /* this will also unplug the queue */
1198 wait_for_completion(&complete); 1218 wait_for_completion(&complete);
1219 if (!bio_flagged(bio, BIO_UPTODATE)) {
1220 btrfs_dev_stat_inc_and_print(page_bad->dev,
1221 BTRFS_DEV_STAT_WRITE_ERRS);
1222 bio_put(bio);
1223 return -EIO;
1224 }
1199 bio_put(bio); 1225 bio_put(bio);
1200 } 1226 }
1201 1227
@@ -1352,7 +1378,8 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1352 u64 mapped_size; 1378 u64 mapped_size;
1353 void *p; 1379 void *p;
1354 u32 crc = ~(u32)0; 1380 u32 crc = ~(u32)0;
1355 int fail = 0; 1381 int fail_gen = 0;
1382 int fail_cor = 0;
1356 u64 len; 1383 u64 len;
1357 int index; 1384 int index;
1358 1385
@@ -1363,13 +1390,13 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1363 memcpy(on_disk_csum, s->csum, sdev->csum_size); 1390 memcpy(on_disk_csum, s->csum, sdev->csum_size);
1364 1391
1365 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1392 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
1366 ++fail; 1393 ++fail_cor;
1367 1394
1368 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1395 if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
1369 ++fail; 1396 ++fail_gen;
1370 1397
1371 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1398 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1372 ++fail; 1399 ++fail_cor;
1373 1400
1374 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; 1401 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1375 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1402 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1394,9 +1421,9 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1394 1421
1395 btrfs_csum_final(crc, calculated_csum); 1422 btrfs_csum_final(crc, calculated_csum);
1396 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1423 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
1397 ++fail; 1424 ++fail_cor;
1398 1425
1399 if (fail) { 1426 if (fail_cor + fail_gen) {
1400 /* 1427 /*
1401 * if we find an error in a super block, we just report it. 1428 * if we find an error in a super block, we just report it.
1402 * They will get written with the next transaction commit 1429 * They will get written with the next transaction commit
@@ -1405,9 +1432,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1405 spin_lock(&sdev->stat_lock); 1432 spin_lock(&sdev->stat_lock);
1406 ++sdev->stat.super_errors; 1433 ++sdev->stat.super_errors;
1407 spin_unlock(&sdev->stat_lock); 1434 spin_unlock(&sdev->stat_lock);
1435 if (fail_cor)
1436 btrfs_dev_stat_inc_and_print(sdev->dev,
1437 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1438 else
1439 btrfs_dev_stat_inc_and_print(sdev->dev,
1440 BTRFS_DEV_STAT_GENERATION_ERRS);
1408 } 1441 }
1409 1442
1410 return fail; 1443 return fail_cor + fail_gen;
1411} 1444}
1412 1445
1413static void scrub_block_get(struct scrub_block *sblock) 1446static void scrub_block_get(struct scrub_block *sblock)
@@ -1551,7 +1584,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1551 return -ENOMEM; 1584 return -ENOMEM;
1552 } 1585 }
1553 spage->sblock = sblock; 1586 spage->sblock = sblock;
1554 spage->bdev = sdev->dev->bdev; 1587 spage->dev = sdev->dev;
1555 spage->flags = flags; 1588 spage->flags = flags;
1556 spage->generation = gen; 1589 spage->generation = gen;
1557 spage->logical = logical; 1590 spage->logical = logical;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c5f8fca4195f..96eb9fef7bd2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -188,7 +188,8 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
188 va_start(args, fmt); 188 va_start(args, fmt);
189 189
190 if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') { 190 if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') {
191 strncpy(lvl, fmt, 3); 191 memcpy(lvl, fmt, 3);
192 lvl[3] = '\0';
192 fmt += 3; 193 fmt += 3;
193 type = logtypes[fmt[1] - '0']; 194 type = logtypes[fmt[1] - '0'];
194 } else 195 } else
@@ -435,11 +436,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
435 case Opt_thread_pool: 436 case Opt_thread_pool:
436 intarg = 0; 437 intarg = 0;
437 match_int(&args[0], &intarg); 438 match_int(&args[0], &intarg);
438 if (intarg) { 439 if (intarg)
439 info->thread_pool_size = intarg; 440 info->thread_pool_size = intarg;
440 printk(KERN_INFO "btrfs: thread pool %d\n",
441 info->thread_pool_size);
442 }
443 break; 441 break;
444 case Opt_max_inline: 442 case Opt_max_inline:
445 num = match_strdup(&args[0]); 443 num = match_strdup(&args[0]);
@@ -769,7 +767,7 @@ static int btrfs_fill_super(struct super_block *sb,
769#ifdef CONFIG_BTRFS_FS_POSIX_ACL 767#ifdef CONFIG_BTRFS_FS_POSIX_ACL
770 sb->s_flags |= MS_POSIXACL; 768 sb->s_flags |= MS_POSIXACL;
771#endif 769#endif
772 770 sb->s_flags |= MS_I_VERSION;
773 err = open_ctree(sb, fs_devices, (char *)data); 771 err = open_ctree(sb, fs_devices, (char *)data);
774 if (err) { 772 if (err) {
775 printk("btrfs: open_ctree failed\n"); 773 printk("btrfs: open_ctree failed\n");
@@ -925,63 +923,48 @@ static inline int is_subvolume_inode(struct inode *inode)
  */
 static char *setup_root_args(char *args)
 {
-    unsigned copied = 0;
-    unsigned len = strlen(args) + 2;
-    char *pos;
-    char *ret;
+    unsigned len = strlen(args) + 2 + 1;
+    char *src, *dst, *buf;
 
     /*
-     * We need the same args as before, but minus
-     *
-     * subvol=a
-     *
-     * and add
-     *
-     * subvolid=0
+     * We need the same args as before, but with this substitution:
+     * s!subvol=[^,]+!subvolid=0!
      *
-     * which is a difference of 2 characters, so we allocate strlen(args) +
-     * 2 characters.
+     * Since the replacement string is up to 2 bytes longer than the
+     * original, allocate strlen(args) + 2 + 1 bytes.
      */
-    ret = kzalloc(len * sizeof(char), GFP_NOFS);
-    if (!ret)
-        return NULL;
-    pos = strstr(args, "subvol=");
 
+    src = strstr(args, "subvol=");
     /* This shouldn't happen, but just in case.. */
-    if (!pos) {
-        kfree(ret);
+    if (!src)
+        return NULL;
+
+    buf = dst = kmalloc(len, GFP_NOFS);
+    if (!buf)
         return NULL;
-    }
 
     /*
-     * The subvol=<> arg is not at the front of the string, copy everybody
-     * up to that into ret.
+     * If the subvol= arg is not at the start of the string,
+     * copy whatever precedes it into buf.
      */
-    if (pos != args) {
-        *pos = '\0';
-        strcpy(ret, args);
-        copied += strlen(args);
-        pos++;
+    if (src != args) {
+        *src++ = '\0';
+        strcpy(buf, args);
+        dst += strlen(args);
     }
 
-    strncpy(ret + copied, "subvolid=0", len - copied);
-
-    /* Length of subvolid=0 */
-    copied += 10;
+    strcpy(dst, "subvolid=0");
+    dst += strlen("subvolid=0");
 
     /*
-     * If there is no , after the subvol= option then we know there's no
-     * other options and we can just return.
+     * If there is a "," after the original subvol=... string,
+     * copy that suffix into our buffer. Otherwise, we're done.
      */
-    pos = strchr(pos, ',');
-    if (!pos)
-        return ret;
-
-    /* Copy the rest of the arguments into our buffer */
-    strncpy(ret + copied, pos, len - copied);
-    copied += strlen(pos);
-
-    return ret;
+    src = strchr(src, ',');
+    if (src)
+        strcpy(dst, src);
+
+    return buf;
 }
 
 static struct dentry *mount_subvol(const char *subvol_name, int flags,
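
For illustration, the substitution setup_root_args() performs can be exercised in isolation. This userspace sketch mirrors the new algorithm, with malloc() standing in for kmalloc(GFP_NOFS) and replace_subvol as a hypothetical name:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Perform s!subvol=[^,]+!subvolid=0! on a mutable option string. */
static char *replace_subvol(char *args)
{
    size_t len = strlen(args) + 2 + 1; /* replacement may grow by 2 */
    char *src, *dst, *buf;

    src = strstr(args, "subvol=");
    if (!src)
        return NULL;
    buf = dst = malloc(len);
    if (!buf)
        return NULL;
    if (src != args) {
        *src++ = '\0';            /* cut args just before "subvol=" */
        strcpy(buf, args);
        dst += strlen(args);
    }
    strcpy(dst, "subvolid=0");
    dst += strlen("subvolid=0");
    src = strchr(src, ',');       /* suffix after subvol=<name>, if any */
    if (src)
        strcpy(dst, src);
    return buf;
}

int main(void)
{
    char args[] = "noatime,subvol=snap,compress=lzo";
    char *out = replace_subvol(args);
    printf("%s\n", out);          /* noatime,subvolid=0,compress=lzo */
    free(out);
    return 0;
}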
@@ -1118,6 +1101,40 @@ error_fs_info:
     return ERR_PTR(error);
 }
 
+static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
+{
+    spin_lock_irq(&workers->lock);
+    workers->max_workers = new_limit;
+    spin_unlock_irq(&workers->lock);
+}
+
+static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
+                                     int new_pool_size, int old_pool_size)
+{
+    if (new_pool_size == old_pool_size)
+        return;
+
+    fs_info->thread_pool_size = new_pool_size;
+
+    printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n",
+           old_pool_size, new_pool_size);
+
+    btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
+    btrfs_set_max_workers(&fs_info->workers, new_pool_size);
+    btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size);
+    btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size);
+    btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size);
+    btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size);
+    btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size);
+    btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size);
+    btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size);
+    btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size);
+    btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
+    btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
+    btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
+    btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size);
+}
+
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 {
     struct btrfs_fs_info *fs_info = btrfs_sb(sb);
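
The helper above publishes each pool's new limit under that pool's own lock (spin_lock_irq() in the kernel version), so workers never observe a torn update. A minimal userspace analogue of the same pattern, assuming a pthread mutex in place of the spinlock; the names are hypothetical:

#include <pthread.h>
#include <stdio.h>

struct worker_pool {
    pthread_mutex_t lock;
    int max_workers;          /* soft limit consulted when queueing work */
};

/* Publish a new concurrency limit under the pool's lock so running
 * workers always read a consistent value. */
static void pool_set_max_workers(struct worker_pool *pool, int new_limit)
{
    pthread_mutex_lock(&pool->lock);
    pool->max_workers = new_limit;
    pthread_mutex_unlock(&pool->lock);
}

int main(void)
{
    struct worker_pool pool = { PTHREAD_MUTEX_INITIALIZER, 4 };
    pool_set_max_workers(&pool, 8);
    printf("max_workers=%d\n", pool.max_workers);
    return 0;
}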
@@ -1137,6 +1154,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
         goto restore;
     }
 
+    btrfs_resize_thread_pool(fs_info,
+        fs_info->thread_pool_size, old_thread_pool_size);
+
     if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
         return 0;
 
@@ -1180,7 +1200,8 @@ restore:
     fs_info->compress_type = old_compress_type;
     fs_info->max_inline = old_max_inline;
     fs_info->alloc_start = old_alloc_start;
-    fs_info->thread_pool_size = old_thread_pool_size;
+    btrfs_resize_thread_pool(fs_info,
+        old_thread_pool_size, fs_info->thread_pool_size);
     fs_info->metadata_ratio = old_metadata_ratio;
     return ret;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 36422254ef67..1791c6e3d834 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -28,6 +28,7 @@
 #include "locking.h"
 #include "tree-log.h"
 #include "inode-map.h"
+#include "volumes.h"
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
@@ -55,48 +56,49 @@ static noinline void switch_commit_root(struct btrfs_root *root)
 static noinline int join_transaction(struct btrfs_root *root, int nofail)
 {
     struct btrfs_transaction *cur_trans;
+    struct btrfs_fs_info *fs_info = root->fs_info;
 
-    spin_lock(&root->fs_info->trans_lock);
+    spin_lock(&fs_info->trans_lock);
 loop:
     /* The file system has been taken offline. No new transactions. */
-    if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
-        spin_unlock(&root->fs_info->trans_lock);
+    if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+        spin_unlock(&fs_info->trans_lock);
         return -EROFS;
     }
 
-    if (root->fs_info->trans_no_join) {
+    if (fs_info->trans_no_join) {
         if (!nofail) {
-            spin_unlock(&root->fs_info->trans_lock);
+            spin_unlock(&fs_info->trans_lock);
             return -EBUSY;
         }
     }
 
-    cur_trans = root->fs_info->running_transaction;
+    cur_trans = fs_info->running_transaction;
     if (cur_trans) {
         if (cur_trans->aborted) {
-            spin_unlock(&root->fs_info->trans_lock);
+            spin_unlock(&fs_info->trans_lock);
             return cur_trans->aborted;
         }
         atomic_inc(&cur_trans->use_count);
         atomic_inc(&cur_trans->num_writers);
         cur_trans->num_joined++;
-        spin_unlock(&root->fs_info->trans_lock);
+        spin_unlock(&fs_info->trans_lock);
         return 0;
     }
-    spin_unlock(&root->fs_info->trans_lock);
+    spin_unlock(&fs_info->trans_lock);
 
     cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
     if (!cur_trans)
         return -ENOMEM;
 
-    spin_lock(&root->fs_info->trans_lock);
-    if (root->fs_info->running_transaction) {
+    spin_lock(&fs_info->trans_lock);
+    if (fs_info->running_transaction) {
         /*
          * someone started a transaction after we unlocked. Make sure
          * to redo the trans_no_join checks above
          */
         kmem_cache_free(btrfs_transaction_cachep, cur_trans);
-        cur_trans = root->fs_info->running_transaction;
+        cur_trans = fs_info->running_transaction;
         goto loop;
     }
 
@@ -121,20 +123,38 @@ loop:
     cur_trans->delayed_refs.flushing = 0;
     cur_trans->delayed_refs.run_delayed_start = 0;
     cur_trans->delayed_refs.seq = 1;
+
+    /*
+     * although the tree mod log is per file system and not per transaction,
+     * the log must never go across transaction boundaries.
+     */
+    smp_mb();
+    if (!list_empty(&fs_info->tree_mod_seq_list)) {
+        printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
+               "creating a fresh transaction\n");
+        WARN_ON(1);
+    }
+    if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
+        printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
+               "creating a fresh transaction\n");
+        WARN_ON(1);
+    }
+    atomic_set(&fs_info->tree_mod_seq, 0);
+
     init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
     spin_lock_init(&cur_trans->commit_lock);
     spin_lock_init(&cur_trans->delayed_refs.lock);
     INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
 
     INIT_LIST_HEAD(&cur_trans->pending_snapshots);
-    list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+    list_add_tail(&cur_trans->list, &fs_info->trans_list);
     extent_io_tree_init(&cur_trans->dirty_pages,
-                        root->fs_info->btree_inode->i_mapping);
-    root->fs_info->generation++;
-    cur_trans->transid = root->fs_info->generation;
-    root->fs_info->running_transaction = cur_trans;
+                        fs_info->btree_inode->i_mapping);
+    fs_info->generation++;
+    cur_trans->transid = fs_info->generation;
+    fs_info->running_transaction = cur_trans;
     cur_trans->aborted = 0;
-    spin_unlock(&root->fs_info->trans_lock);
+    spin_unlock(&fs_info->trans_lock);
 
     return 0;
 }
@@ -758,6 +778,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
     if (ret)
         return ret;
 
+    ret = btrfs_run_dev_stats(trans, root->fs_info);
+    BUG_ON(ret);
+
     while (!list_empty(&fs_info->dirty_cowonly_roots)) {
         next = fs_info->dirty_cowonly_roots.next;
         list_del_init(next);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index eb1ae908582c..2017d0ff511c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1628,7 +1628,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
     int i;
     int ret;
 
-    btrfs_read_buffer(eb, gen);
+    ret = btrfs_read_buffer(eb, gen);
+    if (ret)
+        return ret;
 
     level = btrfs_header_level(eb);
 
@@ -1749,7 +1751,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
     path->slots[*level]++;
     if (wc->free) {
-        btrfs_read_buffer(next, ptr_gen);
+        ret = btrfs_read_buffer(next, ptr_gen);
+        if (ret) {
+            free_extent_buffer(next);
+            return ret;
+        }
 
         btrfs_tree_lock(next);
         btrfs_set_lock_blocking(next);
@@ -1766,7 +1772,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
             free_extent_buffer(next);
             continue;
         }
-        btrfs_read_buffer(next, ptr_gen);
+        ret = btrfs_read_buffer(next, ptr_gen);
+        if (ret) {
+            free_extent_buffer(next);
+            return ret;
+        }
 
         WARN_ON(*level <= 0);
         if (path->nodes[*level-1])
@@ -2657,6 +2667,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
         btrfs_release_path(path);
     }
     btrfs_release_path(path);
+    if (ret > 0)
+        ret = 0;
     return ret;
 }
 
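The two added lines fix a return-code convention mismatch: btrfs_search_slot() returns a positive value for "key not found", which is success from drop_objectid_items()'s point of view, but callers treat any nonzero return as failure. A standalone sketch of that normalization rule (search() and drop_items() are stand-ins, not real btrfs functions):

#include <stdio.h>

/* 0 = found, 1 = not found, < 0 = error */
static int search(int key)
{
    return key == 42 ? 0 : 1;
}

static int drop_items(int key)
{
    int ret = search(key);
    /* "not found" means there is nothing left to drop: success */
    if (ret > 0)
        ret = 0;
    return ret;
}

int main(void)
{
    printf("%d %d\n", drop_items(42), drop_items(7)); /* 0 0 */
    return 0;
}
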
@@ -3028,21 +3040,6 @@ out:
     return ret;
 }
 
-static int inode_in_log(struct btrfs_trans_handle *trans,
-                        struct inode *inode)
-{
-    struct btrfs_root *root = BTRFS_I(inode)->root;
-    int ret = 0;
-
-    mutex_lock(&root->log_mutex);
-    if (BTRFS_I(inode)->logged_trans == trans->transid &&
-        BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
-        ret = 1;
-    mutex_unlock(&root->log_mutex);
-    return ret;
-}
-
-
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log. A minimal inode and backref
@@ -3083,7 +3080,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
     if (ret)
         goto end_no_trans;
 
-    if (inode_in_log(trans, inode)) {
+    if (btrfs_inode_in_log(inode, trans->transid)) {
         ret = BTRFS_NO_LOG_SYNC;
         goto end_no_trans;
     }
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 12f5147bd2b1..ab942f46b3dd 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -23,9 +23,9 @@
  *
  * ulist = ulist_alloc();
  * ulist_add(ulist, root);
- * elem = NULL;
+ * ULIST_ITER_INIT(&uiter);
  *
- * while ((elem = ulist_next(ulist, elem)) {
+ * while ((elem = ulist_next(ulist, &uiter)) {
  *     for (all child nodes n in elem)
  *         ulist_add(ulist, n);
  *     do something useful with the node;
@@ -95,7 +95,7 @@ EXPORT_SYMBOL(ulist_reinit);
  *
  * The allocated ulist will be returned in an initialized state.
  */
-struct ulist *ulist_alloc(unsigned long gfp_mask)
+struct ulist *ulist_alloc(gfp_t gfp_mask)
 {
     struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
 
@@ -144,13 +144,22 @@ EXPORT_SYMBOL(ulist_free);
  * unaltered.
  */
 int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-              unsigned long gfp_mask)
+              gfp_t gfp_mask)
+{
+    return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
+}
+
+int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
+                    unsigned long *old_aux, gfp_t gfp_mask)
 {
     int i;
 
     for (i = 0; i < ulist->nnodes; ++i) {
-        if (ulist->nodes[i].val == val)
+        if (ulist->nodes[i].val == val) {
+            if (old_aux)
+                *old_aux = ulist->nodes[i].aux;
             return 0;
+        }
     }
 
     if (ulist->nnodes >= ulist->nodes_alloced) {
@@ -188,33 +197,26 @@ EXPORT_SYMBOL(ulist_add);
 /**
  * ulist_next - iterate ulist
  * @ulist: ulist to iterate
- * @prev: previously returned element or %NULL to start iteration
+ * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator)
  *
  * Note: locking must be provided by the caller. In case of rwlocks only read
  * locking is needed
  *
- * This function is used to iterate an ulist. The iteration is started with
- * @prev = %NULL. It returns the next element from the ulist or %NULL when the
+ * This function is used to iterate an ulist.
+ * It returns the next element from the ulist or %NULL when the
  * end is reached. No guarantee is made with respect to the order in which
  * the elements are returned. They might neither be returned in order of
  * addition nor in ascending order.
  * It is allowed to call ulist_add during an enumeration. Newly added items
  * are guaranteed to show up in the running enumeration.
  */
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
 {
-    int next;
-
     if (ulist->nnodes == 0)
         return NULL;
-
-    if (!prev)
-        return &ulist->nodes[0];
-
-    next = (prev - ulist->nodes) + 1;
-    if (next < 0 || next >= ulist->nnodes)
+    if (uiter->i < 0 || uiter->i >= ulist->nnodes)
         return NULL;
 
-    return &ulist->nodes[next];
+    return &ulist->nodes[uiter->i++];
 }
 EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 2e25dec58ec0..21bdc8ec8130 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -24,6 +24,10 @@
  */
 #define ULIST_SIZE 16
 
+struct ulist_iterator {
+    int i;
+};
+
 /*
  * element of the list
  */
@@ -59,10 +63,15 @@ struct ulist {
 void ulist_init(struct ulist *ulist);
 void ulist_fini(struct ulist *ulist);
 void ulist_reinit(struct ulist *ulist);
-struct ulist *ulist_alloc(unsigned long gfp_mask);
+struct ulist *ulist_alloc(gfp_t gfp_mask);
 void ulist_free(struct ulist *ulist);
 int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-              unsigned long gfp_mask);
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
+              gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
+                    unsigned long *old_aux, gfp_t gfp_mask);
+struct ulist_node *ulist_next(struct ulist *ulist,
+                              struct ulist_iterator *uiter);
+
+#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0)
 
 #endif
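
The iterator change replaces a node-pointer cursor with a plain index, which stays valid even if ulist_add() grows (and reallocates) the nodes array mid-iteration, where the old element pointer would dangle. A self-contained userspace miniature of the new API, with the btrfs types re-declared here in simplified form for illustration only:

#include <stdio.h>

#define ULIST_SIZE 16

struct ulist_iterator { int i; };
struct ulist_node { unsigned long long val; unsigned long aux; };
struct ulist {
    int nnodes;
    struct ulist_node nodes[ULIST_SIZE];
};

#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0)

/* Same bounds logic as the kernel version: return the node at the
 * current index and advance, or NULL when the end is reached. */
static struct ulist_node *ulist_next(struct ulist *ulist,
                                     struct ulist_iterator *uiter)
{
    if (ulist->nnodes == 0)
        return NULL;
    if (uiter->i < 0 || uiter->i >= ulist->nnodes)
        return NULL;
    return &ulist->nodes[uiter->i++];
}

int main(void)
{
    struct ulist ul = { .nnodes = 3,
                        .nodes = { { 1, 0 }, { 2, 0 }, { 3, 0 } } };
    struct ulist_iterator uiter;
    struct ulist_node *node;

    ULIST_ITER_INIT(&uiter);
    while ((node = ulist_next(&ul, &uiter)))
        printf("%llu\n", node->val);  /* 1 2 3 */
    return 0;
}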
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1411b99555a4..7782020996fe 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
 #include <linux/random.h>
 #include <linux/iocontext.h>
 #include <linux/capability.h>
+#include <linux/ratelimit.h>
 #include <linux/kthread.h>
 #include <asm/div64.h>
 #include "compat.h"
@@ -39,6 +40,8 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
@@ -361,6 +364,7 @@ static noinline int device_list_add(const char *path,
         return -ENOMEM;
     }
     device->devid = devid;
+    device->dev_stats_valid = 0;
     device->work.func = pending_bios_fn;
     memcpy(device->uuid, disk_super->dev_item.uuid,
            BTRFS_UUID_SIZE);
@@ -1633,7 +1637,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
     int ret = 0;
 
     if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
-        return -EINVAL;
+        return -EROFS;
 
     bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
                               root->fs_info->bdev_holder);
@@ -4001,13 +4005,58 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
     return 0;
 }
 
+static void *merge_stripe_index_into_bio_private(void *bi_private,
+                                                 unsigned int stripe_index)
+{
+    /*
+     * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
+     * at most 1.
+     * The alternative solution (instead of stealing bits from the
+     * pointer) would be to allocate an intermediate structure
+     * that contains the old private pointer plus the stripe_index.
+     */
+    BUG_ON((((uintptr_t)bi_private) & 3) != 0);
+    BUG_ON(stripe_index > 3);
+    return (void *)(((uintptr_t)bi_private) | stripe_index);
+}
+
+static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
+{
+    return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
+}
+
+static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
+{
+    return (unsigned int)((uintptr_t)bi_private) & 3;
+}
+
 static void btrfs_end_bio(struct bio *bio, int err)
 {
-    struct btrfs_bio *bbio = bio->bi_private;
+    struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
     int is_orig_bio = 0;
 
-    if (err)
+    if (err) {
         atomic_inc(&bbio->error);
+        if (err == -EIO || err == -EREMOTEIO) {
+            unsigned int stripe_index =
+                extract_stripe_index_from_bio_private(
+                    bio->bi_private);
+            struct btrfs_device *dev;
+
+            BUG_ON(stripe_index >= bbio->num_stripes);
+            dev = bbio->stripes[stripe_index].dev;
+            if (bio->bi_rw & WRITE)
+                btrfs_dev_stat_inc(dev,
+                                   BTRFS_DEV_STAT_WRITE_ERRS);
+            else
+                btrfs_dev_stat_inc(dev,
+                                   BTRFS_DEV_STAT_READ_ERRS);
+            if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
+                btrfs_dev_stat_inc(dev,
+                                   BTRFS_DEV_STAT_FLUSH_ERRS);
+            btrfs_dev_stat_print_on_error(dev);
+        }
+    }
 
     if (bio == bbio->orig_bio)
         is_orig_bio = 1;
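
merge_stripe_index_into_bio_private() relies on a classic trick: a kmalloc'ed object is aligned to at least 4 bytes, so the two low-order bits of its address are guaranteed zero and can carry a small integer (here, a stripe index up to 3). A generic userspace sketch of the same packing; the function names are hypothetical:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Pack a 2-bit index into the low bits of an aligned pointer. */
static void *pack_index(void *ptr, unsigned int idx)
{
    assert(((uintptr_t)ptr & 3) == 0); /* requires 4-byte alignment */
    assert(idx <= 3);
    return (void *)((uintptr_t)ptr | idx);
}

static void *unpack_ptr(void *packed)
{
    return (void *)((uintptr_t)packed & ~(uintptr_t)3);
}

static unsigned int unpack_index(void *packed)
{
    return (unsigned int)((uintptr_t)packed & 3);
}

int main(void)
{
    int *obj = malloc(sizeof(*obj)); /* malloc is suitably aligned */
    void *packed = pack_index(obj, 2);
    printf("idx=%u same=%d\n", unpack_index(packed),
           unpack_ptr(packed) == (void *)obj); /* idx=2 same=1 */
    free(obj);
    return 0;
}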
@@ -4149,6 +4198,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
         bio = first_bio;
     }
     bio->bi_private = bbio;
+    bio->bi_private = merge_stripe_index_into_bio_private(
+        bio->bi_private, (unsigned int)dev_nr);
     bio->bi_end_io = btrfs_end_bio;
     bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
     dev = bbio->stripes[dev_nr].dev;
@@ -4509,6 +4560,28 @@ int btrfs_read_sys_array(struct btrfs_root *root)
     return ret;
 }
 
+struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
+                                                   u64 logical, int mirror_num)
+{
+    struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+    int ret;
+    u64 map_length = 0;
+    struct btrfs_bio *bbio = NULL;
+    struct btrfs_device *device;
+
+    BUG_ON(mirror_num == 0);
+    ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
+                          mirror_num);
+    if (ret) {
+        BUG_ON(bbio != NULL);
+        return NULL;
+    }
+    BUG_ON(mirror_num != bbio->mirror_num);
+    device = bbio->stripes[mirror_num - 1].dev;
+    kfree(bbio);
+    return device;
+}
+
 int btrfs_read_chunk_tree(struct btrfs_root *root)
 {
     struct btrfs_path *path;
@@ -4583,3 +4656,230 @@ error:
     btrfs_free_path(path);
     return ret;
 }
+
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
+{
+    int i;
+
+    for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+        btrfs_dev_stat_reset(dev, i);
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+    struct btrfs_key key;
+    struct btrfs_key found_key;
+    struct btrfs_root *dev_root = fs_info->dev_root;
+    struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+    struct extent_buffer *eb;
+    int slot;
+    int ret = 0;
+    struct btrfs_device *device;
+    struct btrfs_path *path = NULL;
+    int i;
+
+    path = btrfs_alloc_path();
+    if (!path) {
+        ret = -ENOMEM;
+        goto out;
+    }
+
+    mutex_lock(&fs_devices->device_list_mutex);
+    list_for_each_entry(device, &fs_devices->devices, dev_list) {
+        int item_size;
+        struct btrfs_dev_stats_item *ptr;
+
+        key.objectid = 0;
+        key.type = BTRFS_DEV_STATS_KEY;
+        key.offset = device->devid;
+        ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+        if (ret) {
+            printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
+                   device->name, (unsigned long long)device->devid);
+            __btrfs_reset_dev_stats(device);
+            device->dev_stats_valid = 1;
+            btrfs_release_path(path);
+            continue;
+        }
+        slot = path->slots[0];
+        eb = path->nodes[0];
+        btrfs_item_key_to_cpu(eb, &found_key, slot);
+        item_size = btrfs_item_size_nr(eb, slot);
+
+        ptr = btrfs_item_ptr(eb, slot,
+                             struct btrfs_dev_stats_item);
+
+        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+            if (item_size >= (1 + i) * sizeof(__le64))
+                btrfs_dev_stat_set(device, i,
+                                   btrfs_dev_stats_value(eb, ptr, i));
+            else
+                btrfs_dev_stat_reset(device, i);
+        }
+
+        device->dev_stats_valid = 1;
+        btrfs_dev_stat_print_on_load(device);
+        btrfs_release_path(path);
+    }
+    mutex_unlock(&fs_devices->device_list_mutex);
+
+out:
+    btrfs_free_path(path);
+    return ret < 0 ? ret : 0;
+}
+
+static int update_dev_stat_item(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *dev_root,
+                                struct btrfs_device *device)
+{
+    struct btrfs_path *path;
+    struct btrfs_key key;
+    struct extent_buffer *eb;
+    struct btrfs_dev_stats_item *ptr;
+    int ret;
+    int i;
+
+    key.objectid = 0;
+    key.type = BTRFS_DEV_STATS_KEY;
+    key.offset = device->devid;
+
+    path = btrfs_alloc_path();
+    BUG_ON(!path);
+    ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+    if (ret < 0) {
+        printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
+               ret, device->name);
+        goto out;
+    }
+
+    if (ret == 0 &&
+        btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+        /* need to delete old one and insert a new one */
+        ret = btrfs_del_item(trans, dev_root, path);
+        if (ret != 0) {
+            printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
+                   device->name, ret);
+            goto out;
+        }
+        ret = 1;
+    }
+
+    if (ret == 1) {
+        /* need to insert a new item */
+        btrfs_release_path(path);
+        ret = btrfs_insert_empty_item(trans, dev_root, path,
+                                      &key, sizeof(*ptr));
+        if (ret < 0) {
+            printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
+                   device->name, ret);
+            goto out;
+        }
+    }
+
+    eb = path->nodes[0];
+    ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
+    for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+        btrfs_set_dev_stats_value(eb, ptr, i,
+                                  btrfs_dev_stat_read(device, i));
+    btrfs_mark_buffer_dirty(eb);
+
+out:
+    btrfs_free_path(path);
+    return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed device stats to disk.
+ */
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+                        struct btrfs_fs_info *fs_info)
+{
+    struct btrfs_root *dev_root = fs_info->dev_root;
+    struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+    struct btrfs_device *device;
+    int ret = 0;
+
+    mutex_lock(&fs_devices->device_list_mutex);
+    list_for_each_entry(device, &fs_devices->devices, dev_list) {
+        if (!device->dev_stats_valid || !device->dev_stats_dirty)
+            continue;
+
+        ret = update_dev_stat_item(trans, dev_root, device);
+        if (!ret)
+            device->dev_stats_dirty = 0;
+    }
+    mutex_unlock(&fs_devices->device_list_mutex);
+
+    return ret;
+}
+
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
+{
+    btrfs_dev_stat_inc(dev, index);
+    btrfs_dev_stat_print_on_error(dev);
+}
+
+void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
+{
+    if (!dev->dev_stats_valid)
+        return;
+    printk_ratelimited(KERN_ERR
+        "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+        dev->name,
+        btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+        btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+        btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+        btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+        btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
+{
+    printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+           dev->name,
+           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+int btrfs_get_dev_stats(struct btrfs_root *root,
+                        struct btrfs_ioctl_get_dev_stats *stats,
+                        int reset_after_read)
+{
+    struct btrfs_device *dev;
+    struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+    int i;
+
+    mutex_lock(&fs_devices->device_list_mutex);
+    dev = btrfs_find_device(root, stats->devid, NULL, NULL);
+    mutex_unlock(&fs_devices->device_list_mutex);
+
+    if (!dev) {
+        printk(KERN_WARNING
+               "btrfs: get dev_stats failed, device not found\n");
+        return -ENODEV;
+    } else if (!dev->dev_stats_valid) {
+        printk(KERN_WARNING
+               "btrfs: get dev_stats failed, not yet valid\n");
+        return -ENODEV;
+    } else if (reset_after_read) {
+        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+            if (stats->nr_items > i)
+                stats->values[i] =
+                    btrfs_dev_stat_read_and_reset(dev, i);
+            else
+                btrfs_dev_stat_reset(dev, i);
+        }
+    } else {
+        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+            if (stats->nr_items > i)
+                stats->values[i] = btrfs_dev_stat_read(dev, i);
+    }
+    if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
+        stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
+    return 0;
+}
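
btrfs_init_dev_stats() is deliberately forward compatible: a stats item written by an older kernel may contain fewer __le64 counters than the kernel now knows about, so only the values that fit in the item are read and the rest are reset. A standalone sketch of that decode rule (simplified types, little-endian host assumed, names hypothetical):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEV_STAT_VALUES_MAX 5

/* Read as many 64-bit counters as the on-disk item holds; zero the
 * remainder, mirroring the item_size check in the kernel loop. */
static void decode_dev_stats(const uint8_t *item, size_t item_size,
                             uint64_t out[DEV_STAT_VALUES_MAX])
{
    for (size_t i = 0; i < DEV_STAT_VALUES_MAX; i++) {
        if (item_size >= (i + 1) * sizeof(uint64_t))
            memcpy(&out[i], item + i * sizeof(uint64_t),
                   sizeof(uint64_t));
        else
            out[i] = 0;              /* counter unknown: reset */
    }
}

int main(void)
{
    /* An "old" item with only two counters: 3 write errs, 1 read err. */
    uint8_t item[2 * sizeof(uint64_t)] = { 3, 0, 0, 0, 0, 0, 0, 0,
                                           1, 0, 0, 0, 0, 0, 0, 0 };
    uint64_t stats[DEV_STAT_VALUES_MAX];
    decode_dev_stats(item, sizeof(item), stats);
    for (int i = 0; i < DEV_STAT_VALUES_MAX; i++)
        printf("%llu ", (unsigned long long)stats[i]); /* 3 1 0 0 0 */
    printf("\n");
    return 0;
}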
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bb6b03f97aaa..3406a88ca83e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -22,6 +22,7 @@
 #include <linux/bio.h>
 #include <linux/sort.h>
 #include "async-thread.h"
+#include "ioctl.h"
 
 #define BTRFS_STRIPE_LEN (64 * 1024)
 
@@ -106,6 +107,11 @@ struct btrfs_device {
     struct completion flush_wait;
     int nobarriers;
 
+    /* disk I/O failure stats. For detailed description refer to
+     * enum btrfs_dev_stat_values in ioctl.h */
+    int dev_stats_valid;
+    int dev_stats_dirty; /* counters need to be written to disk */
+    atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
 };
 
 struct btrfs_fs_devices {
@@ -281,4 +287,50 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                          u64 *start, u64 *max_avail);
+struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
+                                                   u64 logical, int mirror_num);
+void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
+int btrfs_get_dev_stats(struct btrfs_root *root,
+                        struct btrfs_ioctl_get_dev_stats *stats,
+                        int reset_after_read);
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+                        struct btrfs_fs_info *fs_info);
+
+static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
+                                      int index)
+{
+    atomic_inc(dev->dev_stat_values + index);
+    dev->dev_stats_dirty = 1;
+}
+
+static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
+                                      int index)
+{
+    return atomic_read(dev->dev_stat_values + index);
+}
+
+static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
+                                                int index)
+{
+    int ret;
+
+    ret = atomic_xchg(dev->dev_stat_values + index, 0);
+    dev->dev_stats_dirty = 1;
+    return ret;
+}
+
+static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
+                                      int index, unsigned long val)
+{
+    atomic_set(dev->dev_stat_values + index, val);
+    dev->dev_stats_dirty = 1;
+}
+
+static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
+                                        int index)
+{
+    btrfs_dev_stat_set(dev, index, 0);
+}
 #endif
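
These inline helpers pair every counter update with dev_stats_dirty = 1, so the commit path knows which devices need their stats item rewritten. A userspace analogue using C11 atomics, with atomic_exchange() standing in for the kernel's atomic_xchg(); the struct and function names are hypothetical:

#include <stdatomic.h>
#include <stdio.h>

#define DEV_STAT_VALUES_MAX 5

struct device_stats {
    atomic_int values[DEV_STAT_VALUES_MAX];
    int dirty; /* set when counters diverge from what is on disk */
};

static void dev_stat_inc(struct device_stats *s, int index)
{
    atomic_fetch_add(&s->values[index], 1);
    s->dirty = 1;
}

/* Atomically fetch the counter and zero it, as the ioctl's
 * reset_after_read path does. */
static int dev_stat_read_and_reset(struct device_stats *s, int index)
{
    int ret = atomic_exchange(&s->values[index], 0);
    s->dirty = 1;
    return ret;
}

int main(void)
{
    struct device_stats s = { 0 };
    dev_stat_inc(&s, 0);
    dev_stat_inc(&s, 0);
    printf("%d\n", dev_stat_read_and_reset(&s, 0)); /* 2 */
    printf("%d\n", atomic_load(&s.values[0]));      /* 0 */
    return 0;
}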
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index e7a5659087e6..3f4e2d69e83a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -196,6 +196,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
     if (ret)
         goto out;
 
+    inode_inc_iversion(inode);
     inode->i_ctime = CURRENT_TIME;
     ret = btrfs_update_inode(trans, root, inode);
     BUG_ON(ret);