aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2014-09-24 13:00:21 -0400
committerTejun Heo <tj@kernel.org>2014-09-24 13:00:21 -0400
commitd06efebf0c37d438fcf07057be00dd40fcfce08d (patch)
tree31a0786d132aadf4cbb9725f3f444ef6e1052128 /fs/btrfs
parentbb2e226b3bef596dd56be97df655d857b4603923 (diff)
parent0a30288da1aec914e158c2d7a3482a85f632750f (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block into for-3.18
This is to receive 0a30288da1ae ("blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe") which implements __percpu_ref_kill_expedited() to work around SCSI blk-mq stall. The commit will be reverted and patches to implement a proper fix will be added. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de>
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/async-thread.c44
-rw-r--r--fs/btrfs/async-thread.h28
-rw-r--r--fs/btrfs/backref.c14
-rw-r--r--fs/btrfs/btrfs_inode.h19
-rw-r--r--fs/btrfs/ctree.c20
-rw-r--r--fs/btrfs/ctree.h4
-rw-r--r--fs/btrfs/delayed-inode.c4
-rw-r--r--fs/btrfs/disk-io.c88
-rw-r--r--fs/btrfs/extent-tree.c308
-rw-r--r--fs/btrfs/extent_io.c5
-rw-r--r--fs/btrfs/file-item.c2
-rw-r--r--fs/btrfs/file.c31
-rw-r--r--fs/btrfs/inode.c359
-rw-r--r--fs/btrfs/ioctl.c68
-rw-r--r--fs/btrfs/ordered-data.c124
-rw-r--r--fs/btrfs/ordered-data.h5
-rw-r--r--fs/btrfs/qgroup.c170
-rw-r--r--fs/btrfs/qgroup.h1
-rw-r--r--fs/btrfs/raid56.c9
-rw-r--r--fs/btrfs/reada.c3
-rw-r--r--fs/btrfs/scrub.c25
-rw-r--r--fs/btrfs/super.c51
-rw-r--r--fs/btrfs/sysfs.c2
-rw-r--r--fs/btrfs/transaction.c33
-rw-r--r--fs/btrfs/transaction.h1
-rw-r--r--fs/btrfs/tree-log.c80
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/ulist.h15
-rw-r--r--fs/btrfs/volumes.c64
29 files changed, 1046 insertions, 533 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5a201d81049c..fbd76ded9a34 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -22,7 +22,6 @@
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/spinlock.h> 23#include <linux/spinlock.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/workqueue.h>
26#include "async-thread.h" 25#include "async-thread.h"
27#include "ctree.h" 26#include "ctree.h"
28 27
@@ -55,8 +54,39 @@ struct btrfs_workqueue {
55 struct __btrfs_workqueue *high; 54 struct __btrfs_workqueue *high;
56}; 55};
57 56
58static inline struct __btrfs_workqueue 57static void normal_work_helper(struct btrfs_work *work);
59*__btrfs_alloc_workqueue(const char *name, int flags, int max_active, 58
59#define BTRFS_WORK_HELPER(name) \
60void btrfs_##name(struct work_struct *arg) \
61{ \
62 struct btrfs_work *work = container_of(arg, struct btrfs_work, \
63 normal_work); \
64 normal_work_helper(work); \
65}
66
67BTRFS_WORK_HELPER(worker_helper);
68BTRFS_WORK_HELPER(delalloc_helper);
69BTRFS_WORK_HELPER(flush_delalloc_helper);
70BTRFS_WORK_HELPER(cache_helper);
71BTRFS_WORK_HELPER(submit_helper);
72BTRFS_WORK_HELPER(fixup_helper);
73BTRFS_WORK_HELPER(endio_helper);
74BTRFS_WORK_HELPER(endio_meta_helper);
75BTRFS_WORK_HELPER(endio_meta_write_helper);
76BTRFS_WORK_HELPER(endio_raid56_helper);
77BTRFS_WORK_HELPER(rmw_helper);
78BTRFS_WORK_HELPER(endio_write_helper);
79BTRFS_WORK_HELPER(freespace_write_helper);
80BTRFS_WORK_HELPER(delayed_meta_helper);
81BTRFS_WORK_HELPER(readahead_helper);
82BTRFS_WORK_HELPER(qgroup_rescan_helper);
83BTRFS_WORK_HELPER(extent_refs_helper);
84BTRFS_WORK_HELPER(scrub_helper);
85BTRFS_WORK_HELPER(scrubwrc_helper);
86BTRFS_WORK_HELPER(scrubnc_helper);
87
88static struct __btrfs_workqueue *
89__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
60 int thresh) 90 int thresh)
61{ 91{
62 struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); 92 struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -232,13 +262,11 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
232 spin_unlock_irqrestore(lock, flags); 262 spin_unlock_irqrestore(lock, flags);
233} 263}
234 264
235static void normal_work_helper(struct work_struct *arg) 265static void normal_work_helper(struct btrfs_work *work)
236{ 266{
237 struct btrfs_work *work;
238 struct __btrfs_workqueue *wq; 267 struct __btrfs_workqueue *wq;
239 int need_order = 0; 268 int need_order = 0;
240 269
241 work = container_of(arg, struct btrfs_work, normal_work);
242 /* 270 /*
243 * We should not touch things inside work in the following cases: 271 * We should not touch things inside work in the following cases:
244 * 1) after work->func() if it has no ordered_free 272 * 1) after work->func() if it has no ordered_free
@@ -262,7 +290,7 @@ static void normal_work_helper(struct work_struct *arg)
262 trace_btrfs_all_work_done(work); 290 trace_btrfs_all_work_done(work);
263} 291}
264 292
265void btrfs_init_work(struct btrfs_work *work, 293void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
266 btrfs_func_t func, 294 btrfs_func_t func,
267 btrfs_func_t ordered_func, 295 btrfs_func_t ordered_func,
268 btrfs_func_t ordered_free) 296 btrfs_func_t ordered_free)
@@ -270,7 +298,7 @@ void btrfs_init_work(struct btrfs_work *work,
270 work->func = func; 298 work->func = func;
271 work->ordered_func = ordered_func; 299 work->ordered_func = ordered_func;
272 work->ordered_free = ordered_free; 300 work->ordered_free = ordered_free;
273 INIT_WORK(&work->normal_work, normal_work_helper); 301 INIT_WORK(&work->normal_work, uniq_func);
274 INIT_LIST_HEAD(&work->ordered_list); 302 INIT_LIST_HEAD(&work->ordered_list);
275 work->flags = 0; 303 work->flags = 0;
276} 304}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 9c6b66d15fb0..e9e31c94758f 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -19,12 +19,14 @@
19 19
20#ifndef __BTRFS_ASYNC_THREAD_ 20#ifndef __BTRFS_ASYNC_THREAD_
21#define __BTRFS_ASYNC_THREAD_ 21#define __BTRFS_ASYNC_THREAD_
22#include <linux/workqueue.h>
22 23
23struct btrfs_workqueue; 24struct btrfs_workqueue;
24/* Internal use only */ 25/* Internal use only */
25struct __btrfs_workqueue; 26struct __btrfs_workqueue;
26struct btrfs_work; 27struct btrfs_work;
27typedef void (*btrfs_func_t)(struct btrfs_work *arg); 28typedef void (*btrfs_func_t)(struct btrfs_work *arg);
29typedef void (*btrfs_work_func_t)(struct work_struct *arg);
28 30
29struct btrfs_work { 31struct btrfs_work {
30 btrfs_func_t func; 32 btrfs_func_t func;
@@ -38,11 +40,35 @@ struct btrfs_work {
38 unsigned long flags; 40 unsigned long flags;
39}; 41};
40 42
43#define BTRFS_WORK_HELPER_PROTO(name) \
44void btrfs_##name(struct work_struct *arg)
45
46BTRFS_WORK_HELPER_PROTO(worker_helper);
47BTRFS_WORK_HELPER_PROTO(delalloc_helper);
48BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper);
49BTRFS_WORK_HELPER_PROTO(cache_helper);
50BTRFS_WORK_HELPER_PROTO(submit_helper);
51BTRFS_WORK_HELPER_PROTO(fixup_helper);
52BTRFS_WORK_HELPER_PROTO(endio_helper);
53BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
54BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
55BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
56BTRFS_WORK_HELPER_PROTO(rmw_helper);
57BTRFS_WORK_HELPER_PROTO(endio_write_helper);
58BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
59BTRFS_WORK_HELPER_PROTO(delayed_meta_helper);
60BTRFS_WORK_HELPER_PROTO(readahead_helper);
61BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper);
62BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
63BTRFS_WORK_HELPER_PROTO(scrub_helper);
64BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
65BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
66
41struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, 67struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
42 int flags, 68 int flags,
43 int max_active, 69 int max_active,
44 int thresh); 70 int thresh);
45void btrfs_init_work(struct btrfs_work *work, 71void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
46 btrfs_func_t func, 72 btrfs_func_t func,
47 btrfs_func_t ordered_func, 73 btrfs_func_t ordered_func,
48 btrfs_func_t ordered_free); 74 btrfs_func_t ordered_free);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e25564bfcb46..54a201dac7f9 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -276,9 +276,8 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
276 } 276 }
277 if (ret > 0) 277 if (ret > 0)
278 goto next; 278 goto next;
279 ret = ulist_add_merge(parents, eb->start, 279 ret = ulist_add_merge_ptr(parents, eb->start,
280 (uintptr_t)eie, 280 eie, (void **)&old, GFP_NOFS);
281 (u64 *)&old, GFP_NOFS);
282 if (ret < 0) 281 if (ret < 0)
283 break; 282 break;
284 if (!ret && extent_item_pos) { 283 if (!ret && extent_item_pos) {
@@ -1001,16 +1000,19 @@ again:
1001 ret = -EIO; 1000 ret = -EIO;
1002 goto out; 1001 goto out;
1003 } 1002 }
1003 btrfs_tree_read_lock(eb);
1004 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1004 ret = find_extent_in_eb(eb, bytenr, 1005 ret = find_extent_in_eb(eb, bytenr,
1005 *extent_item_pos, &eie); 1006 *extent_item_pos, &eie);
1007 btrfs_tree_read_unlock_blocking(eb);
1006 free_extent_buffer(eb); 1008 free_extent_buffer(eb);
1007 if (ret < 0) 1009 if (ret < 0)
1008 goto out; 1010 goto out;
1009 ref->inode_list = eie; 1011 ref->inode_list = eie;
1010 } 1012 }
1011 ret = ulist_add_merge(refs, ref->parent, 1013 ret = ulist_add_merge_ptr(refs, ref->parent,
1012 (uintptr_t)ref->inode_list, 1014 ref->inode_list,
1013 (u64 *)&eie, GFP_NOFS); 1015 (void **)&eie, GFP_NOFS);
1014 if (ret < 0) 1016 if (ret < 0)
1015 goto out; 1017 goto out;
1016 if (!ret && extent_item_pos) { 1018 if (!ret && extent_item_pos) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4794923c410c..56b8522d5767 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -84,12 +84,6 @@ struct btrfs_inode {
84 */ 84 */
85 struct list_head delalloc_inodes; 85 struct list_head delalloc_inodes;
86 86
87 /*
88 * list for tracking inodes that must be sent to disk before a
89 * rename or truncate commit
90 */
91 struct list_head ordered_operations;
92
93 /* node for the red-black tree that links inodes in subvolume root */ 87 /* node for the red-black tree that links inodes in subvolume root */
94 struct rb_node rb_node; 88 struct rb_node rb_node;
95 89
@@ -240,8 +234,17 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
240 BTRFS_I(inode)->last_sub_trans <= 234 BTRFS_I(inode)->last_sub_trans <=
241 BTRFS_I(inode)->last_log_commit && 235 BTRFS_I(inode)->last_log_commit &&
242 BTRFS_I(inode)->last_sub_trans <= 236 BTRFS_I(inode)->last_sub_trans <=
243 BTRFS_I(inode)->root->last_log_commit) 237 BTRFS_I(inode)->root->last_log_commit) {
244 return 1; 238 /*
239 * After a ranged fsync we might have left some extent maps
240 * (that fall outside the fsync's range). So return false
241 * here if the list isn't empty, to make sure btrfs_log_inode()
242 * will be called and process those extent maps.
243 */
244 smp_mb();
245 if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
246 return 1;
247 }
245 return 0; 248 return 0;
246} 249}
247 250
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index aeab453b8e24..44ee5d2e52a4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,9 +280,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
280 280
281 WARN_ON(btrfs_header_generation(buf) > trans->transid); 281 WARN_ON(btrfs_header_generation(buf) > trans->transid);
282 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 282 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
283 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 283 ret = btrfs_inc_ref(trans, root, cow, 1);
284 else 284 else
285 ret = btrfs_inc_ref(trans, root, cow, 0, 1); 285 ret = btrfs_inc_ref(trans, root, cow, 0);
286 286
287 if (ret) 287 if (ret)
288 return ret; 288 return ret;
@@ -1035,14 +1035,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1035 if ((owner == root->root_key.objectid || 1035 if ((owner == root->root_key.objectid ||
1036 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && 1036 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
1037 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 1037 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
1038 ret = btrfs_inc_ref(trans, root, buf, 1, 1); 1038 ret = btrfs_inc_ref(trans, root, buf, 1);
1039 BUG_ON(ret); /* -ENOMEM */ 1039 BUG_ON(ret); /* -ENOMEM */
1040 1040
1041 if (root->root_key.objectid == 1041 if (root->root_key.objectid ==
1042 BTRFS_TREE_RELOC_OBJECTID) { 1042 BTRFS_TREE_RELOC_OBJECTID) {
1043 ret = btrfs_dec_ref(trans, root, buf, 0, 1); 1043 ret = btrfs_dec_ref(trans, root, buf, 0);
1044 BUG_ON(ret); /* -ENOMEM */ 1044 BUG_ON(ret); /* -ENOMEM */
1045 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 1045 ret = btrfs_inc_ref(trans, root, cow, 1);
1046 BUG_ON(ret); /* -ENOMEM */ 1046 BUG_ON(ret); /* -ENOMEM */
1047 } 1047 }
1048 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 1048 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -1050,9 +1050,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1050 1050
1051 if (root->root_key.objectid == 1051 if (root->root_key.objectid ==
1052 BTRFS_TREE_RELOC_OBJECTID) 1052 BTRFS_TREE_RELOC_OBJECTID)
1053 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 1053 ret = btrfs_inc_ref(trans, root, cow, 1);
1054 else 1054 else
1055 ret = btrfs_inc_ref(trans, root, cow, 0, 1); 1055 ret = btrfs_inc_ref(trans, root, cow, 0);
1056 BUG_ON(ret); /* -ENOMEM */ 1056 BUG_ON(ret); /* -ENOMEM */
1057 } 1057 }
1058 if (new_flags != 0) { 1058 if (new_flags != 0) {
@@ -1069,11 +1069,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1069 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 1069 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
1070 if (root->root_key.objectid == 1070 if (root->root_key.objectid ==
1071 BTRFS_TREE_RELOC_OBJECTID) 1071 BTRFS_TREE_RELOC_OBJECTID)
1072 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 1072 ret = btrfs_inc_ref(trans, root, cow, 1);
1073 else 1073 else
1074 ret = btrfs_inc_ref(trans, root, cow, 0, 1); 1074 ret = btrfs_inc_ref(trans, root, cow, 0);
1075 BUG_ON(ret); /* -ENOMEM */ 1075 BUG_ON(ret); /* -ENOMEM */
1076 ret = btrfs_dec_ref(trans, root, buf, 1, 1); 1076 ret = btrfs_dec_ref(trans, root, buf, 1);
1077 BUG_ON(ret); /* -ENOMEM */ 1077 BUG_ON(ret); /* -ENOMEM */
1078 } 1078 }
1079 clean_tree_block(trans, root, buf); 1079 clean_tree_block(trans, root, buf);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index be91397f4e92..8e29b614fe93 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3326,9 +3326,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
3326 u64 min_alloc_size, u64 empty_size, u64 hint_byte, 3326 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
3327 struct btrfs_key *ins, int is_data, int delalloc); 3327 struct btrfs_key *ins, int is_data, int delalloc);
3328int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3328int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3329 struct extent_buffer *buf, int full_backref, int no_quota); 3329 struct extent_buffer *buf, int full_backref);
3330int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3330int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3331 struct extent_buffer *buf, int full_backref, int no_quota); 3331 struct extent_buffer *buf, int full_backref);
3332int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 3332int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3333 struct btrfs_root *root, 3333 struct btrfs_root *root,
3334 u64 bytenr, u64 num_bytes, u64 flags, 3334 u64 bytenr, u64 num_bytes, u64 flags,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index da775bfdebc9..a2e90f855d7d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1395,8 +1395,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1395 return -ENOMEM; 1395 return -ENOMEM;
1396 1396
1397 async_work->delayed_root = delayed_root; 1397 async_work->delayed_root = delayed_root;
1398 btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, 1398 btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper,
1399 NULL, NULL); 1399 btrfs_async_run_delayed_root, NULL, NULL);
1400 async_work->nr = nr; 1400 async_work->nr = nr;
1401 1401
1402 btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); 1402 btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 61dae01788d7..d0d78dc07792 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -39,7 +39,6 @@
39#include "btrfs_inode.h" 39#include "btrfs_inode.h"
40#include "volumes.h" 40#include "volumes.h"
41#include "print-tree.h" 41#include "print-tree.h"
42#include "async-thread.h"
43#include "locking.h" 42#include "locking.h"
44#include "tree-log.h" 43#include "tree-log.h"
45#include "free-space-cache.h" 44#include "free-space-cache.h"
@@ -60,8 +59,6 @@ static void end_workqueue_fn(struct btrfs_work *work);
60static void free_fs_root(struct btrfs_root *root); 59static void free_fs_root(struct btrfs_root *root);
61static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, 60static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
62 int read_only); 61 int read_only);
63static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
64 struct btrfs_root *root);
65static void btrfs_destroy_ordered_extents(struct btrfs_root *root); 62static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
66static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 63static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
67 struct btrfs_root *root); 64 struct btrfs_root *root);
@@ -695,35 +692,41 @@ static void end_workqueue_bio(struct bio *bio, int err)
695{ 692{
696 struct end_io_wq *end_io_wq = bio->bi_private; 693 struct end_io_wq *end_io_wq = bio->bi_private;
697 struct btrfs_fs_info *fs_info; 694 struct btrfs_fs_info *fs_info;
695 struct btrfs_workqueue *wq;
696 btrfs_work_func_t func;
698 697
699 fs_info = end_io_wq->info; 698 fs_info = end_io_wq->info;
700 end_io_wq->error = err; 699 end_io_wq->error = err;
701 btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
702 700
703 if (bio->bi_rw & REQ_WRITE) { 701 if (bio->bi_rw & REQ_WRITE) {
704 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) 702 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
705 btrfs_queue_work(fs_info->endio_meta_write_workers, 703 wq = fs_info->endio_meta_write_workers;
706 &end_io_wq->work); 704 func = btrfs_endio_meta_write_helper;
707 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) 705 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
708 btrfs_queue_work(fs_info->endio_freespace_worker, 706 wq = fs_info->endio_freespace_worker;
709 &end_io_wq->work); 707 func = btrfs_freespace_write_helper;
710 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 708 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
711 btrfs_queue_work(fs_info->endio_raid56_workers, 709 wq = fs_info->endio_raid56_workers;
712 &end_io_wq->work); 710 func = btrfs_endio_raid56_helper;
713 else 711 } else {
714 btrfs_queue_work(fs_info->endio_write_workers, 712 wq = fs_info->endio_write_workers;
715 &end_io_wq->work); 713 func = btrfs_endio_write_helper;
714 }
716 } else { 715 } else {
717 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 716 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
718 btrfs_queue_work(fs_info->endio_raid56_workers, 717 wq = fs_info->endio_raid56_workers;
719 &end_io_wq->work); 718 func = btrfs_endio_raid56_helper;
720 else if (end_io_wq->metadata) 719 } else if (end_io_wq->metadata) {
721 btrfs_queue_work(fs_info->endio_meta_workers, 720 wq = fs_info->endio_meta_workers;
722 &end_io_wq->work); 721 func = btrfs_endio_meta_helper;
723 else 722 } else {
724 btrfs_queue_work(fs_info->endio_workers, 723 wq = fs_info->endio_workers;
725 &end_io_wq->work); 724 func = btrfs_endio_helper;
725 }
726 } 726 }
727
728 btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
729 btrfs_queue_work(wq, &end_io_wq->work);
727} 730}
728 731
729/* 732/*
@@ -830,7 +833,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
830 async->submit_bio_start = submit_bio_start; 833 async->submit_bio_start = submit_bio_start;
831 async->submit_bio_done = submit_bio_done; 834 async->submit_bio_done = submit_bio_done;
832 835
833 btrfs_init_work(&async->work, run_one_async_start, 836 btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
834 run_one_async_done, run_one_async_free); 837 run_one_async_done, run_one_async_free);
835 838
836 async->bio_flags = bio_flags; 839 async->bio_flags = bio_flags;
@@ -3452,7 +3455,8 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
3452 btrfs_set_stack_device_generation(dev_item, 0); 3455 btrfs_set_stack_device_generation(dev_item, 0);
3453 btrfs_set_stack_device_type(dev_item, dev->type); 3456 btrfs_set_stack_device_type(dev_item, dev->type);
3454 btrfs_set_stack_device_id(dev_item, dev->devid); 3457 btrfs_set_stack_device_id(dev_item, dev->devid);
3455 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); 3458 btrfs_set_stack_device_total_bytes(dev_item,
3459 dev->disk_total_bytes);
3456 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); 3460 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
3457 btrfs_set_stack_device_io_align(dev_item, dev->io_align); 3461 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
3458 btrfs_set_stack_device_io_width(dev_item, dev->io_width); 3462 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
@@ -3829,34 +3833,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root)
3829 btrfs_cleanup_transaction(root); 3833 btrfs_cleanup_transaction(root);
3830} 3834}
3831 3835
3832static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3833 struct btrfs_root *root)
3834{
3835 struct btrfs_inode *btrfs_inode;
3836 struct list_head splice;
3837
3838 INIT_LIST_HEAD(&splice);
3839
3840 mutex_lock(&root->fs_info->ordered_operations_mutex);
3841 spin_lock(&root->fs_info->ordered_root_lock);
3842
3843 list_splice_init(&t->ordered_operations, &splice);
3844 while (!list_empty(&splice)) {
3845 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
3846 ordered_operations);
3847
3848 list_del_init(&btrfs_inode->ordered_operations);
3849 spin_unlock(&root->fs_info->ordered_root_lock);
3850
3851 btrfs_invalidate_inodes(btrfs_inode->root);
3852
3853 spin_lock(&root->fs_info->ordered_root_lock);
3854 }
3855
3856 spin_unlock(&root->fs_info->ordered_root_lock);
3857 mutex_unlock(&root->fs_info->ordered_operations_mutex);
3858}
3859
3860static void btrfs_destroy_ordered_extents(struct btrfs_root *root) 3836static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3861{ 3837{
3862 struct btrfs_ordered_extent *ordered; 3838 struct btrfs_ordered_extent *ordered;
@@ -4093,8 +4069,6 @@ again:
4093void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, 4069void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4094 struct btrfs_root *root) 4070 struct btrfs_root *root)
4095{ 4071{
4096 btrfs_destroy_ordered_operations(cur_trans, root);
4097
4098 btrfs_destroy_delayed_refs(cur_trans, root); 4072 btrfs_destroy_delayed_refs(cur_trans, root);
4099 4073
4100 cur_trans->state = TRANS_STATE_COMMIT_START; 4074 cur_trans->state = TRANS_STATE_COMMIT_START;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 94ec71eda86b..caaf015d6e4b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -552,7 +552,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
552 caching_ctl->block_group = cache; 552 caching_ctl->block_group = cache;
553 caching_ctl->progress = cache->key.objectid; 553 caching_ctl->progress = cache->key.objectid;
554 atomic_set(&caching_ctl->count, 1); 554 atomic_set(&caching_ctl->count, 1);
555 btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); 555 btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
556 caching_thread, NULL, NULL);
556 557
557 spin_lock(&cache->lock); 558 spin_lock(&cache->lock);
558 /* 559 /*
@@ -2749,8 +2750,8 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2749 async->sync = 0; 2750 async->sync = 0;
2750 init_completion(&async->wait); 2751 init_completion(&async->wait);
2751 2752
2752 btrfs_init_work(&async->work, delayed_ref_async_start, 2753 btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2753 NULL, NULL); 2754 delayed_ref_async_start, NULL, NULL);
2754 2755
2755 btrfs_queue_work(root->fs_info->extent_workers, &async->work); 2756 btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2756 2757
@@ -3057,7 +3058,7 @@ out:
3057static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3058static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3058 struct btrfs_root *root, 3059 struct btrfs_root *root,
3059 struct extent_buffer *buf, 3060 struct extent_buffer *buf,
3060 int full_backref, int inc, int no_quota) 3061 int full_backref, int inc)
3061{ 3062{
3062 u64 bytenr; 3063 u64 bytenr;
3063 u64 num_bytes; 3064 u64 num_bytes;
@@ -3111,7 +3112,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3111 key.offset -= btrfs_file_extent_offset(buf, fi); 3112 key.offset -= btrfs_file_extent_offset(buf, fi);
3112 ret = process_func(trans, root, bytenr, num_bytes, 3113 ret = process_func(trans, root, bytenr, num_bytes,
3113 parent, ref_root, key.objectid, 3114 parent, ref_root, key.objectid,
3114 key.offset, no_quota); 3115 key.offset, 1);
3115 if (ret) 3116 if (ret)
3116 goto fail; 3117 goto fail;
3117 } else { 3118 } else {
@@ -3119,7 +3120,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3119 num_bytes = btrfs_level_size(root, level - 1); 3120 num_bytes = btrfs_level_size(root, level - 1);
3120 ret = process_func(trans, root, bytenr, num_bytes, 3121 ret = process_func(trans, root, bytenr, num_bytes,
3121 parent, ref_root, level - 1, 0, 3122 parent, ref_root, level - 1, 0,
3122 no_quota); 3123 1);
3123 if (ret) 3124 if (ret)
3124 goto fail; 3125 goto fail;
3125 } 3126 }
@@ -3130,15 +3131,15 @@ fail:
3130} 3131}
3131 3132
3132int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3133int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3133 struct extent_buffer *buf, int full_backref, int no_quota) 3134 struct extent_buffer *buf, int full_backref)
3134{ 3135{
3135 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); 3136 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3136} 3137}
3137 3138
3138int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3139int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3139 struct extent_buffer *buf, int full_backref, int no_quota) 3140 struct extent_buffer *buf, int full_backref)
3140{ 3141{
3141 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); 3142 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3142} 3143}
3143 3144
3144static int write_one_cache_group(struct btrfs_trans_handle *trans, 3145static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -3586,13 +3587,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3586 */ 3587 */
3587static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3588static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3588{ 3589{
3589 /* 3590 u64 num_devices = root->fs_info->fs_devices->rw_devices;
3590 * we add in the count of missing devices because we want
3591 * to make sure that any RAID levels on a degraded FS
3592 * continue to be honored.
3593 */
3594 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3595 root->fs_info->fs_devices->missing_devices;
3596 u64 target; 3591 u64 target;
3597 u64 tmp; 3592 u64 tmp;
3598 3593
@@ -7478,6 +7473,220 @@ reada:
7478 wc->reada_slot = slot; 7473 wc->reada_slot = slot;
7479} 7474}
7480 7475
7476static int account_leaf_items(struct btrfs_trans_handle *trans,
7477 struct btrfs_root *root,
7478 struct extent_buffer *eb)
7479{
7480 int nr = btrfs_header_nritems(eb);
7481 int i, extent_type, ret;
7482 struct btrfs_key key;
7483 struct btrfs_file_extent_item *fi;
7484 u64 bytenr, num_bytes;
7485
7486 for (i = 0; i < nr; i++) {
7487 btrfs_item_key_to_cpu(eb, &key, i);
7488
7489 if (key.type != BTRFS_EXTENT_DATA_KEY)
7490 continue;
7491
7492 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
7493 /* filter out non qgroup-accountable extents */
7494 extent_type = btrfs_file_extent_type(eb, fi);
7495
7496 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
7497 continue;
7498
7499 bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
7500 if (!bytenr)
7501 continue;
7502
7503 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
7504
7505 ret = btrfs_qgroup_record_ref(trans, root->fs_info,
7506 root->objectid,
7507 bytenr, num_bytes,
7508 BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
7509 if (ret)
7510 return ret;
7511 }
7512 return 0;
7513}
7514
7515/*
7516 * Walk up the tree from the bottom, freeing leaves and any interior
7517 * nodes which have had all slots visited. If a node (leaf or
7518 * interior) is freed, the node above it will have its slot
7519 * incremented. The root node will never be freed.
7520 *
7521 * At the end of this function, we should have a path which has all
7522 * slots incremented to the next position for a search. If we need to
7523 * read a new node it will be NULL and the node above it will have the
7524 * correct slot selected for a later read.
7525 *
7526 * If we increment the root nodes slot counter past the number of
7527 * elements, 1 is returned to signal completion of the search.
7528 */
7529static int adjust_slots_upwards(struct btrfs_root *root,
7530 struct btrfs_path *path, int root_level)
7531{
7532 int level = 0;
7533 int nr, slot;
7534 struct extent_buffer *eb;
7535
7536 if (root_level == 0)
7537 return 1;
7538
7539 while (level <= root_level) {
7540 eb = path->nodes[level];
7541 nr = btrfs_header_nritems(eb);
7542 path->slots[level]++;
7543 slot = path->slots[level];
7544 if (slot >= nr || level == 0) {
7545 /*
7546 * Don't free the root - we will detect this
7547 * condition after our loop and return a
7548 * positive value for caller to stop walking the tree.
7549 */
7550 if (level != root_level) {
7551 btrfs_tree_unlock_rw(eb, path->locks[level]);
7552 path->locks[level] = 0;
7553
7554 free_extent_buffer(eb);
7555 path->nodes[level] = NULL;
7556 path->slots[level] = 0;
7557 }
7558 } else {
7559 /*
7560 * We have a valid slot to walk back down
7561 * from. Stop here so caller can process these
7562 * new nodes.
7563 */
7564 break;
7565 }
7566
7567 level++;
7568 }
7569
7570 eb = path->nodes[root_level];
7571 if (path->slots[root_level] >= btrfs_header_nritems(eb))
7572 return 1;
7573
7574 return 0;
7575}
7576
7577/*
7578 * root_eb is the subtree root and is locked before this function is called.
7579 */
7580static int account_shared_subtree(struct btrfs_trans_handle *trans,
7581 struct btrfs_root *root,
7582 struct extent_buffer *root_eb,
7583 u64 root_gen,
7584 int root_level)
7585{
7586 int ret = 0;
7587 int level;
7588 struct extent_buffer *eb = root_eb;
7589 struct btrfs_path *path = NULL;
7590
7591 BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
7592 BUG_ON(root_eb == NULL);
7593
7594 if (!root->fs_info->quota_enabled)
7595 return 0;
7596
7597 if (!extent_buffer_uptodate(root_eb)) {
7598 ret = btrfs_read_buffer(root_eb, root_gen);
7599 if (ret)
7600 goto out;
7601 }
7602
7603 if (root_level == 0) {
7604 ret = account_leaf_items(trans, root, root_eb);
7605 goto out;
7606 }
7607
7608 path = btrfs_alloc_path();
7609 if (!path)
7610 return -ENOMEM;
7611
7612 /*
7613 * Walk down the tree. Missing extent blocks are filled in as
7614 * we go. Metadata is accounted every time we read a new
7615 * extent block.
7616 *
7617 * When we reach a leaf, we account for file extent items in it,
7618 * walk back up the tree (adjusting slot pointers as we go)
7619 * and restart the search process.
7620 */
7621 extent_buffer_get(root_eb); /* For path */
7622 path->nodes[root_level] = root_eb;
7623 path->slots[root_level] = 0;
7624 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
7625walk_down:
7626 level = root_level;
7627 while (level >= 0) {
7628 if (path->nodes[level] == NULL) {
7629 int child_bsize = root->nodesize;
7630 int parent_slot;
7631 u64 child_gen;
7632 u64 child_bytenr;
7633
7634 /* We need to get child blockptr/gen from
7635 * parent before we can read it. */
7636 eb = path->nodes[level + 1];
7637 parent_slot = path->slots[level + 1];
7638 child_bytenr = btrfs_node_blockptr(eb, parent_slot);
7639 child_gen = btrfs_node_ptr_generation(eb, parent_slot);
7640
7641 eb = read_tree_block(root, child_bytenr, child_bsize,
7642 child_gen);
7643 if (!eb || !extent_buffer_uptodate(eb)) {
7644 ret = -EIO;
7645 goto out;
7646 }
7647
7648 path->nodes[level] = eb;
7649 path->slots[level] = 0;
7650
7651 btrfs_tree_read_lock(eb);
7652 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
7653 path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
7654
7655 ret = btrfs_qgroup_record_ref(trans, root->fs_info,
7656 root->objectid,
7657 child_bytenr,
7658 child_bsize,
7659 BTRFS_QGROUP_OPER_SUB_SUBTREE,
7660 0);
7661 if (ret)
7662 goto out;
7663
7664 }
7665
7666 if (level == 0) {
7667 ret = account_leaf_items(trans, root, path->nodes[level]);
7668 if (ret)
7669 goto out;
7670
7671 /* Nonzero return here means we completed our search */
7672 ret = adjust_slots_upwards(root, path, root_level);
7673 if (ret)
7674 break;
7675
7676 /* Restart search with new slots */
7677 goto walk_down;
7678 }
7679
7680 level--;
7681 }
7682
7683 ret = 0;
7684out:
7685 btrfs_free_path(path);
7686
7687 return ret;
7688}
7689
7481/* 7690/*
7482 * helper to process tree block while walking down the tree. 7691 * helper to process tree block while walking down the tree.
7483 * 7692 *
@@ -7532,9 +7741,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
7532 /* wc->stage == UPDATE_BACKREF */ 7741 /* wc->stage == UPDATE_BACKREF */
7533 if (!(wc->flags[level] & flag)) { 7742 if (!(wc->flags[level] & flag)) {
7534 BUG_ON(!path->locks[level]); 7743 BUG_ON(!path->locks[level]);
7535 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); 7744 ret = btrfs_inc_ref(trans, root, eb, 1);
7536 BUG_ON(ret); /* -ENOMEM */ 7745 BUG_ON(ret); /* -ENOMEM */
7537 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); 7746 ret = btrfs_dec_ref(trans, root, eb, 0);
7538 BUG_ON(ret); /* -ENOMEM */ 7747 BUG_ON(ret); /* -ENOMEM */
7539 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 7748 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
7540 eb->len, flag, 7749 eb->len, flag,
@@ -7581,6 +7790,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7581 int level = wc->level; 7790 int level = wc->level;
7582 int reada = 0; 7791 int reada = 0;
7583 int ret = 0; 7792 int ret = 0;
7793 bool need_account = false;
7584 7794
7585 generation = btrfs_node_ptr_generation(path->nodes[level], 7795 generation = btrfs_node_ptr_generation(path->nodes[level],
7586 path->slots[level]); 7796 path->slots[level]);
@@ -7626,6 +7836,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7626 7836
7627 if (wc->stage == DROP_REFERENCE) { 7837 if (wc->stage == DROP_REFERENCE) {
7628 if (wc->refs[level - 1] > 1) { 7838 if (wc->refs[level - 1] > 1) {
7839 need_account = true;
7629 if (level == 1 && 7840 if (level == 1 &&
7630 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7841 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7631 goto skip; 7842 goto skip;
@@ -7689,6 +7900,16 @@ skip:
7689 parent = 0; 7900 parent = 0;
7690 } 7901 }
7691 7902
7903 if (need_account) {
7904 ret = account_shared_subtree(trans, root, next,
7905 generation, level - 1);
7906 if (ret) {
7907 printk_ratelimited(KERN_ERR "BTRFS: %s Error "
7908 "%d accounting shared subtree. Quota "
7909 "is out of sync, rescan required.\n",
7910 root->fs_info->sb->s_id, ret);
7911 }
7912 }
7692 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 7913 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
7693 root->root_key.objectid, level - 1, 0, 0); 7914 root->root_key.objectid, level - 1, 0, 0);
7694 BUG_ON(ret); /* -ENOMEM */ 7915 BUG_ON(ret); /* -ENOMEM */
@@ -7769,12 +7990,17 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
7769 if (wc->refs[level] == 1) { 7990 if (wc->refs[level] == 1) {
7770 if (level == 0) { 7991 if (level == 0) {
7771 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7992 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7772 ret = btrfs_dec_ref(trans, root, eb, 1, 7993 ret = btrfs_dec_ref(trans, root, eb, 1);
7773 wc->for_reloc);
7774 else 7994 else
7775 ret = btrfs_dec_ref(trans, root, eb, 0, 7995 ret = btrfs_dec_ref(trans, root, eb, 0);
7776 wc->for_reloc);
7777 BUG_ON(ret); /* -ENOMEM */ 7996 BUG_ON(ret); /* -ENOMEM */
7997 ret = account_leaf_items(trans, root, eb);
7998 if (ret) {
7999 printk_ratelimited(KERN_ERR "BTRFS: %s Error "
8000 "%d accounting leaf items. Quota "
8001 "is out of sync, rescan required.\n",
8002 root->fs_info->sb->s_id, ret);
8003 }
7778 } 8004 }
7779 /* make block locked assertion in clean_tree_block happy */ 8005 /* make block locked assertion in clean_tree_block happy */
7780 if (!path->locks[level] && 8006 if (!path->locks[level] &&
@@ -7900,6 +8126,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7900 int level; 8126 int level;
7901 bool root_dropped = false; 8127 bool root_dropped = false;
7902 8128
8129 btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
8130
7903 path = btrfs_alloc_path(); 8131 path = btrfs_alloc_path();
7904 if (!path) { 8132 if (!path) {
7905 err = -ENOMEM; 8133 err = -ENOMEM;
@@ -8025,6 +8253,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
8025 goto out_end_trans; 8253 goto out_end_trans;
8026 } 8254 }
8027 8255
8256 /*
8257 * Qgroup update accounting is run from
8258 * delayed ref handling. This usually works
8259 * out because delayed refs are normally the
8260 * only way qgroup updates are added. However,
8261 * we may have added updates during our tree
8262 * walk so run qgroups here to make sure we
8263 * don't lose any updates.
8264 */
8265 ret = btrfs_delayed_qgroup_accounting(trans,
8266 root->fs_info);
8267 if (ret)
8268 printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
8269 "running qgroup updates "
8270 "during snapshot delete. "
8271 "Quota is out of sync, "
8272 "rescan required.\n", ret);
8273
8028 btrfs_end_transaction_throttle(trans, tree_root); 8274 btrfs_end_transaction_throttle(trans, tree_root);
8029 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 8275 if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
8030 pr_debug("BTRFS: drop snapshot early exit\n"); 8276 pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8078,6 +8324,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
8078 } 8324 }
8079 root_dropped = true; 8325 root_dropped = true;
8080out_end_trans: 8326out_end_trans:
8327 ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
8328 if (ret)
8329 printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
8330 "running qgroup updates "
8331 "during snapshot delete. "
8332 "Quota is out of sync, "
8333 "rescan required.\n", ret);
8334
8081 btrfs_end_transaction_throttle(trans, tree_root); 8335 btrfs_end_transaction_throttle(trans, tree_root);
8082out_free: 8336out_free:
8083 kfree(wc); 8337 kfree(wc);
@@ -8181,13 +8435,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
8181 if (stripped) 8435 if (stripped)
8182 return extended_to_chunk(stripped); 8436 return extended_to_chunk(stripped);
8183 8437
8184 /* 8438 num_devices = root->fs_info->fs_devices->rw_devices;
8185 * we add in the count of missing devices because we want
8186 * to make sure that any RAID levels on a degraded FS
8187 * continue to be honored.
8188 */
8189 num_devices = root->fs_info->fs_devices->rw_devices +
8190 root->fs_info->fs_devices->missing_devices;
8191 8439
8192 stripped = BTRFS_BLOCK_GROUP_RAID0 | 8440 stripped = BTRFS_BLOCK_GROUP_RAID0 |
8193 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 8441 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3e11aab9f391..af0359dcf337 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2532,6 +2532,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2532 test_bit(BIO_UPTODATE, &bio->bi_flags); 2532 test_bit(BIO_UPTODATE, &bio->bi_flags);
2533 if (err) 2533 if (err)
2534 uptodate = 0; 2534 uptodate = 0;
2535 offset += len;
2535 continue; 2536 continue;
2536 } 2537 }
2537 } 2538 }
@@ -4207,8 +4208,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4207 return -ENOMEM; 4208 return -ENOMEM;
4208 path->leave_spinning = 1; 4209 path->leave_spinning = 1;
4209 4210
4210 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); 4211 start = round_down(start, BTRFS_I(inode)->root->sectorsize);
4211 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); 4212 len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start;
4212 4213
4213 /* 4214 /*
4214 * lookup the last file extent. We're not using i_size here 4215 * lookup the last file extent. We're not using i_size here
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f46cfe45d686..54c84daec9b5 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -756,7 +756,7 @@ again:
756 found_next = 1; 756 found_next = 1;
757 if (ret != 0) 757 if (ret != 0)
758 goto insert; 758 goto insert;
759 slot = 0; 759 slot = path->slots[0];
760 } 760 }
761 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); 761 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
762 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || 762 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1f2b99cb55ea..ff1cc0399b9a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1838,6 +1838,8 @@ out:
1838 1838
1839int btrfs_release_file(struct inode *inode, struct file *filp) 1839int btrfs_release_file(struct inode *inode, struct file *filp)
1840{ 1840{
1841 if (filp->private_data)
1842 btrfs_ioctl_trans_end(filp);
1841 /* 1843 /*
1842 * ordered_data_close is set by setattr when we are about to truncate 1844 * ordered_data_close is set by setattr when we are about to truncate
1843 * a file from a non-zero size to a zero size. This tries to 1845 * a file from a non-zero size to a zero size. This tries to
@@ -1845,26 +1847,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1845 * application were using truncate to replace a file in place. 1847 * application were using truncate to replace a file in place.
1846 */ 1848 */
1847 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 1849 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1848 &BTRFS_I(inode)->runtime_flags)) { 1850 &BTRFS_I(inode)->runtime_flags))
1849 struct btrfs_trans_handle *trans;
1850 struct btrfs_root *root = BTRFS_I(inode)->root;
1851
1852 /*
1853 * We need to block on a committing transaction to keep us from
1854 * throwing a ordered operation on to the list and causing
1855 * something like sync to deadlock trying to flush out this
1856 * inode.
1857 */
1858 trans = btrfs_start_transaction(root, 0);
1859 if (IS_ERR(trans))
1860 return PTR_ERR(trans);
1861 btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
1862 btrfs_end_transaction(trans, root);
1863 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1864 filemap_flush(inode->i_mapping); 1851 filemap_flush(inode->i_mapping);
1865 }
1866 if (filp->private_data)
1867 btrfs_ioctl_trans_end(filp);
1868 return 0; 1852 return 0;
1869} 1853}
1870 1854
@@ -1982,7 +1966,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1982 1966
1983 btrfs_init_log_ctx(&ctx); 1967 btrfs_init_log_ctx(&ctx);
1984 1968
1985 ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); 1969 ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
1986 if (ret < 0) { 1970 if (ret < 0) {
1987 /* Fallthrough and commit/free transaction. */ 1971 /* Fallthrough and commit/free transaction. */
1988 ret = 1; 1972 ret = 1;
@@ -2112,10 +2096,9 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
2112 goto out; 2096 goto out;
2113 } 2097 }
2114 2098
2115 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { 2099 if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2116 u64 num_bytes; 2100 u64 num_bytes;
2117 2101
2118 path->slots[0]++;
2119 key.offset = offset; 2102 key.offset = offset;
2120 btrfs_set_item_key_safe(root, path, &key); 2103 btrfs_set_item_key_safe(root, path, &key);
2121 fi = btrfs_item_ptr(leaf, path->slots[0], 2104 fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -2240,7 +2223,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2240 goto out_only_mutex; 2223 goto out_only_mutex;
2241 } 2224 }
2242 2225
2243 lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); 2226 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2244 lockend = round_down(offset + len, 2227 lockend = round_down(offset + len,
2245 BTRFS_I(inode)->root->sectorsize) - 1; 2228 BTRFS_I(inode)->root->sectorsize) - 1;
2246 same_page = ((offset >> PAGE_CACHE_SHIFT) == 2229 same_page = ((offset >> PAGE_CACHE_SHIFT) ==
@@ -2301,7 +2284,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2301 tail_start + tail_len, 0, 1); 2284 tail_start + tail_len, 0, 1);
2302 if (ret) 2285 if (ret)
2303 goto out_only_mutex; 2286 goto out_only_mutex;
2304 } 2287 }
2305 } 2288 }
2306 } 2289 }
2307 2290
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3183742d6f0d..016c403bfe7e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -709,6 +709,18 @@ retry:
709 unlock_extent(io_tree, async_extent->start, 709 unlock_extent(io_tree, async_extent->start,
710 async_extent->start + 710 async_extent->start +
711 async_extent->ram_size - 1); 711 async_extent->ram_size - 1);
712
713 /*
714 * we need to redirty the pages if we decide to
715 * fall back to uncompressed IO, otherwise we
716 * will not submit these pages down to lower
717 * layers.
718 */
719 extent_range_redirty_for_io(inode,
720 async_extent->start,
721 async_extent->start +
722 async_extent->ram_size - 1);
723
712 goto retry; 724 goto retry;
713 } 725 }
714 goto out_free; 726 goto out_free;
@@ -766,8 +778,12 @@ retry:
766 ins.offset, 778 ins.offset,
767 BTRFS_ORDERED_COMPRESSED, 779 BTRFS_ORDERED_COMPRESSED,
768 async_extent->compress_type); 780 async_extent->compress_type);
769 if (ret) 781 if (ret) {
782 btrfs_drop_extent_cache(inode, async_extent->start,
783 async_extent->start +
784 async_extent->ram_size - 1, 0);
770 goto out_free_reserve; 785 goto out_free_reserve;
786 }
771 787
772 /* 788 /*
773 * clear dirty, set writeback and unlock the pages. 789 * clear dirty, set writeback and unlock the pages.
@@ -959,14 +975,14 @@ static noinline int cow_file_range(struct inode *inode,
959 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 975 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
960 ram_size, cur_alloc_size, 0); 976 ram_size, cur_alloc_size, 0);
961 if (ret) 977 if (ret)
962 goto out_reserve; 978 goto out_drop_extent_cache;
963 979
964 if (root->root_key.objectid == 980 if (root->root_key.objectid ==
965 BTRFS_DATA_RELOC_TREE_OBJECTID) { 981 BTRFS_DATA_RELOC_TREE_OBJECTID) {
966 ret = btrfs_reloc_clone_csums(inode, start, 982 ret = btrfs_reloc_clone_csums(inode, start,
967 cur_alloc_size); 983 cur_alloc_size);
968 if (ret) 984 if (ret)
969 goto out_reserve; 985 goto out_drop_extent_cache;
970 } 986 }
971 987
972 if (disk_num_bytes < cur_alloc_size) 988 if (disk_num_bytes < cur_alloc_size)
@@ -994,6 +1010,8 @@ static noinline int cow_file_range(struct inode *inode,
994out: 1010out:
995 return ret; 1011 return ret;
996 1012
1013out_drop_extent_cache:
1014 btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
997out_reserve: 1015out_reserve:
998 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 1016 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
999out_unlock: 1017out_unlock:
@@ -1084,8 +1102,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1084 async_cow->end = cur_end; 1102 async_cow->end = cur_end;
1085 INIT_LIST_HEAD(&async_cow->extents); 1103 INIT_LIST_HEAD(&async_cow->extents);
1086 1104
1087 btrfs_init_work(&async_cow->work, async_cow_start, 1105 btrfs_init_work(&async_cow->work,
1088 async_cow_submit, async_cow_free); 1106 btrfs_delalloc_helper,
1107 async_cow_start, async_cow_submit,
1108 async_cow_free);
1089 1109
1090 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1110 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1091 PAGE_CACHE_SHIFT; 1111 PAGE_CACHE_SHIFT;
@@ -1869,7 +1889,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1869 1889
1870 SetPageChecked(page); 1890 SetPageChecked(page);
1871 page_cache_get(page); 1891 page_cache_get(page);
1872 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); 1892 btrfs_init_work(&fixup->work, btrfs_fixup_helper,
1893 btrfs_writepage_fixup_worker, NULL, NULL);
1873 fixup->page = page; 1894 fixup->page = page;
1874 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); 1895 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
1875 return -EBUSY; 1896 return -EBUSY;
@@ -2810,7 +2831,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2810 struct inode *inode = page->mapping->host; 2831 struct inode *inode = page->mapping->host;
2811 struct btrfs_root *root = BTRFS_I(inode)->root; 2832 struct btrfs_root *root = BTRFS_I(inode)->root;
2812 struct btrfs_ordered_extent *ordered_extent = NULL; 2833 struct btrfs_ordered_extent *ordered_extent = NULL;
2813 struct btrfs_workqueue *workers; 2834 struct btrfs_workqueue *wq;
2835 btrfs_work_func_t func;
2814 2836
2815 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2837 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2816 2838
@@ -2819,13 +2841,17 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2819 end - start + 1, uptodate)) 2841 end - start + 1, uptodate))
2820 return 0; 2842 return 0;
2821 2843
2822 btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); 2844 if (btrfs_is_free_space_inode(inode)) {
2845 wq = root->fs_info->endio_freespace_worker;
2846 func = btrfs_freespace_write_helper;
2847 } else {
2848 wq = root->fs_info->endio_write_workers;
2849 func = btrfs_endio_write_helper;
2850 }
2823 2851
2824 if (btrfs_is_free_space_inode(inode)) 2852 btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
2825 workers = root->fs_info->endio_freespace_worker; 2853 NULL);
2826 else 2854 btrfs_queue_work(wq, &ordered_extent->work);
2827 workers = root->fs_info->endio_write_workers;
2828 btrfs_queue_work(workers, &ordered_extent->work);
2829 2855
2830 return 0; 2856 return 0;
2831} 2857}
@@ -4222,7 +4248,8 @@ out:
4222 btrfs_abort_transaction(trans, root, ret); 4248 btrfs_abort_transaction(trans, root, ret);
4223 } 4249 }
4224error: 4250error:
4225 if (last_size != (u64)-1) 4251 if (last_size != (u64)-1 &&
4252 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4226 btrfs_ordered_update_i_size(inode, last_size, NULL); 4253 btrfs_ordered_update_i_size(inode, last_size, NULL);
4227 btrfs_free_path(path); 4254 btrfs_free_path(path);
4228 return err; 4255 return err;
@@ -4662,6 +4689,11 @@ static void evict_inode_truncate_pages(struct inode *inode)
4662 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 4689 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
4663 remove_extent_mapping(map_tree, em); 4690 remove_extent_mapping(map_tree, em);
4664 free_extent_map(em); 4691 free_extent_map(em);
4692 if (need_resched()) {
4693 write_unlock(&map_tree->lock);
4694 cond_resched();
4695 write_lock(&map_tree->lock);
4696 }
4665 } 4697 }
4666 write_unlock(&map_tree->lock); 4698 write_unlock(&map_tree->lock);
4667 4699
@@ -4684,6 +4716,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
4684 &cached_state, GFP_NOFS); 4716 &cached_state, GFP_NOFS);
4685 free_extent_state(state); 4717 free_extent_state(state);
4686 4718
4719 cond_resched();
4687 spin_lock(&io_tree->lock); 4720 spin_lock(&io_tree->lock);
4688 } 4721 }
4689 spin_unlock(&io_tree->lock); 4722 spin_unlock(&io_tree->lock);
@@ -5169,6 +5202,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5169 iput(inode); 5202 iput(inode);
5170 inode = ERR_PTR(ret); 5203 inode = ERR_PTR(ret);
5171 } 5204 }
5205 /*
5206 * If orphan cleanup did remove any orphans, it means the tree
5207 * was modified and therefore the commit root is not the same as
5208 * the current root anymore. This is a problem, because send
5209 * uses the commit root and therefore can see inode items that
5210 * don't exist in the current root anymore, and for example make
5211 * calls to btrfs_iget, which will do tree lookups based on the
5212 * current root and not on the commit root. Those lookups will
5213 * fail, returning a -ESTALE error, and making send fail with
5214 * that error. So make sure a send does not see any orphans we
5215 * have just removed, and that it will see the same inodes
5216 * regardless of whether a transaction commit happened before
5217 * it started (meaning that the commit root will be the same as
5218 * the current root) or not.
5219 */
5220 if (sub_root->node != sub_root->commit_root) {
5221 u64 sub_flags = btrfs_root_flags(&sub_root->root_item);
5222
5223 if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) {
5224 struct extent_buffer *eb;
5225
5226 /*
5227 * Assert we can't have races between dentry
5228 * lookup called through the snapshot creation
5229 * ioctl and the VFS.
5230 */
5231 ASSERT(mutex_is_locked(&dir->i_mutex));
5232
5233 down_write(&root->fs_info->commit_root_sem);
5234 eb = sub_root->commit_root;
5235 sub_root->commit_root =
5236 btrfs_root_node(sub_root);
5237 up_write(&root->fs_info->commit_root_sem);
5238 free_extent_buffer(eb);
5239 }
5240 }
5172 } 5241 }
5173 5242
5174 return inode; 5243 return inode;
@@ -5565,6 +5634,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
5565 return ret; 5634 return ret;
5566} 5635}
5567 5636
5637static int btrfs_insert_inode_locked(struct inode *inode)
5638{
5639 struct btrfs_iget_args args;
5640 args.location = &BTRFS_I(inode)->location;
5641 args.root = BTRFS_I(inode)->root;
5642
5643 return insert_inode_locked4(inode,
5644 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
5645 btrfs_find_actor, &args);
5646}
5647
5568static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 5648static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5569 struct btrfs_root *root, 5649 struct btrfs_root *root,
5570 struct inode *dir, 5650 struct inode *dir,
@@ -5594,6 +5674,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5594 } 5674 }
5595 5675
5596 /* 5676 /*
5677 * O_TMPFILE, set link count to 0, so that after this point,
5678 * we fill in an inode item with the correct link count.
5679 */
5680 if (!name)
5681 set_nlink(inode, 0);
5682
5683 /*
5597 * we have to initialize this early, so we can reclaim the inode 5684 * we have to initialize this early, so we can reclaim the inode
5598 * number if we fail afterwards in this function. 5685 * number if we fail afterwards in this function.
5599 */ 5686 */
@@ -5650,10 +5737,19 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5650 sizes[1] = name_len + sizeof(*ref); 5737 sizes[1] = name_len + sizeof(*ref);
5651 } 5738 }
5652 5739
5740 location = &BTRFS_I(inode)->location;
5741 location->objectid = objectid;
5742 location->offset = 0;
5743 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
5744
5745 ret = btrfs_insert_inode_locked(inode);
5746 if (ret < 0)
5747 goto fail;
5748
5653 path->leave_spinning = 1; 5749 path->leave_spinning = 1;
5654 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); 5750 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
5655 if (ret != 0) 5751 if (ret != 0)
5656 goto fail; 5752 goto fail_unlock;
5657 5753
5658 inode_init_owner(inode, dir, mode); 5754 inode_init_owner(inode, dir, mode);
5659 inode_set_bytes(inode, 0); 5755 inode_set_bytes(inode, 0);
@@ -5676,11 +5772,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5676 btrfs_mark_buffer_dirty(path->nodes[0]); 5772 btrfs_mark_buffer_dirty(path->nodes[0]);
5677 btrfs_free_path(path); 5773 btrfs_free_path(path);
5678 5774
5679 location = &BTRFS_I(inode)->location;
5680 location->objectid = objectid;
5681 location->offset = 0;
5682 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
5683
5684 btrfs_inherit_iflags(inode, dir); 5775 btrfs_inherit_iflags(inode, dir);
5685 5776
5686 if (S_ISREG(mode)) { 5777 if (S_ISREG(mode)) {
@@ -5691,7 +5782,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5691 BTRFS_INODE_NODATASUM; 5782 BTRFS_INODE_NODATASUM;
5692 } 5783 }
5693 5784
5694 btrfs_insert_inode_hash(inode);
5695 inode_tree_add(inode); 5785 inode_tree_add(inode);
5696 5786
5697 trace_btrfs_inode_new(inode); 5787 trace_btrfs_inode_new(inode);
@@ -5706,6 +5796,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5706 btrfs_ino(inode), root->root_key.objectid, ret); 5796 btrfs_ino(inode), root->root_key.objectid, ret);
5707 5797
5708 return inode; 5798 return inode;
5799
5800fail_unlock:
5801 unlock_new_inode(inode);
5709fail: 5802fail:
5710 if (dir && name) 5803 if (dir && name)
5711 BTRFS_I(dir)->index_cnt--; 5804 BTRFS_I(dir)->index_cnt--;
@@ -5840,28 +5933,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5840 goto out_unlock; 5933 goto out_unlock;
5841 } 5934 }
5842 5935
5843 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5844 if (err) {
5845 drop_inode = 1;
5846 goto out_unlock;
5847 }
5848
5849 /* 5936 /*
5850 * If the active LSM wants to access the inode during 5937 * If the active LSM wants to access the inode during
5851 * d_instantiate it needs these. Smack checks to see 5938 * d_instantiate it needs these. Smack checks to see
5852 * if the filesystem supports xattrs by looking at the 5939 * if the filesystem supports xattrs by looking at the
5853 * ops vector. 5940 * ops vector.
5854 */ 5941 */
5855
5856 inode->i_op = &btrfs_special_inode_operations; 5942 inode->i_op = &btrfs_special_inode_operations;
5857 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 5943 init_special_inode(inode, inode->i_mode, rdev);
5944
5945 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5858 if (err) 5946 if (err)
5859 drop_inode = 1; 5947 goto out_unlock_inode;
5860 else { 5948
5861 init_special_inode(inode, inode->i_mode, rdev); 5949 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5950 if (err) {
5951 goto out_unlock_inode;
5952 } else {
5862 btrfs_update_inode(trans, root, inode); 5953 btrfs_update_inode(trans, root, inode);
5954 unlock_new_inode(inode);
5863 d_instantiate(dentry, inode); 5955 d_instantiate(dentry, inode);
5864 } 5956 }
5957
5865out_unlock: 5958out_unlock:
5866 btrfs_end_transaction(trans, root); 5959 btrfs_end_transaction(trans, root);
5867 btrfs_balance_delayed_items(root); 5960 btrfs_balance_delayed_items(root);
@@ -5871,6 +5964,12 @@ out_unlock:
5871 iput(inode); 5964 iput(inode);
5872 } 5965 }
5873 return err; 5966 return err;
5967
5968out_unlock_inode:
5969 drop_inode = 1;
5970 unlock_new_inode(inode);
5971 goto out_unlock;
5972
5874} 5973}
5875 5974
5876static int btrfs_create(struct inode *dir, struct dentry *dentry, 5975static int btrfs_create(struct inode *dir, struct dentry *dentry,
@@ -5905,15 +6004,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
5905 goto out_unlock; 6004 goto out_unlock;
5906 } 6005 }
5907 drop_inode_on_err = 1; 6006 drop_inode_on_err = 1;
5908
5909 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5910 if (err)
5911 goto out_unlock;
5912
5913 err = btrfs_update_inode(trans, root, inode);
5914 if (err)
5915 goto out_unlock;
5916
5917 /* 6007 /*
5918 * If the active LSM wants to access the inode during 6008 * If the active LSM wants to access the inode during
5919 * d_instantiate it needs these. Smack checks to see 6009 * d_instantiate it needs these. Smack checks to see
@@ -5922,14 +6012,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
5922 */ 6012 */
5923 inode->i_fop = &btrfs_file_operations; 6013 inode->i_fop = &btrfs_file_operations;
5924 inode->i_op = &btrfs_file_inode_operations; 6014 inode->i_op = &btrfs_file_inode_operations;
6015 inode->i_mapping->a_ops = &btrfs_aops;
6016 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
6017
6018 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6019 if (err)
6020 goto out_unlock_inode;
6021
6022 err = btrfs_update_inode(trans, root, inode);
6023 if (err)
6024 goto out_unlock_inode;
5925 6025
5926 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 6026 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5927 if (err) 6027 if (err)
5928 goto out_unlock; 6028 goto out_unlock_inode;
5929 6029
5930 inode->i_mapping->a_ops = &btrfs_aops;
5931 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
5932 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 6030 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
6031 unlock_new_inode(inode);
5933 d_instantiate(dentry, inode); 6032 d_instantiate(dentry, inode);
5934 6033
5935out_unlock: 6034out_unlock:
@@ -5941,6 +6040,11 @@ out_unlock:
5941 btrfs_balance_delayed_items(root); 6040 btrfs_balance_delayed_items(root);
5942 btrfs_btree_balance_dirty(root); 6041 btrfs_btree_balance_dirty(root);
5943 return err; 6042 return err;
6043
6044out_unlock_inode:
6045 unlock_new_inode(inode);
6046 goto out_unlock;
6047
5944} 6048}
5945 6049
5946static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 6050static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -6048,25 +6152,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
6048 } 6152 }
6049 6153
6050 drop_on_err = 1; 6154 drop_on_err = 1;
6155 /* these must be set before we unlock the inode */
6156 inode->i_op = &btrfs_dir_inode_operations;
6157 inode->i_fop = &btrfs_dir_file_operations;
6051 6158
6052 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6159 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6053 if (err) 6160 if (err)
6054 goto out_fail; 6161 goto out_fail_inode;
6055
6056 inode->i_op = &btrfs_dir_inode_operations;
6057 inode->i_fop = &btrfs_dir_file_operations;
6058 6162
6059 btrfs_i_size_write(inode, 0); 6163 btrfs_i_size_write(inode, 0);
6060 err = btrfs_update_inode(trans, root, inode); 6164 err = btrfs_update_inode(trans, root, inode);
6061 if (err) 6165 if (err)
6062 goto out_fail; 6166 goto out_fail_inode;
6063 6167
6064 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, 6168 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
6065 dentry->d_name.len, 0, index); 6169 dentry->d_name.len, 0, index);
6066 if (err) 6170 if (err)
6067 goto out_fail; 6171 goto out_fail_inode;
6068 6172
6069 d_instantiate(dentry, inode); 6173 d_instantiate(dentry, inode);
6174 /*
6175 * mkdir is special. We're unlocking after we call d_instantiate
6176 * to avoid a race with nfsd calling d_instantiate.
6177 */
6178 unlock_new_inode(inode);
6070 drop_on_err = 0; 6179 drop_on_err = 0;
6071 6180
6072out_fail: 6181out_fail:
@@ -6076,6 +6185,10 @@ out_fail:
6076 btrfs_balance_delayed_items(root); 6185 btrfs_balance_delayed_items(root);
6077 btrfs_btree_balance_dirty(root); 6186 btrfs_btree_balance_dirty(root);
6078 return err; 6187 return err;
6188
6189out_fail_inode:
6190 unlock_new_inode(inode);
6191 goto out_fail;
6079} 6192}
6080 6193
6081/* helper for btrfs_get_extent. Given an existing extent in the tree, 6194/* helper for btrfs_get_extent. Given an existing extent in the tree,
@@ -6085,14 +6198,14 @@ out_fail:
6085static int merge_extent_mapping(struct extent_map_tree *em_tree, 6198static int merge_extent_mapping(struct extent_map_tree *em_tree,
6086 struct extent_map *existing, 6199 struct extent_map *existing,
6087 struct extent_map *em, 6200 struct extent_map *em,
6088 u64 map_start, u64 map_len) 6201 u64 map_start)
6089{ 6202{
6090 u64 start_diff; 6203 u64 start_diff;
6091 6204
6092 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 6205 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
6093 start_diff = map_start - em->start; 6206 start_diff = map_start - em->start;
6094 em->start = map_start; 6207 em->start = map_start;
6095 em->len = map_len; 6208 em->len = existing->start - em->start;
6096 if (em->block_start < EXTENT_MAP_LAST_BYTE && 6209 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
6097 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 6210 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
6098 em->block_start += start_diff; 6211 em->block_start += start_diff;
@@ -6263,6 +6376,8 @@ next:
6263 goto not_found; 6376 goto not_found;
6264 if (start + len <= found_key.offset) 6377 if (start + len <= found_key.offset)
6265 goto not_found; 6378 goto not_found;
6379 if (start > found_key.offset)
6380 goto next;
6266 em->start = start; 6381 em->start = start;
6267 em->orig_start = start; 6382 em->orig_start = start;
6268 em->len = found_key.offset - start; 6383 em->len = found_key.offset - start;
@@ -6378,8 +6493,7 @@ insert:
6378 em->len); 6493 em->len);
6379 if (existing) { 6494 if (existing) {
6380 err = merge_extent_mapping(em_tree, existing, 6495 err = merge_extent_mapping(em_tree, existing,
6381 em, start, 6496 em, start);
6382 root->sectorsize);
6383 free_extent_map(existing); 6497 free_extent_map(existing);
6384 if (err) { 6498 if (err) {
6385 free_extent_map(em); 6499 free_extent_map(em);
@@ -7146,7 +7260,8 @@ again:
7146 if (!ret) 7260 if (!ret)
7147 goto out_test; 7261 goto out_test;
7148 7262
7149 btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); 7263 btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
7264 finish_ordered_fn, NULL, NULL);
7150 btrfs_queue_work(root->fs_info->endio_write_workers, 7265 btrfs_queue_work(root->fs_info->endio_write_workers,
7151 &ordered->work); 7266 &ordered->work);
7152out_test: 7267out_test:
@@ -7294,10 +7409,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7294 map_length = orig_bio->bi_iter.bi_size; 7409 map_length = orig_bio->bi_iter.bi_size;
7295 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, 7410 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
7296 &map_length, NULL, 0); 7411 &map_length, NULL, 0);
7297 if (ret) { 7412 if (ret)
7298 bio_put(orig_bio);
7299 return -EIO; 7413 return -EIO;
7300 }
7301 7414
7302 if (map_length >= orig_bio->bi_iter.bi_size) { 7415 if (map_length >= orig_bio->bi_iter.bi_size) {
7303 bio = orig_bio; 7416 bio = orig_bio;
@@ -7314,6 +7427,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7314 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 7427 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
7315 if (!bio) 7428 if (!bio)
7316 return -ENOMEM; 7429 return -ENOMEM;
7430
7317 bio->bi_private = dip; 7431 bio->bi_private = dip;
7318 bio->bi_end_io = btrfs_end_dio_bio; 7432 bio->bi_end_io = btrfs_end_dio_bio;
7319 atomic_inc(&dip->pending_bios); 7433 atomic_inc(&dip->pending_bios);
@@ -7522,7 +7636,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7522 count = iov_iter_count(iter); 7636 count = iov_iter_count(iter);
7523 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 7637 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7524 &BTRFS_I(inode)->runtime_flags)) 7638 &BTRFS_I(inode)->runtime_flags))
7525 filemap_fdatawrite_range(inode->i_mapping, offset, count); 7639 filemap_fdatawrite_range(inode->i_mapping, offset,
7640 offset + count - 1);
7526 7641
7527 if (rw & WRITE) { 7642 if (rw & WRITE) {
7528 /* 7643 /*
@@ -7939,27 +8054,6 @@ static int btrfs_truncate(struct inode *inode)
7939 BUG_ON(ret); 8054 BUG_ON(ret);
7940 8055
7941 /* 8056 /*
7942 * setattr is responsible for setting the ordered_data_close flag,
7943 * but that is only tested during the last file release. That
7944 * could happen well after the next commit, leaving a great big
7945 * window where new writes may get lost if someone chooses to write
7946 * to this file after truncating to zero
7947 *
7948 * The inode doesn't have any dirty data here, and so if we commit
7949 * this is a noop. If someone immediately starts writing to the inode
7950 * it is very likely we'll catch some of their writes in this
7951 * transaction, and the commit will find this file on the ordered
7952 * data list with good things to send down.
7953 *
7954 * This is a best effort solution, there is still a window where
7955 * using truncate to replace the contents of the file will
7956 * end up with a zero length file after a crash.
7957 */
7958 if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
7959 &BTRFS_I(inode)->runtime_flags))
7960 btrfs_add_ordered_operation(trans, root, inode);
7961
7962 /*
7963 * So if we truncate and then write and fsync we normally would just 8057 * So if we truncate and then write and fsync we normally would just
7964 * write the extents that changed, which is a problem if we need to 8058 * write the extents that changed, which is a problem if we need to
7965 * first truncate that entire inode. So set this flag so we write out 8059 * first truncate that entire inode. So set this flag so we write out
@@ -8050,6 +8144,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8050 8144
8051 set_nlink(inode, 1); 8145 set_nlink(inode, 1);
8052 btrfs_i_size_write(inode, 0); 8146 btrfs_i_size_write(inode, 0);
8147 unlock_new_inode(inode);
8053 8148
8054 err = btrfs_subvol_inherit_props(trans, new_root, parent_root); 8149 err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
8055 if (err) 8150 if (err)
@@ -8106,7 +8201,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8106 mutex_init(&ei->delalloc_mutex); 8201 mutex_init(&ei->delalloc_mutex);
8107 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 8202 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8108 INIT_LIST_HEAD(&ei->delalloc_inodes); 8203 INIT_LIST_HEAD(&ei->delalloc_inodes);
8109 INIT_LIST_HEAD(&ei->ordered_operations);
8110 RB_CLEAR_NODE(&ei->rb_node); 8204 RB_CLEAR_NODE(&ei->rb_node);
8111 8205
8112 return inode; 8206 return inode;
@@ -8146,17 +8240,6 @@ void btrfs_destroy_inode(struct inode *inode)
8146 if (!root) 8240 if (!root)
8147 goto free; 8241 goto free;
8148 8242
8149 /*
8150 * Make sure we're properly removed from the ordered operation
8151 * lists.
8152 */
8153 smp_mb();
8154 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
8155 spin_lock(&root->fs_info->ordered_root_lock);
8156 list_del_init(&BTRFS_I(inode)->ordered_operations);
8157 spin_unlock(&root->fs_info->ordered_root_lock);
8158 }
8159
8160 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 8243 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
8161 &BTRFS_I(inode)->runtime_flags)) { 8244 &BTRFS_I(inode)->runtime_flags)) {
8162 btrfs_info(root->fs_info, "inode %llu still on the orphan list", 8245 btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8338,12 +8421,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8338 ret = 0; 8421 ret = 0;
8339 8422
8340 /* 8423 /*
8341 * we're using rename to replace one file with another. 8424 * we're using rename to replace one file with another. Start IO on it
8342 * and the replacement file is large. Start IO on it now so 8425 * now so we don't add too much work to the end of the transaction
8343 * we don't add too much work to the end of the transaction
8344 */ 8426 */
8345 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && 8427 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8346 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
8347 filemap_flush(old_inode->i_mapping); 8428 filemap_flush(old_inode->i_mapping);
8348 8429
8349 /* close the racy window with snapshot create/destroy ioctl */ 8430 /* close the racy window with snapshot create/destroy ioctl */
@@ -8391,12 +8472,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8391 */ 8472 */
8392 btrfs_pin_log_trans(root); 8473 btrfs_pin_log_trans(root);
8393 } 8474 }
8394 /*
8395 * make sure the inode gets flushed if it is replacing
8396 * something.
8397 */
8398 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
8399 btrfs_add_ordered_operation(trans, root, old_inode);
8400 8475
8401 inode_inc_iversion(old_dir); 8476 inode_inc_iversion(old_dir);
8402 inode_inc_iversion(new_dir); 8477 inode_inc_iversion(new_dir);
@@ -8524,7 +8599,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8524 work->inode = inode; 8599 work->inode = inode;
8525 work->wait = wait; 8600 work->wait = wait;
8526 work->delay_iput = delay_iput; 8601 work->delay_iput = delay_iput;
8527 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); 8602 WARN_ON_ONCE(!inode);
8603 btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
8604 btrfs_run_delalloc_work, NULL, NULL);
8528 8605
8529 return work; 8606 return work;
8530} 8607}
@@ -8728,12 +8805,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8728 goto out_unlock; 8805 goto out_unlock;
8729 } 8806 }
8730 8807
8731 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
8732 if (err) {
8733 drop_inode = 1;
8734 goto out_unlock;
8735 }
8736
8737 /* 8808 /*
8738 * If the active LSM wants to access the inode during 8809 * If the active LSM wants to access the inode during
8739 * d_instantiate it needs these. Smack checks to see 8810 * d_instantiate it needs these. Smack checks to see
@@ -8742,23 +8813,22 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8742 */ 8813 */
8743 inode->i_fop = &btrfs_file_operations; 8814 inode->i_fop = &btrfs_file_operations;
8744 inode->i_op = &btrfs_file_inode_operations; 8815 inode->i_op = &btrfs_file_inode_operations;
8816 inode->i_mapping->a_ops = &btrfs_aops;
8817 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8818 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8819
8820 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
8821 if (err)
8822 goto out_unlock_inode;
8745 8823
8746 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 8824 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
8747 if (err) 8825 if (err)
8748 drop_inode = 1; 8826 goto out_unlock_inode;
8749 else {
8750 inode->i_mapping->a_ops = &btrfs_aops;
8751 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8752 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8753 }
8754 if (drop_inode)
8755 goto out_unlock;
8756 8827
8757 path = btrfs_alloc_path(); 8828 path = btrfs_alloc_path();
8758 if (!path) { 8829 if (!path) {
8759 err = -ENOMEM; 8830 err = -ENOMEM;
8760 drop_inode = 1; 8831 goto out_unlock_inode;
8761 goto out_unlock;
8762 } 8832 }
8763 key.objectid = btrfs_ino(inode); 8833 key.objectid = btrfs_ino(inode);
8764 key.offset = 0; 8834 key.offset = 0;
@@ -8767,9 +8837,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8767 err = btrfs_insert_empty_item(trans, root, path, &key, 8837 err = btrfs_insert_empty_item(trans, root, path, &key,
8768 datasize); 8838 datasize);
8769 if (err) { 8839 if (err) {
8770 drop_inode = 1;
8771 btrfs_free_path(path); 8840 btrfs_free_path(path);
8772 goto out_unlock; 8841 goto out_unlock_inode;
8773 } 8842 }
8774 leaf = path->nodes[0]; 8843 leaf = path->nodes[0];
8775 ei = btrfs_item_ptr(leaf, path->slots[0], 8844 ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -8793,12 +8862,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8793 inode_set_bytes(inode, name_len); 8862 inode_set_bytes(inode, name_len);
8794 btrfs_i_size_write(inode, name_len); 8863 btrfs_i_size_write(inode, name_len);
8795 err = btrfs_update_inode(trans, root, inode); 8864 err = btrfs_update_inode(trans, root, inode);
8796 if (err) 8865 if (err) {
8797 drop_inode = 1; 8866 drop_inode = 1;
8867 goto out_unlock_inode;
8868 }
8869
8870 unlock_new_inode(inode);
8871 d_instantiate(dentry, inode);
8798 8872
8799out_unlock: 8873out_unlock:
8800 if (!err)
8801 d_instantiate(dentry, inode);
8802 btrfs_end_transaction(trans, root); 8874 btrfs_end_transaction(trans, root);
8803 if (drop_inode) { 8875 if (drop_inode) {
8804 inode_dec_link_count(inode); 8876 inode_dec_link_count(inode);
@@ -8806,6 +8878,11 @@ out_unlock:
8806 } 8878 }
8807 btrfs_btree_balance_dirty(root); 8879 btrfs_btree_balance_dirty(root);
8808 return err; 8880 return err;
8881
8882out_unlock_inode:
8883 drop_inode = 1;
8884 unlock_new_inode(inode);
8885 goto out_unlock;
8809} 8886}
8810 8887
8811static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 8888static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -8989,14 +9066,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
8989 goto out; 9066 goto out;
8990 } 9067 }
8991 9068
8992 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
8993 if (ret)
8994 goto out;
8995
8996 ret = btrfs_update_inode(trans, root, inode);
8997 if (ret)
8998 goto out;
8999
9000 inode->i_fop = &btrfs_file_operations; 9069 inode->i_fop = &btrfs_file_operations;
9001 inode->i_op = &btrfs_file_inode_operations; 9070 inode->i_op = &btrfs_file_inode_operations;
9002 9071
@@ -9004,10 +9073,26 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
9004 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 9073 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9005 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9074 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9006 9075
9076 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
9077 if (ret)
9078 goto out_inode;
9079
9080 ret = btrfs_update_inode(trans, root, inode);
9081 if (ret)
9082 goto out_inode;
9007 ret = btrfs_orphan_add(trans, inode); 9083 ret = btrfs_orphan_add(trans, inode);
9008 if (ret) 9084 if (ret)
9009 goto out; 9085 goto out_inode;
9010 9086
9087 /*
9088 * We set number of links to 0 in btrfs_new_inode(), and here we set
9089 * it to 1 because d_tmpfile() will issue a warning if the count is 0,
9090 * through:
9091 *
9092 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9093 */
9094 set_nlink(inode, 1);
9095 unlock_new_inode(inode);
9011 d_tmpfile(dentry, inode); 9096 d_tmpfile(dentry, inode);
9012 mark_inode_dirty(inode); 9097 mark_inode_dirty(inode);
9013 9098
@@ -9017,8 +9102,12 @@ out:
9017 iput(inode); 9102 iput(inode);
9018 btrfs_balance_delayed_items(root); 9103 btrfs_balance_delayed_items(root);
9019 btrfs_btree_balance_dirty(root); 9104 btrfs_btree_balance_dirty(root);
9020
9021 return ret; 9105 return ret;
9106
9107out_inode:
9108 unlock_new_inode(inode);
9109 goto out;
9110
9022} 9111}
9023 9112
9024static const struct inode_operations btrfs_dir_inode_operations = { 9113static const struct inode_operations btrfs_dir_inode_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 47aceb494d1d..8a8e29878c34 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -711,39 +711,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
711 if (ret) 711 if (ret)
712 goto fail; 712 goto fail;
713 713
714 ret = btrfs_orphan_cleanup(pending_snapshot->snap);
715 if (ret)
716 goto fail;
717
718 /*
719 * If orphan cleanup did remove any orphans, it means the tree was
720 * modified and therefore the commit root is not the same as the
721 * current root anymore. This is a problem, because send uses the
722 * commit root and therefore can see inode items that don't exist
723 * in the current root anymore, and for example make calls to
724 * btrfs_iget, which will do tree lookups based on the current root
725 * and not on the commit root. Those lookups will fail, returning a
726 * -ESTALE error, and making send fail with that error. So make sure
727 * a send does not see any orphans we have just removed, and that it
728 * will see the same inodes regardless of whether a transaction
729 * commit happened before it started (meaning that the commit root
730 * will be the same as the current root) or not.
731 */
732 if (readonly && pending_snapshot->snap->node !=
733 pending_snapshot->snap->commit_root) {
734 trans = btrfs_join_transaction(pending_snapshot->snap);
735 if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
736 ret = PTR_ERR(trans);
737 goto fail;
738 }
739 if (!IS_ERR(trans)) {
740 ret = btrfs_commit_transaction(trans,
741 pending_snapshot->snap);
742 if (ret)
743 goto fail;
744 }
745 }
746
747 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 714 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
748 if (IS_ERR(inode)) { 715 if (IS_ERR(inode)) {
749 ret = PTR_ERR(inode); 716 ret = PTR_ERR(inode);
@@ -1052,8 +1019,10 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
1052 return false; 1019 return false;
1053 1020
1054 next = defrag_lookup_extent(inode, em->start + em->len); 1021 next = defrag_lookup_extent(inode, em->start + em->len);
1055 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE || 1022 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
1056 (em->block_start + em->block_len == next->block_start)) 1023 ret = false;
1024 else if ((em->block_start + em->block_len == next->block_start) &&
1025 (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
1057 ret = false; 1026 ret = false;
1058 1027
1059 free_extent_map(next); 1028 free_extent_map(next);
@@ -1088,7 +1057,6 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh,
1088 } 1057 }
1089 1058
1090 next_mergeable = defrag_check_next_extent(inode, em); 1059 next_mergeable = defrag_check_next_extent(inode, em);
1091
1092 /* 1060 /*
1093 * we hit a real extent, if it is big or the next extent is not a 1061 * we hit a real extent, if it is big or the next extent is not a
1094 * real extent, don't bother defragging it 1062 * real extent, don't bother defragging it
@@ -1735,7 +1703,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1735 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | 1703 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
1736 BTRFS_SUBVOL_QGROUP_INHERIT)) { 1704 BTRFS_SUBVOL_QGROUP_INHERIT)) {
1737 ret = -EOPNOTSUPP; 1705 ret = -EOPNOTSUPP;
1738 goto out; 1706 goto free_args;
1739 } 1707 }
1740 1708
1741 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) 1709 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
@@ -1745,27 +1713,31 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1745 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { 1713 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
1746 if (vol_args->size > PAGE_CACHE_SIZE) { 1714 if (vol_args->size > PAGE_CACHE_SIZE) {
1747 ret = -EINVAL; 1715 ret = -EINVAL;
1748 goto out; 1716 goto free_args;
1749 } 1717 }
1750 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); 1718 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
1751 if (IS_ERR(inherit)) { 1719 if (IS_ERR(inherit)) {
1752 ret = PTR_ERR(inherit); 1720 ret = PTR_ERR(inherit);
1753 goto out; 1721 goto free_args;
1754 } 1722 }
1755 } 1723 }
1756 1724
1757 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1725 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1758 vol_args->fd, subvol, ptr, 1726 vol_args->fd, subvol, ptr,
1759 readonly, inherit); 1727 readonly, inherit);
1728 if (ret)
1729 goto free_inherit;
1760 1730
1761 if (ret == 0 && ptr && 1731 if (ptr && copy_to_user(arg +
1762 copy_to_user(arg + 1732 offsetof(struct btrfs_ioctl_vol_args_v2,
1763 offsetof(struct btrfs_ioctl_vol_args_v2, 1733 transid),
1764 transid), ptr, sizeof(*ptr))) 1734 ptr, sizeof(*ptr)))
1765 ret = -EFAULT; 1735 ret = -EFAULT;
1766out: 1736
1767 kfree(vol_args); 1737free_inherit:
1768 kfree(inherit); 1738 kfree(inherit);
1739free_args:
1740 kfree(vol_args);
1769 return ret; 1741 return ret;
1770} 1742}
1771 1743
@@ -2685,7 +2657,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2685 vol_args = memdup_user(arg, sizeof(*vol_args)); 2657 vol_args = memdup_user(arg, sizeof(*vol_args));
2686 if (IS_ERR(vol_args)) { 2658 if (IS_ERR(vol_args)) {
2687 ret = PTR_ERR(vol_args); 2659 ret = PTR_ERR(vol_args);
2688 goto out; 2660 goto err_drop;
2689 } 2661 }
2690 2662
2691 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2663 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
@@ -2703,6 +2675,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2703 2675
2704out: 2676out:
2705 kfree(vol_args); 2677 kfree(vol_args);
2678err_drop:
2706 mnt_drop_write_file(file); 2679 mnt_drop_write_file(file);
2707 return ret; 2680 return ret;
2708} 2681}
@@ -3527,7 +3500,8 @@ process_slot:
3527 btrfs_mark_buffer_dirty(leaf); 3500 btrfs_mark_buffer_dirty(leaf);
3528 btrfs_release_path(path); 3501 btrfs_release_path(path);
3529 3502
3530 last_dest_end = new_key.offset + datal; 3503 last_dest_end = ALIGN(new_key.offset + datal,
3504 root->sectorsize);
3531 ret = clone_finish_inode_update(trans, inode, 3505 ret = clone_finish_inode_update(trans, inode,
3532 last_dest_end, 3506 last_dest_end,
3533 destoff, olen); 3507 destoff, olen);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7187b14faa6c..ac734ec4cc20 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
571 571
572 trace_btrfs_ordered_extent_remove(inode, entry); 572 trace_btrfs_ordered_extent_remove(inode, entry);
573 573
574 /*
575 * we have no more ordered extents for this inode and
576 * no dirty pages. We can safely remove it from the
577 * list of ordered extents
578 */
579 if (RB_EMPTY_ROOT(&tree->tree) &&
580 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
581 spin_lock(&root->fs_info->ordered_root_lock);
582 list_del_init(&BTRFS_I(inode)->ordered_operations);
583 spin_unlock(&root->fs_info->ordered_root_lock);
584 }
585
586 if (!root->nr_ordered_extents) { 574 if (!root->nr_ordered_extents) {
587 spin_lock(&root->fs_info->ordered_root_lock); 575 spin_lock(&root->fs_info->ordered_root_lock);
588 BUG_ON(list_empty(&root->ordered_root)); 576 BUG_ON(list_empty(&root->ordered_root));
@@ -627,6 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
627 spin_unlock(&root->ordered_extent_lock); 615 spin_unlock(&root->ordered_extent_lock);
628 616
629 btrfs_init_work(&ordered->flush_work, 617 btrfs_init_work(&ordered->flush_work,
618 btrfs_flush_delalloc_helper,
630 btrfs_run_ordered_extent_work, NULL, NULL); 619 btrfs_run_ordered_extent_work, NULL, NULL);
631 list_add_tail(&ordered->work_list, &works); 620 list_add_tail(&ordered->work_list, &works);
632 btrfs_queue_work(root->fs_info->flush_workers, 621 btrfs_queue_work(root->fs_info->flush_workers,
@@ -687,81 +676,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
687} 676}
688 677
689/* 678/*
690 * this is used during transaction commit to write all the inodes
691 * added to the ordered operation list. These files must be fully on
692 * disk before the transaction commits.
693 *
694 * we have two modes here, one is to just start the IO via filemap_flush
695 * and the other is to wait for all the io. When we wait, we have an
696 * extra check to make sure the ordered operation list really is empty
697 * before we return
698 */
699int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
700 struct btrfs_root *root, int wait)
701{
702 struct btrfs_inode *btrfs_inode;
703 struct inode *inode;
704 struct btrfs_transaction *cur_trans = trans->transaction;
705 struct list_head splice;
706 struct list_head works;
707 struct btrfs_delalloc_work *work, *next;
708 int ret = 0;
709
710 INIT_LIST_HEAD(&splice);
711 INIT_LIST_HEAD(&works);
712
713 mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
714 spin_lock(&root->fs_info->ordered_root_lock);
715 list_splice_init(&cur_trans->ordered_operations, &splice);
716 while (!list_empty(&splice)) {
717 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
718 ordered_operations);
719 inode = &btrfs_inode->vfs_inode;
720
721 list_del_init(&btrfs_inode->ordered_operations);
722
723 /*
724 * the inode may be getting freed (in sys_unlink path).
725 */
726 inode = igrab(inode);
727 if (!inode)
728 continue;
729
730 if (!wait)
731 list_add_tail(&BTRFS_I(inode)->ordered_operations,
732 &cur_trans->ordered_operations);
733 spin_unlock(&root->fs_info->ordered_root_lock);
734
735 work = btrfs_alloc_delalloc_work(inode, wait, 1);
736 if (!work) {
737 spin_lock(&root->fs_info->ordered_root_lock);
738 if (list_empty(&BTRFS_I(inode)->ordered_operations))
739 list_add_tail(&btrfs_inode->ordered_operations,
740 &splice);
741 list_splice_tail(&splice,
742 &cur_trans->ordered_operations);
743 spin_unlock(&root->fs_info->ordered_root_lock);
744 ret = -ENOMEM;
745 goto out;
746 }
747 list_add_tail(&work->list, &works);
748 btrfs_queue_work(root->fs_info->flush_workers,
749 &work->work);
750
751 cond_resched();
752 spin_lock(&root->fs_info->ordered_root_lock);
753 }
754 spin_unlock(&root->fs_info->ordered_root_lock);
755out:
756 list_for_each_entry_safe(work, next, &works, list) {
757 list_del_init(&work->list);
758 btrfs_wait_and_free_delalloc_work(work);
759 }
760 mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
761 return ret;
762}
763
764/*
765 * Used to start IO or wait for a given ordered extent to finish. 679 * Used to start IO or wait for a given ordered extent to finish.
766 * 680 *
767 * If wait is one, this effectively waits on page writeback for all the pages 681 * If wait is one, this effectively waits on page writeback for all the pages
@@ -1120,42 +1034,6 @@ out:
1120 return index; 1034 return index;
1121} 1035}
1122 1036
1123
1124/*
1125 * add a given inode to the list of inodes that must be fully on
1126 * disk before a transaction commit finishes.
1127 *
1128 * This basically gives us the ext3 style data=ordered mode, and it is mostly
1129 * used to make sure renamed files are fully on disk.
1130 *
1131 * It is a noop if the inode is already fully on disk.
1132 *
1133 * If trans is not null, we'll do a friendly check for a transaction that
1134 * is already flushing things and force the IO down ourselves.
1135 */
1136void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
1137 struct btrfs_root *root, struct inode *inode)
1138{
1139 struct btrfs_transaction *cur_trans = trans->transaction;
1140 u64 last_mod;
1141
1142 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
1143
1144 /*
1145 * if this file hasn't been changed since the last transaction
1146 * commit, we can safely return without doing anything
1147 */
1148 if (last_mod <= root->fs_info->last_trans_committed)
1149 return;
1150
1151 spin_lock(&root->fs_info->ordered_root_lock);
1152 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
1153 list_add_tail(&BTRFS_I(inode)->ordered_operations,
1154 &cur_trans->ordered_operations);
1155 }
1156 spin_unlock(&root->fs_info->ordered_root_lock);
1157}
1158
1159int __init ordered_data_init(void) 1037int __init ordered_data_init(void)
1160{ 1038{
1161 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", 1039 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 246897058efb..d81a274d621e 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
190 struct btrfs_ordered_extent *ordered); 190 struct btrfs_ordered_extent *ordered);
191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, 191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
192 u32 *sum, int len); 192 u32 *sum, int len);
193int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
194 struct btrfs_root *root, int wait);
195void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
196 struct btrfs_root *root,
197 struct inode *inode);
198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 193int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 194void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
200void btrfs_get_logged_extents(struct inode *inode, 195void btrfs_get_logged_extents(struct inode *inode,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 98cb6b2630f9..ded5c601d916 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1201,6 +1201,50 @@ out:
1201 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1201 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1202 return ret; 1202 return ret;
1203} 1203}
1204
1205static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
1206 struct btrfs_qgroup_operation *oper2)
1207{
1208 /*
1209 * Ignore seq and type here, we're looking for any operation
1210 * at all related to this extent on that root.
1211 */
1212 if (oper1->bytenr < oper2->bytenr)
1213 return -1;
1214 if (oper1->bytenr > oper2->bytenr)
1215 return 1;
1216 if (oper1->ref_root < oper2->ref_root)
1217 return -1;
1218 if (oper1->ref_root > oper2->ref_root)
1219 return 1;
1220 return 0;
1221}
1222
1223static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
1224 struct btrfs_qgroup_operation *oper)
1225{
1226 struct rb_node *n;
1227 struct btrfs_qgroup_operation *cur;
1228 int cmp;
1229
1230 spin_lock(&fs_info->qgroup_op_lock);
1231 n = fs_info->qgroup_op_tree.rb_node;
1232 while (n) {
1233 cur = rb_entry(n, struct btrfs_qgroup_operation, n);
1234 cmp = comp_oper_exist(cur, oper);
1235 if (cmp < 0) {
1236 n = n->rb_right;
1237 } else if (cmp) {
1238 n = n->rb_left;
1239 } else {
1240 spin_unlock(&fs_info->qgroup_op_lock);
1241 return -EEXIST;
1242 }
1243 }
1244 spin_unlock(&fs_info->qgroup_op_lock);
1245 return 0;
1246}
1247
1204static int comp_oper(struct btrfs_qgroup_operation *oper1, 1248static int comp_oper(struct btrfs_qgroup_operation *oper1,
1205 struct btrfs_qgroup_operation *oper2) 1249 struct btrfs_qgroup_operation *oper2)
1206{ 1250{
@@ -1290,6 +1334,23 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1290 oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); 1334 oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
1291 INIT_LIST_HEAD(&oper->elem.list); 1335 INIT_LIST_HEAD(&oper->elem.list);
1292 oper->elem.seq = 0; 1336 oper->elem.seq = 0;
1337
1338 if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
1339 /*
1340 * If any operation for this bytenr/ref_root combo
1341 * exists, then we know it's not exclusively owned and
1342 * shouldn't be queued up.
1343 *
1344 * This also catches the case where we have a cloned
1345 * extent that gets queued up multiple times during
1346 * drop snapshot.
1347 */
1348 if (qgroup_oper_exists(fs_info, oper)) {
1349 kfree(oper);
1350 return 0;
1351 }
1352 }
1353
1293 ret = insert_qgroup_oper(fs_info, oper); 1354 ret = insert_qgroup_oper(fs_info, oper);
1294 if (ret) { 1355 if (ret) {
1295 /* Shouldn't happen so have an assert for developers */ 1356 /* Shouldn't happen so have an assert for developers */
@@ -1884,6 +1945,111 @@ out:
1884} 1945}
1885 1946
1886/* 1947/*
1948 * Process a reference to a shared subtree. This type of operation is
1949 * queued during snapshot removal when we encounter extents which are
1950 * shared between more than one root.
1951 */
1952static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
1953 struct btrfs_fs_info *fs_info,
1954 struct btrfs_qgroup_operation *oper)
1955{
1956 struct ulist *roots = NULL;
1957 struct ulist_node *unode;
1958 struct ulist_iterator uiter;
1959 struct btrfs_qgroup_list *glist;
1960 struct ulist *parents;
1961 int ret = 0;
1962 int err;
1963 struct btrfs_qgroup *qg;
1964 u64 root_obj = 0;
1965 struct seq_list elem = {};
1966
1967 parents = ulist_alloc(GFP_NOFS);
1968 if (!parents)
1969 return -ENOMEM;
1970
1971 btrfs_get_tree_mod_seq(fs_info, &elem);
1972 ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
1973 elem.seq, &roots);
1974 btrfs_put_tree_mod_seq(fs_info, &elem);
1975 if (ret < 0)
1976 goto out;
1977
1978 if (roots->nnodes != 1)
1979 goto out;
1980
1981 ULIST_ITER_INIT(&uiter);
1982 unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
1983 /*
1984 * If we find our ref root then that means all refs
1985 * this extent has to the root have not yet been
1986 * deleted. In that case, we do nothing and let the
1987 * last ref for this bytenr drive our update.
1988 *
1989 * This can happen for example if an extent is
1990 * referenced multiple times in a snapshot (clone,
1991 * etc). If we are in the middle of snapshot removal,
1992 * queued updates for such an extent will find the
1993 * root if we have not yet finished removing the
1994 * snapshot.
1995 */
1996 if (unode->val == oper->ref_root)
1997 goto out;
1998
1999 root_obj = unode->val;
2000 BUG_ON(!root_obj);
2001
2002 spin_lock(&fs_info->qgroup_lock);
2003 qg = find_qgroup_rb(fs_info, root_obj);
2004 if (!qg)
2005 goto out_unlock;
2006
2007 qg->excl += oper->num_bytes;
2008 qg->excl_cmpr += oper->num_bytes;
2009 qgroup_dirty(fs_info, qg);
2010
2011 /*
2012 * Adjust counts for parent groups. First we find all
2013 * parents, then in the 2nd loop we do the adjustment
2014 * while adding parents of the parents to our ulist.
2015 */
2016 list_for_each_entry(glist, &qg->groups, next_group) {
2017 err = ulist_add(parents, glist->group->qgroupid,
2018 ptr_to_u64(glist->group), GFP_ATOMIC);
2019 if (err < 0) {
2020 ret = err;
2021 goto out_unlock;
2022 }
2023 }
2024
2025 ULIST_ITER_INIT(&uiter);
2026 while ((unode = ulist_next(parents, &uiter))) {
2027 qg = u64_to_ptr(unode->aux);
2028 qg->excl += oper->num_bytes;
2029 qg->excl_cmpr += oper->num_bytes;
2030 qgroup_dirty(fs_info, qg);
2031
2032 /* Add any parents of the parents */
2033 list_for_each_entry(glist, &qg->groups, next_group) {
2034 err = ulist_add(parents, glist->group->qgroupid,
2035 ptr_to_u64(glist->group), GFP_ATOMIC);
2036 if (err < 0) {
2037 ret = err;
2038 goto out_unlock;
2039 }
2040 }
2041 }
2042
2043out_unlock:
2044 spin_unlock(&fs_info->qgroup_lock);
2045
2046out:
2047 ulist_free(roots);
2048 ulist_free(parents);
2049 return ret;
2050}
2051
2052/*
1887 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted 2053 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
1888 * from the fs. First, all roots referencing the extent are searched, and 2054 * from the fs. First, all roots referencing the extent are searched, and
1889 * then the space is accounted accordingly to the different roots. The 2055 * then the space is accounted accordingly to the different roots. The
@@ -1920,6 +2086,9 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
1920 case BTRFS_QGROUP_OPER_SUB_SHARED: 2086 case BTRFS_QGROUP_OPER_SUB_SHARED:
1921 ret = qgroup_shared_accounting(trans, fs_info, oper); 2087 ret = qgroup_shared_accounting(trans, fs_info, oper);
1922 break; 2088 break;
2089 case BTRFS_QGROUP_OPER_SUB_SUBTREE:
2090 ret = qgroup_subtree_accounting(trans, fs_info, oper);
2091 break;
1923 default: 2092 default:
1924 ASSERT(0); 2093 ASSERT(0);
1925 } 2094 }
@@ -2551,6 +2720,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2551 memset(&fs_info->qgroup_rescan_work, 0, 2720 memset(&fs_info->qgroup_rescan_work, 0,
2552 sizeof(fs_info->qgroup_rescan_work)); 2721 sizeof(fs_info->qgroup_rescan_work));
2553 btrfs_init_work(&fs_info->qgroup_rescan_work, 2722 btrfs_init_work(&fs_info->qgroup_rescan_work,
2723 btrfs_qgroup_rescan_helper,
2554 btrfs_qgroup_rescan_worker, NULL, NULL); 2724 btrfs_qgroup_rescan_worker, NULL, NULL);
2555 2725
2556 if (ret) { 2726 if (ret) {
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 5952ff1fbd7a..18cc68ca3090 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -44,6 +44,7 @@ enum btrfs_qgroup_operation_type {
44 BTRFS_QGROUP_OPER_ADD_SHARED, 44 BTRFS_QGROUP_OPER_ADD_SHARED,
45 BTRFS_QGROUP_OPER_SUB_EXCL, 45 BTRFS_QGROUP_OPER_SUB_EXCL,
46 BTRFS_QGROUP_OPER_SUB_SHARED, 46 BTRFS_QGROUP_OPER_SUB_SHARED,
47 BTRFS_QGROUP_OPER_SUB_SUBTREE,
47}; 48};
48 49
49struct btrfs_qgroup_operation { 50struct btrfs_qgroup_operation {
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 4a88f073fdd7..0a6b6e4bcbb9 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1416,7 +1416,8 @@ cleanup:
1416 1416
1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418{ 1418{
1419 btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); 1419 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1420 rmw_work, NULL, NULL);
1420 1421
1421 btrfs_queue_work(rbio->fs_info->rmw_workers, 1422 btrfs_queue_work(rbio->fs_info->rmw_workers,
1422 &rbio->work); 1423 &rbio->work);
@@ -1424,7 +1425,8 @@ static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1424 1425
1425static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1426static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1426{ 1427{
1427 btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); 1428 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1429 read_rebuild_work, NULL, NULL);
1428 1430
1429 btrfs_queue_work(rbio->fs_info->rmw_workers, 1431 btrfs_queue_work(rbio->fs_info->rmw_workers,
1430 &rbio->work); 1432 &rbio->work);
@@ -1665,7 +1667,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1665 plug = container_of(cb, struct btrfs_plug_cb, cb); 1667 plug = container_of(cb, struct btrfs_plug_cb, cb);
1666 1668
1667 if (from_schedule) { 1669 if (from_schedule) {
1668 btrfs_init_work(&plug->work, unplug_work, NULL, NULL); 1670 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1671 unplug_work, NULL, NULL);
1669 btrfs_queue_work(plug->info->rmw_workers, 1672 btrfs_queue_work(plug->info->rmw_workers,
1670 &plug->work); 1673 &plug->work);
1671 return; 1674 return;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 09230cf3a244..20408c6b665a 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -798,7 +798,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
798 /* FIXME we cannot handle this properly right now */ 798 /* FIXME we cannot handle this properly right now */
799 BUG(); 799 BUG();
800 } 800 }
801 btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); 801 btrfs_init_work(&rmw->work, btrfs_readahead_helper,
802 reada_start_machine_worker, NULL, NULL);
802 rmw->fs_info = fs_info; 803 rmw->fs_info = fs_info;
803 804
804 btrfs_queue_work(fs_info->readahead_workers, &rmw->work); 805 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b6d198f5181e..f4a41f37be22 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -428,8 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
428 sbio->index = i; 428 sbio->index = i;
429 sbio->sctx = sctx; 429 sbio->sctx = sctx;
430 sbio->page_count = 0; 430 sbio->page_count = 0;
431 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, 431 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
432 NULL, NULL); 432 scrub_bio_end_io_worker, NULL, NULL);
433 433
434 if (i != SCRUB_BIOS_PER_SCTX - 1) 434 if (i != SCRUB_BIOS_PER_SCTX - 1)
435 sctx->bios[i]->next_free = i + 1; 435 sctx->bios[i]->next_free = i + 1;
@@ -999,8 +999,8 @@ nodatasum_case:
999 fixup_nodatasum->root = fs_info->extent_root; 999 fixup_nodatasum->root = fs_info->extent_root;
1000 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 1000 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1001 scrub_pending_trans_workers_inc(sctx); 1001 scrub_pending_trans_workers_inc(sctx);
1002 btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, 1002 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1003 NULL, NULL); 1003 scrub_fixup_nodatasum, NULL, NULL);
1004 btrfs_queue_work(fs_info->scrub_workers, 1004 btrfs_queue_work(fs_info->scrub_workers,
1005 &fixup_nodatasum->work); 1005 &fixup_nodatasum->work);
1006 goto out; 1006 goto out;
@@ -1616,7 +1616,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
1616 sbio->err = err; 1616 sbio->err = err;
1617 sbio->bio = bio; 1617 sbio->bio = bio;
1618 1618
1619 btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); 1619 btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1620 scrub_wr_bio_end_io_worker, NULL, NULL);
1620 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); 1621 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1621} 1622}
1622 1623
@@ -2904,6 +2905,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2904 struct scrub_ctx *sctx; 2905 struct scrub_ctx *sctx;
2905 int ret; 2906 int ret;
2906 struct btrfs_device *dev; 2907 struct btrfs_device *dev;
2908 struct rcu_string *name;
2907 2909
2908 if (btrfs_fs_closing(fs_info)) 2910 if (btrfs_fs_closing(fs_info))
2909 return -EINVAL; 2911 return -EINVAL;
@@ -2965,6 +2967,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2965 return -ENODEV; 2967 return -ENODEV;
2966 } 2968 }
2967 2969
2970 if (!is_dev_replace && !readonly && !dev->writeable) {
2971 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2972 rcu_read_lock();
2973 name = rcu_dereference(dev->name);
2974 btrfs_err(fs_info, "scrub: device %s is not writable",
2975 name->str);
2976 rcu_read_unlock();
2977 return -EROFS;
2978 }
2979
2968 mutex_lock(&fs_info->scrub_lock); 2980 mutex_lock(&fs_info->scrub_lock);
2969 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { 2981 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2970 mutex_unlock(&fs_info->scrub_lock); 2982 mutex_unlock(&fs_info->scrub_lock);
@@ -3203,7 +3215,8 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3203 nocow_ctx->len = len; 3215 nocow_ctx->len = len;
3204 nocow_ctx->mirror_num = mirror_num; 3216 nocow_ctx->mirror_num = mirror_num;
3205 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 3217 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3206 btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); 3218 btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
3219 copy_nocow_pages_worker, NULL, NULL);
3207 INIT_LIST_HEAD(&nocow_ctx->inodes); 3220 INIT_LIST_HEAD(&nocow_ctx->inodes);
3208 btrfs_queue_work(fs_info->scrub_nocow_workers, 3221 btrfs_queue_work(fs_info->scrub_nocow_workers,
3209 &nocow_ctx->work); 3222 &nocow_ctx->work);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67b48b9a03e0..c4124de4435b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1665,6 +1665,21 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1665 return 0; 1665 return 0;
1666} 1666}
1667 1667
1668/*
1669 * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
1670 *
1671 * If there's a redundant raid level at DATA block groups, use the respective
1672 * multiplier to scale the sizes.
1673 *
1674 * Unused device space usage is based on simulating the chunk allocator
1675 * algorithm that respects the device sizes, order of allocations and the
1676 * 'alloc_start' value, this is a close approximation of the actual use but
1677 * there are other factors that may change the result (like a new metadata
1678 * chunk).
1679 *
1680 * FIXME: not accurate for mixed block groups, total and free/used are ok,
1681 * available appears slightly larger.
1682 */
1668static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1683static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1669{ 1684{
1670 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); 1685 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
@@ -1675,6 +1690,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1675 u64 total_free_data = 0; 1690 u64 total_free_data = 0;
1676 int bits = dentry->d_sb->s_blocksize_bits; 1691 int bits = dentry->d_sb->s_blocksize_bits;
1677 __be32 *fsid = (__be32 *)fs_info->fsid; 1692 __be32 *fsid = (__be32 *)fs_info->fsid;
1693 unsigned factor = 1;
1694 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
1678 int ret; 1695 int ret;
1679 1696
1680 /* holding chunk_mutex to avoid allocating new chunks */ 1697 /* holding chunk_mutex to avoid allocating new chunks */
@@ -1682,30 +1699,52 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1682 rcu_read_lock(); 1699 rcu_read_lock();
1683 list_for_each_entry_rcu(found, head, list) { 1700 list_for_each_entry_rcu(found, head, list) {
1684 if (found->flags & BTRFS_BLOCK_GROUP_DATA) { 1701 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
1702 int i;
1703
1685 total_free_data += found->disk_total - found->disk_used; 1704 total_free_data += found->disk_total - found->disk_used;
1686 total_free_data -= 1705 total_free_data -=
1687 btrfs_account_ro_block_groups_free_space(found); 1706 btrfs_account_ro_block_groups_free_space(found);
1707
1708 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1709 if (!list_empty(&found->block_groups[i])) {
1710 switch (i) {
1711 case BTRFS_RAID_DUP:
1712 case BTRFS_RAID_RAID1:
1713 case BTRFS_RAID_RAID10:
1714 factor = 2;
1715 }
1716 }
1717 }
1688 } 1718 }
1689 1719
1690 total_used += found->disk_used; 1720 total_used += found->disk_used;
1691 } 1721 }
1722
1692 rcu_read_unlock(); 1723 rcu_read_unlock();
1693 1724
1694 buf->f_namelen = BTRFS_NAME_LEN; 1725 buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
1695 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1726 buf->f_blocks >>= bits;
1696 buf->f_bfree = buf->f_blocks - (total_used >> bits); 1727 buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
1697 buf->f_bsize = dentry->d_sb->s_blocksize; 1728
1698 buf->f_type = BTRFS_SUPER_MAGIC; 1729 /* Account global block reserve as used, it's in logical size already */
1730 spin_lock(&block_rsv->lock);
1731 buf->f_bfree -= block_rsv->size >> bits;
1732 spin_unlock(&block_rsv->lock);
1733
1699 buf->f_bavail = total_free_data; 1734 buf->f_bavail = total_free_data;
1700 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); 1735 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1701 if (ret) { 1736 if (ret) {
1702 mutex_unlock(&fs_info->chunk_mutex); 1737 mutex_unlock(&fs_info->chunk_mutex);
1703 return ret; 1738 return ret;
1704 } 1739 }
1705 buf->f_bavail += total_free_data; 1740 buf->f_bavail += div_u64(total_free_data, factor);
1706 buf->f_bavail = buf->f_bavail >> bits; 1741 buf->f_bavail = buf->f_bavail >> bits;
1707 mutex_unlock(&fs_info->chunk_mutex); 1742 mutex_unlock(&fs_info->chunk_mutex);
1708 1743
1744 buf->f_type = BTRFS_SUPER_MAGIC;
1745 buf->f_bsize = dentry->d_sb->s_blocksize;
1746 buf->f_namelen = BTRFS_NAME_LEN;
1747
1709 /* We treat it as constant endianness (it doesn't matter _which_) 1748 /* We treat it as constant endianness (it doesn't matter _which_)
1710 because we want the fsid to come out the same whether mounted 1749 because we want the fsid to come out the same whether mounted
1711 on a big-endian or little-endian host */ 1750 on a big-endian or little-endian host */
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 78699364f537..12e53556e214 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -614,7 +614,7 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
614 if (!fs_info->device_dir_kobj) 614 if (!fs_info->device_dir_kobj)
615 return -EINVAL; 615 return -EINVAL;
616 616
617 if (one_device) { 617 if (one_device && one_device->bdev) {
618 disk = one_device->bdev->bd_part; 618 disk = one_device->bdev->bd_part;
619 disk_kobj = &part_to_dev(disk)->kobj; 619 disk_kobj = &part_to_dev(disk)->kobj;
620 620
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5f379affdf23..d89c6d3542ca 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -218,7 +218,6 @@ loop:
218 spin_lock_init(&cur_trans->delayed_refs.lock); 218 spin_lock_init(&cur_trans->delayed_refs.lock);
219 219
220 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 220 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
221 INIT_LIST_HEAD(&cur_trans->ordered_operations);
222 INIT_LIST_HEAD(&cur_trans->pending_chunks); 221 INIT_LIST_HEAD(&cur_trans->pending_chunks);
223 INIT_LIST_HEAD(&cur_trans->switch_commits); 222 INIT_LIST_HEAD(&cur_trans->switch_commits);
224 list_add_tail(&cur_trans->list, &fs_info->trans_list); 223 list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -1612,27 +1611,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1612 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1611 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1613} 1612}
1614 1613
1615static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1616 struct btrfs_root *root)
1617{
1618 int ret;
1619
1620 ret = btrfs_run_delayed_items(trans, root);
1621 if (ret)
1622 return ret;
1623
1624 /*
1625 * rename don't use btrfs_join_transaction, so, once we
1626 * set the transaction to blocked above, we aren't going
1627 * to get any new ordered operations. We can safely run
1628 * it here and no for sure that nothing new will be added
1629 * to the list
1630 */
1631 ret = btrfs_run_ordered_operations(trans, root, 1);
1632
1633 return ret;
1634}
1635
1636static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1614static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1637{ 1615{
1638 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) 1616 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
@@ -1653,13 +1631,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1653 struct btrfs_transaction *prev_trans = NULL; 1631 struct btrfs_transaction *prev_trans = NULL;
1654 int ret; 1632 int ret;
1655 1633
1656 ret = btrfs_run_ordered_operations(trans, root, 0);
1657 if (ret) {
1658 btrfs_abort_transaction(trans, root, ret);
1659 btrfs_end_transaction(trans, root);
1660 return ret;
1661 }
1662
1663 /* Stop the commit early if ->aborted is set */ 1634 /* Stop the commit early if ->aborted is set */
1664 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 1635 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1665 ret = cur_trans->aborted; 1636 ret = cur_trans->aborted;
@@ -1740,7 +1711,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1740 if (ret) 1711 if (ret)
1741 goto cleanup_transaction; 1712 goto cleanup_transaction;
1742 1713
1743 ret = btrfs_flush_all_pending_stuffs(trans, root); 1714 ret = btrfs_run_delayed_items(trans, root);
1744 if (ret) 1715 if (ret)
1745 goto cleanup_transaction; 1716 goto cleanup_transaction;
1746 1717
@@ -1748,7 +1719,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1748 extwriter_counter_read(cur_trans) == 0); 1719 extwriter_counter_read(cur_trans) == 0);
1749 1720
1750 /* some pending stuffs might be added after the previous flush. */ 1721 /* some pending stuffs might be added after the previous flush. */
1751 ret = btrfs_flush_all_pending_stuffs(trans, root); 1722 ret = btrfs_run_delayed_items(trans, root);
1752 if (ret) 1723 if (ret)
1753 goto cleanup_transaction; 1724 goto cleanup_transaction;
1754 1725
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7dd558ed0716..579be51b27e5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -55,7 +55,6 @@ struct btrfs_transaction {
55 wait_queue_head_t writer_wait; 55 wait_queue_head_t writer_wait;
56 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
57 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
58 struct list_head ordered_operations;
59 struct list_head pending_chunks; 58 struct list_head pending_chunks;
60 struct list_head switch_commits; 59 struct list_head switch_commits;
61 struct btrfs_delayed_ref_root delayed_refs; 60 struct btrfs_delayed_ref_root delayed_refs;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9e1f2cd5e67a..1d1ba083ca6e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,10 @@
94#define LOG_WALK_REPLAY_ALL 3 94#define LOG_WALK_REPLAY_ALL 3
95 95
96static int btrfs_log_inode(struct btrfs_trans_handle *trans, 96static int btrfs_log_inode(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root, struct inode *inode, 97 struct btrfs_root *root, struct inode *inode,
98 int inode_only); 98 int inode_only,
99 const loff_t start,
100 const loff_t end);
99static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 101static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root, 102 struct btrfs_root *root,
101 struct btrfs_path *path, u64 objectid); 103 struct btrfs_path *path, u64 objectid);
@@ -3298,7 +3300,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3298 struct list_head ordered_sums; 3300 struct list_head ordered_sums;
3299 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3301 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3300 bool has_extents = false; 3302 bool has_extents = false;
3301 bool need_find_last_extent = (*last_extent == 0); 3303 bool need_find_last_extent = true;
3302 bool done = false; 3304 bool done = false;
3303 3305
3304 INIT_LIST_HEAD(&ordered_sums); 3306 INIT_LIST_HEAD(&ordered_sums);
@@ -3352,8 +3354,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3352 */ 3354 */
3353 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3355 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
3354 has_extents = true; 3356 has_extents = true;
3355 if (need_find_last_extent && 3357 if (first_key.objectid == (u64)-1)
3356 first_key.objectid == (u64)-1)
3357 first_key = ins_keys[i]; 3358 first_key = ins_keys[i];
3358 } else { 3359 } else {
3359 need_find_last_extent = false; 3360 need_find_last_extent = false;
@@ -3427,6 +3428,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3427 if (!has_extents) 3428 if (!has_extents)
3428 return ret; 3429 return ret;
3429 3430
3431 if (need_find_last_extent && *last_extent == first_key.offset) {
3432 /*
3433 * We don't have any leaves between our current one and the one
3434 * we processed before that can have file extent items for our
3435 * inode (and have a generation number smaller than our current
3436 * transaction id).
3437 */
3438 need_find_last_extent = false;
3439 }
3440
3430 /* 3441 /*
3431 * Because we use btrfs_search_forward we could skip leaves that were 3442 * Because we use btrfs_search_forward we could skip leaves that were
3432 * not modified and then assume *last_extent is valid when it really 3443 * not modified and then assume *last_extent is valid when it really
@@ -3537,7 +3548,7 @@ fill_holes:
3537 0, 0); 3548 0, 0);
3538 if (ret) 3549 if (ret)
3539 break; 3550 break;
3540 *last_extent = offset + len; 3551 *last_extent = extent_end;
3541 } 3552 }
3542 /* 3553 /*
3543 * Need to let the callers know we dropped the path so they should 3554 * Need to let the callers know we dropped the path so they should
@@ -3849,8 +3860,10 @@ process:
3849 * This handles both files and directories. 3860 * This handles both files and directories.
3850 */ 3861 */
3851static int btrfs_log_inode(struct btrfs_trans_handle *trans, 3862static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3852 struct btrfs_root *root, struct inode *inode, 3863 struct btrfs_root *root, struct inode *inode,
3853 int inode_only) 3864 int inode_only,
3865 const loff_t start,
3866 const loff_t end)
3854{ 3867{
3855 struct btrfs_path *path; 3868 struct btrfs_path *path;
3856 struct btrfs_path *dst_path; 3869 struct btrfs_path *dst_path;
@@ -3867,6 +3880,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3867 int ins_nr; 3880 int ins_nr;
3868 bool fast_search = false; 3881 bool fast_search = false;
3869 u64 ino = btrfs_ino(inode); 3882 u64 ino = btrfs_ino(inode);
3883 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3870 3884
3871 path = btrfs_alloc_path(); 3885 path = btrfs_alloc_path();
3872 if (!path) 3886 if (!path)
@@ -4040,13 +4054,35 @@ log_extents:
4040 goto out_unlock; 4054 goto out_unlock;
4041 } 4055 }
4042 } else if (inode_only == LOG_INODE_ALL) { 4056 } else if (inode_only == LOG_INODE_ALL) {
4043 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
4044 struct extent_map *em, *n; 4057 struct extent_map *em, *n;
4045 4058
4046 write_lock(&tree->lock); 4059 write_lock(&em_tree->lock);
4047 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 4060 /*
4048 list_del_init(&em->list); 4061 * We can't just remove every em if we're called for a ranged
4049 write_unlock(&tree->lock); 4062 * fsync - that is, one that doesn't cover the whole possible
4063 * file range (0 to LLONG_MAX). This is because we can have
4064 * em's that fall outside the range we're logging and therefore
4065 * their ordered operations haven't completed yet
4066 * (btrfs_finish_ordered_io() not invoked yet). This means we
4067 * didn't get their respective file extent item in the fs/subvol
4068 * tree yet, and need to let the next fast fsync (one which
4069 * consults the list of modified extent maps) find the em so
4070 * that it logs a matching file extent item and waits for the
4071 * respective ordered operation to complete (if it's still
4072 * running).
4073 *
4074 * Removing every em outside the range we're logging would make
4075 * the next fast fsync not log their matching file extent items,
4076 * therefore making us lose data after a log replay.
4077 */
4078 list_for_each_entry_safe(em, n, &em_tree->modified_extents,
4079 list) {
4080 const u64 mod_end = em->mod_start + em->mod_len - 1;
4081
4082 if (em->mod_start >= start && mod_end <= end)
4083 list_del_init(&em->list);
4084 }
4085 write_unlock(&em_tree->lock);
4050 } 4086 }
4051 4087
4052 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 4088 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
@@ -4056,6 +4092,7 @@ log_extents:
4056 goto out_unlock; 4092 goto out_unlock;
4057 } 4093 }
4058 } 4094 }
4095
4059 BTRFS_I(inode)->logged_trans = trans->transid; 4096 BTRFS_I(inode)->logged_trans = trans->transid;
4060 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 4097 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
4061out_unlock: 4098out_unlock:
@@ -4152,7 +4189,10 @@ out:
4152 */ 4189 */
4153static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 4190static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4154 struct btrfs_root *root, struct inode *inode, 4191 struct btrfs_root *root, struct inode *inode,
4155 struct dentry *parent, int exists_only, 4192 struct dentry *parent,
4193 const loff_t start,
4194 const loff_t end,
4195 int exists_only,
4156 struct btrfs_log_ctx *ctx) 4196 struct btrfs_log_ctx *ctx)
4157{ 4197{
4158 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 4198 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
@@ -4198,7 +4238,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4198 if (ret) 4238 if (ret)
4199 goto end_no_trans; 4239 goto end_no_trans;
4200 4240
4201 ret = btrfs_log_inode(trans, root, inode, inode_only); 4241 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end);
4202 if (ret) 4242 if (ret)
4203 goto end_trans; 4243 goto end_trans;
4204 4244
@@ -4226,7 +4266,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4226 4266
4227 if (BTRFS_I(inode)->generation > 4267 if (BTRFS_I(inode)->generation >
4228 root->fs_info->last_trans_committed) { 4268 root->fs_info->last_trans_committed) {
4229 ret = btrfs_log_inode(trans, root, inode, inode_only); 4269 ret = btrfs_log_inode(trans, root, inode, inode_only,
4270 0, LLONG_MAX);
4230 if (ret) 4271 if (ret)
4231 goto end_trans; 4272 goto end_trans;
4232 } 4273 }
@@ -4260,13 +4301,15 @@ end_no_trans:
4260 */ 4301 */
4261int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 4302int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
4262 struct btrfs_root *root, struct dentry *dentry, 4303 struct btrfs_root *root, struct dentry *dentry,
4304 const loff_t start,
4305 const loff_t end,
4263 struct btrfs_log_ctx *ctx) 4306 struct btrfs_log_ctx *ctx)
4264{ 4307{
4265 struct dentry *parent = dget_parent(dentry); 4308 struct dentry *parent = dget_parent(dentry);
4266 int ret; 4309 int ret;
4267 4310
4268 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 4311 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
4269 0, ctx); 4312 start, end, 0, ctx);
4270 dput(parent); 4313 dput(parent);
4271 4314
4272 return ret; 4315 return ret;
@@ -4503,6 +4546,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
4503 root->fs_info->last_trans_committed)) 4546 root->fs_info->last_trans_committed))
4504 return 0; 4547 return 0;
4505 4548
4506 return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); 4549 return btrfs_log_inode_parent(trans, root, inode, parent, 0,
4550 LLONG_MAX, 1, NULL);
4507} 4551}
4508 4552
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7f5b41bd5373..e2e798ae7cd7 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -59,6 +59,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
59int btrfs_recover_log_trees(struct btrfs_root *tree_root); 59int btrfs_recover_log_trees(struct btrfs_root *tree_root);
60int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 60int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
61 struct btrfs_root *root, struct dentry *dentry, 61 struct btrfs_root *root, struct dentry *dentry,
62 const loff_t start,
63 const loff_t end,
62 struct btrfs_log_ctx *ctx); 64 struct btrfs_log_ctx *ctx);
63int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 65int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
64 struct btrfs_root *root, 66 struct btrfs_root *root,
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 7f78cbf5cf41..4c29db604bbe 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -57,6 +57,21 @@ void ulist_free(struct ulist *ulist);
57int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); 57int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
58int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, 58int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
59 u64 *old_aux, gfp_t gfp_mask); 59 u64 *old_aux, gfp_t gfp_mask);
60
61/* just like ulist_add_merge() but take a pointer for the aux data */
62static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
63 void **old_aux, gfp_t gfp_mask)
64{
65#if BITS_PER_LONG == 32
66 u64 old64 = (uintptr_t)*old_aux;
67 int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask);
68 *old_aux = (void *)((uintptr_t)old64);
69 return ret;
70#else
71 return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask);
72#endif
73}
74
60struct ulist_node *ulist_next(struct ulist *ulist, 75struct ulist_node *ulist_next(struct ulist *ulist,
61 struct ulist_iterator *uiter); 76 struct ulist_iterator *uiter);
62 77
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6cb82f62cb7c..2c2d6d1d8eee 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -508,6 +508,43 @@ static noinline int device_list_add(const char *path,
508 ret = 1; 508 ret = 1;
509 device->fs_devices = fs_devices; 509 device->fs_devices = fs_devices;
510 } else if (!device->name || strcmp(device->name->str, path)) { 510 } else if (!device->name || strcmp(device->name->str, path)) {
511 /*
512 * When FS is already mounted.
513 * 1. If you are here and if the device->name is NULL that
514 * means this device was missing at time of FS mount.
515 * 2. If you are here and if the device->name is different
516 * from 'path' that means either
517 * a. The same device disappeared and reappeared with
518 * different name. or
519 * b. The missing-disk-which-was-replaced, has
520 * reappeared now.
521 *
522 * We must allow 1 and 2a above. But 2b would be a spurious
523 * and unintentional.
524 *
525 * Further in case of 1 and 2a above, the disk at 'path'
526 * would have missed some transaction when it was away and
527 * in case of 2a the stale bdev has to be updated as well.
528 * 2b must not be allowed at all time.
529 */
530
531 /*
532 * For now, we do allow update to btrfs_fs_device through the
533 * btrfs dev scan cli after FS has been mounted. We're still
534 * tracking a problem where systems fail mount by subvolume id
535 * when we reject replacement on a mounted FS.
536 */
537 if (!fs_devices->opened && found_transid < device->generation) {
538 /*
539 * That is if the FS is _not_ mounted and if you
540 * are here, that means there is more than one
541 * disk with same uuid and devid. We keep the one
542 * with larger generation number or the last-in if
543 * generations are equal.
544 */
545 return -EEXIST;
546 }
547
511 name = rcu_string_strdup(path, GFP_NOFS); 548 name = rcu_string_strdup(path, GFP_NOFS);
512 if (!name) 549 if (!name)
513 return -ENOMEM; 550 return -ENOMEM;
@@ -519,6 +556,15 @@ static noinline int device_list_add(const char *path,
519 } 556 }
520 } 557 }
521 558
559 /*
560 * Unmount does not free the btrfs_device struct but would zero
561 * generation along with most of the other members. So just update
562 * it back. We need it to pick the disk with largest generation
563 * (as above).
564 */
565 if (!fs_devices->opened)
566 device->generation = found_transid;
567
522 if (found_transid > fs_devices->latest_trans) { 568 if (found_transid > fs_devices->latest_trans) {
523 fs_devices->latest_devid = devid; 569 fs_devices->latest_devid = devid;
524 fs_devices->latest_trans = found_transid; 570 fs_devices->latest_trans = found_transid;
@@ -1436,7 +1482,7 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
1436 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1482 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1437 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1483 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1438 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1484 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1439 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1485 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1440 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1486 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1441 btrfs_set_device_group(leaf, dev_item, 0); 1487 btrfs_set_device_group(leaf, dev_item, 0);
1442 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1488 btrfs_set_device_seek_speed(leaf, dev_item, 0);
@@ -1671,7 +1717,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1671 device->fs_devices->total_devices--; 1717 device->fs_devices->total_devices--;
1672 1718
1673 if (device->missing) 1719 if (device->missing)
1674 root->fs_info->fs_devices->missing_devices--; 1720 device->fs_devices->missing_devices--;
1675 1721
1676 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1722 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1677 struct btrfs_device, dev_list); 1723 struct btrfs_device, dev_list);
@@ -1801,8 +1847,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1801 if (srcdev->bdev) { 1847 if (srcdev->bdev) {
1802 fs_info->fs_devices->open_devices--; 1848 fs_info->fs_devices->open_devices--;
1803 1849
1804 /* zero out the old super */ 1850 /*
1805 btrfs_scratch_superblock(srcdev); 1851 * zero out the old super only if it is writable
1852 * (e.g. not a seed device)
1853 */
1854 if (srcdev->writeable)
1855 btrfs_scratch_superblock(srcdev);
1806 } 1856 }
1807 1857
1808 call_rcu(&srcdev->rcu, free_device); 1858 call_rcu(&srcdev->rcu, free_device);
@@ -1941,6 +1991,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
1941 fs_devices->seeding = 0; 1991 fs_devices->seeding = 0;
1942 fs_devices->num_devices = 0; 1992 fs_devices->num_devices = 0;
1943 fs_devices->open_devices = 0; 1993 fs_devices->open_devices = 0;
1994 fs_devices->missing_devices = 0;
1995 fs_devices->num_can_discard = 0;
1996 fs_devices->rotating = 0;
1944 fs_devices->seed = seed_devices; 1997 fs_devices->seed = seed_devices;
1945 1998
1946 generate_random_uuid(fs_devices->fsid); 1999 generate_random_uuid(fs_devices->fsid);
@@ -5800,7 +5853,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
5800 else 5853 else
5801 generate_random_uuid(dev->uuid); 5854 generate_random_uuid(dev->uuid);
5802 5855
5803 btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); 5856 btrfs_init_work(&dev->work, btrfs_submit_helper,
5857 pending_bios_fn, NULL, NULL);
5804 5858
5805 return dev; 5859 return dev;
5806} 5860}