about summary refs log tree commit diff stats
path: root/fs/btrfs
diff options
context:
space:
mode:
author Tejun Heo <tj@kernel.org> 2010-01-04 19:17:33 -0500
committer Tejun Heo <tj@kernel.org> 2010-01-04 19:17:33 -0500
commit 32032df6c2f6c9c6b2ada2ce42322231824f70c2 (patch)
tree b1ce838a37044bb38dfc128e2116ca35630e629a /fs/btrfs
parent 22b737f4c75197372d64afc6ed1bccd58c00e549 (diff)
parent c5974b835a909ff15c3b7e6cf6789b5eb919f419 (diff)
Merge branch 'master' into percpu
Conflicts: arch/powerpc/platforms/pseries/hvCall.S include/linux/percpu.h
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- fs/btrfs/acl.c 74
-rw-r--r-- fs/btrfs/async-thread.c 81
-rw-r--r-- fs/btrfs/async-thread.h 10
-rw-r--r-- fs/btrfs/btrfs_inode.h 23
-rw-r--r-- fs/btrfs/ctree.c 229
-rw-r--r-- fs/btrfs/ctree.h 59
-rw-r--r-- fs/btrfs/dir-item.c 19
-rw-r--r-- fs/btrfs/disk-io.c 77
-rw-r--r-- fs/btrfs/extent-tree.c 429
-rw-r--r-- fs/btrfs/extent_io.c 42
-rw-r--r-- fs/btrfs/extent_io.h 18
-rw-r--r-- fs/btrfs/extent_map.c 4
-rw-r--r-- fs/btrfs/file.c 717
-rw-r--r-- fs/btrfs/free-space-cache.c 2
-rw-r--r-- fs/btrfs/inode.c 794
-rw-r--r-- fs/btrfs/ioctl.c 41
-rw-r--r-- fs/btrfs/ordered-data.c 121
-rw-r--r-- fs/btrfs/ordered-data.h 5
-rw-r--r-- fs/btrfs/relocation.c 42
-rw-r--r-- fs/btrfs/root-tree.c 2
-rw-r--r-- fs/btrfs/super.c 24
-rw-r--r-- fs/btrfs/transaction.c 98
-rw-r--r-- fs/btrfs/transaction.h 7
-rw-r--r-- fs/btrfs/tree-log.c 130
-rw-r--r-- fs/btrfs/tree-log.h 3
-rw-r--r-- fs/btrfs/volumes.c 2
-rw-r--r-- fs/btrfs/xattr.c 82
-rw-r--r-- fs/btrfs/xattr.h 9
28 files changed, 1969 insertions, 1175 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 69b355ae7f49..2e9e69987a82 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
27#include "btrfs_inode.h" 27#include "btrfs_inode.h"
28#include "xattr.h" 28#include "xattr.h"
29 29
30#ifdef CONFIG_BTRFS_POSIX_ACL 30#ifdef CONFIG_BTRFS_FS_POSIX_ACL
31 31
32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) 32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
33{ 33{
@@ -73,13 +73,13 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
73 return acl; 73 return acl;
74} 74}
75 75
76static int btrfs_xattr_get_acl(struct inode *inode, int type, 76static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
77 void *value, size_t size) 77 void *value, size_t size, int type)
78{ 78{
79 struct posix_acl *acl; 79 struct posix_acl *acl;
80 int ret = 0; 80 int ret = 0;
81 81
82 acl = btrfs_get_acl(inode, type); 82 acl = btrfs_get_acl(dentry->d_inode, type);
83 83
84 if (IS_ERR(acl)) 84 if (IS_ERR(acl))
85 return PTR_ERR(acl); 85 return PTR_ERR(acl);
@@ -94,7 +94,8 @@ static int btrfs_xattr_get_acl(struct inode *inode, int type,
94/* 94/*
95 * Needs to be called with fs_mutex held 95 * Needs to be called with fs_mutex held
96 */ 96 */
97static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) 97static int btrfs_set_acl(struct btrfs_trans_handle *trans,
98 struct inode *inode, struct posix_acl *acl, int type)
98{ 99{
99 int ret, size = 0; 100 int ret, size = 0;
100 const char *name; 101 const char *name;
@@ -140,8 +141,7 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
140 goto out; 141 goto out;
141 } 142 }
142 143
143 ret = __btrfs_setxattr(inode, name, value, size, 0); 144 ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
144
145out: 145out:
146 kfree(value); 146 kfree(value);
147 147
@@ -151,10 +151,10 @@ out:
151 return ret; 151 return ret;
152} 152}
153 153
154static int btrfs_xattr_set_acl(struct inode *inode, int type, 154static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
155 const void *value, size_t size) 155 const void *value, size_t size, int flags, int type)
156{ 156{
157 int ret = 0; 157 int ret;
158 struct posix_acl *acl = NULL; 158 struct posix_acl *acl = NULL;
159 159
160 if (value) { 160 if (value) {
@@ -167,38 +167,13 @@ static int btrfs_xattr_set_acl(struct inode *inode, int type,
167 } 167 }
168 } 168 }
169 169
170 ret = btrfs_set_acl(inode, acl, type); 170 ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
171 171
172 posix_acl_release(acl); 172 posix_acl_release(acl);
173 173
174 return ret; 174 return ret;
175} 175}
176 176
177
178static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
179 void *value, size_t size)
180{
181 return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
182}
183
184static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
185 const void *value, size_t size, int flags)
186{
187 return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
188}
189
190static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
191 void *value, size_t size)
192{
193 return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
194}
195
196static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
197 const void *value, size_t size, int flags)
198{
199 return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
200}
201
202int btrfs_check_acl(struct inode *inode, int mask) 177int btrfs_check_acl(struct inode *inode, int mask)
203{ 178{
204 struct posix_acl *acl; 179 struct posix_acl *acl;
@@ -221,7 +196,8 @@ int btrfs_check_acl(struct inode *inode, int mask)
221 * stuff has been fixed to work with that. If the locking stuff changes, we 196 * stuff has been fixed to work with that. If the locking stuff changes, we
222 * need to re-evaluate the acl locking stuff. 197 * need to re-evaluate the acl locking stuff.
223 */ 198 */
224int btrfs_init_acl(struct inode *inode, struct inode *dir) 199int btrfs_init_acl(struct btrfs_trans_handle *trans,
200 struct inode *inode, struct inode *dir)
225{ 201{
226 struct posix_acl *acl = NULL; 202 struct posix_acl *acl = NULL;
227 int ret = 0; 203 int ret = 0;
@@ -246,7 +222,8 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
246 mode_t mode; 222 mode_t mode;
247 223
248 if (S_ISDIR(inode->i_mode)) { 224 if (S_ISDIR(inode->i_mode)) {
249 ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT); 225 ret = btrfs_set_acl(trans, inode, acl,
226 ACL_TYPE_DEFAULT);
250 if (ret) 227 if (ret)
251 goto failed; 228 goto failed;
252 } 229 }
@@ -261,7 +238,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
261 inode->i_mode = mode; 238 inode->i_mode = mode;
262 if (ret > 0) { 239 if (ret > 0) {
263 /* we need an acl */ 240 /* we need an acl */
264 ret = btrfs_set_acl(inode, clone, 241 ret = btrfs_set_acl(trans, inode, clone,
265 ACL_TYPE_ACCESS); 242 ACL_TYPE_ACCESS);
266 } 243 }
267 } 244 }
@@ -294,7 +271,7 @@ int btrfs_acl_chmod(struct inode *inode)
294 271
295 ret = posix_acl_chmod_masq(clone, inode->i_mode); 272 ret = posix_acl_chmod_masq(clone, inode->i_mode);
296 if (!ret) 273 if (!ret)
297 ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS); 274 ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS);
298 275
299 posix_acl_release(clone); 276 posix_acl_release(clone);
300 277
@@ -303,26 +280,29 @@ int btrfs_acl_chmod(struct inode *inode)
303 280
304struct xattr_handler btrfs_xattr_acl_default_handler = { 281struct xattr_handler btrfs_xattr_acl_default_handler = {
305 .prefix = POSIX_ACL_XATTR_DEFAULT, 282 .prefix = POSIX_ACL_XATTR_DEFAULT,
306 .get = btrfs_xattr_acl_default_get, 283 .flags = ACL_TYPE_DEFAULT,
307 .set = btrfs_xattr_acl_default_set, 284 .get = btrfs_xattr_acl_get,
285 .set = btrfs_xattr_acl_set,
308}; 286};
309 287
310struct xattr_handler btrfs_xattr_acl_access_handler = { 288struct xattr_handler btrfs_xattr_acl_access_handler = {
311 .prefix = POSIX_ACL_XATTR_ACCESS, 289 .prefix = POSIX_ACL_XATTR_ACCESS,
312 .get = btrfs_xattr_acl_access_get, 290 .flags = ACL_TYPE_ACCESS,
313 .set = btrfs_xattr_acl_access_set, 291 .get = btrfs_xattr_acl_get,
292 .set = btrfs_xattr_acl_set,
314}; 293};
315 294
316#else /* CONFIG_BTRFS_POSIX_ACL */ 295#else /* CONFIG_BTRFS_FS_POSIX_ACL */
317 296
318int btrfs_acl_chmod(struct inode *inode) 297int btrfs_acl_chmod(struct inode *inode)
319{ 298{
320 return 0; 299 return 0;
321} 300}
322 301
323int btrfs_init_acl(struct inode *inode, struct inode *dir) 302int btrfs_init_acl(struct btrfs_trans_handle *trans,
303 struct inode *inode, struct inode *dir)
324{ 304{
325 return 0; 305 return 0;
326} 306}
327 307
328#endif /* CONFIG_BTRFS_POSIX_ACL */ 308#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 282ca085c2fb..c0861e781cdb 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,51 @@ struct btrfs_worker_thread {
64}; 64};
65 65
66/* 66/*
67 * btrfs_start_workers uses kthread_run, which can block waiting for memory
68 * for a very long time. It will actually throttle on page writeback,
69 * and so it may not make progress until after our btrfs worker threads
70 * process all of the pending work structs in their queue
71 *
72 * This means we can't use btrfs_start_workers from inside a btrfs worker
73 * thread that is used as part of cleaning dirty memory, which pretty much
74 * involves all of the worker threads.
75 *
76 * Instead we have a helper queue that never has more than one thread
77 * where we schedule thread start operations. This worker_start struct
78 * is used to contain the work and hold a pointer to the queue that needs
79 * another worker.
80 */
81struct worker_start {
82 struct btrfs_work work;
83 struct btrfs_workers *queue;
84};
85
86static void start_new_worker_func(struct btrfs_work *work)
87{
88 struct worker_start *start;
89 start = container_of(work, struct worker_start, work);
90 btrfs_start_workers(start->queue, 1);
91 kfree(start);
92}
93
94static int start_new_worker(struct btrfs_workers *queue)
95{
96 struct worker_start *start;
97 int ret;
98
99 start = kzalloc(sizeof(*start), GFP_NOFS);
100 if (!start)
101 return -ENOMEM;
102
103 start->work.func = start_new_worker_func;
104 start->queue = queue;
105 ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
106 if (ret)
107 kfree(start);
108 return ret;
109}
110
111/*
67 * helper function to move a thread onto the idle list after it 112 * helper function to move a thread onto the idle list after it
68 * has finished some requests. 113 * has finished some requests.
69 */ 114 */
@@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
118 goto out; 163 goto out;
119 164
120 workers->atomic_start_pending = 0; 165 workers->atomic_start_pending = 0;
121 if (workers->num_workers >= workers->max_workers) 166 if (workers->num_workers + workers->num_workers_starting >=
167 workers->max_workers)
122 goto out; 168 goto out;
123 169
170 workers->num_workers_starting += 1;
124 spin_unlock_irqrestore(&workers->lock, flags); 171 spin_unlock_irqrestore(&workers->lock, flags);
125 btrfs_start_workers(workers, 1); 172 start_new_worker(workers);
126 return; 173 return;
127 174
128out: 175out:
@@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
390/* 437/*
391 * simple init on struct btrfs_workers 438 * simple init on struct btrfs_workers
392 */ 439 */
393void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) 440void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
441 struct btrfs_workers *async_helper)
394{ 442{
395 workers->num_workers = 0; 443 workers->num_workers = 0;
444 workers->num_workers_starting = 0;
396 INIT_LIST_HEAD(&workers->worker_list); 445 INIT_LIST_HEAD(&workers->worker_list);
397 INIT_LIST_HEAD(&workers->idle_list); 446 INIT_LIST_HEAD(&workers->idle_list);
398 INIT_LIST_HEAD(&workers->order_list); 447 INIT_LIST_HEAD(&workers->order_list);
@@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
404 workers->name = name; 453 workers->name = name;
405 workers->ordered = 0; 454 workers->ordered = 0;
406 workers->atomic_start_pending = 0; 455 workers->atomic_start_pending = 0;
407 workers->atomic_worker_start = 0; 456 workers->atomic_worker_start = async_helper;
408} 457}
409 458
410/* 459/*
411 * starts new worker threads. This does not enforce the max worker 460 * starts new worker threads. This does not enforce the max worker
412 * count in case you need to temporarily go past it. 461 * count in case you need to temporarily go past it.
413 */ 462 */
414int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) 463static int __btrfs_start_workers(struct btrfs_workers *workers,
464 int num_workers)
415{ 465{
416 struct btrfs_worker_thread *worker; 466 struct btrfs_worker_thread *worker;
417 int ret = 0; 467 int ret = 0;
@@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
444 list_add_tail(&worker->worker_list, &workers->idle_list); 494 list_add_tail(&worker->worker_list, &workers->idle_list);
445 worker->idle = 1; 495 worker->idle = 1;
446 workers->num_workers++; 496 workers->num_workers++;
497 workers->num_workers_starting--;
498 WARN_ON(workers->num_workers_starting < 0);
447 spin_unlock_irq(&workers->lock); 499 spin_unlock_irq(&workers->lock);
448 } 500 }
449 return 0; 501 return 0;
@@ -452,6 +504,14 @@ fail:
452 return ret; 504 return ret;
453} 505}
454 506
507int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
508{
509 spin_lock_irq(&workers->lock);
510 workers->num_workers_starting += num_workers;
511 spin_unlock_irq(&workers->lock);
512 return __btrfs_start_workers(workers, num_workers);
513}
514
455/* 515/*
456 * run through the list and find a worker thread that doesn't have a lot 516 * run through the list and find a worker thread that doesn't have a lot
457 * to do right now. This can return null if we aren't yet at the thread 517 * to do right now. This can return null if we aren't yet at the thread
@@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
461{ 521{
462 struct btrfs_worker_thread *worker; 522 struct btrfs_worker_thread *worker;
463 struct list_head *next; 523 struct list_head *next;
464 int enforce_min = workers->num_workers < workers->max_workers; 524 int enforce_min;
525
526 enforce_min = (workers->num_workers + workers->num_workers_starting) <
527 workers->max_workers;
465 528
466 /* 529 /*
467 * if we find an idle thread, don't move it to the end of the 530 * if we find an idle thread, don't move it to the end of the
@@ -509,15 +572,17 @@ again:
509 worker = next_worker(workers); 572 worker = next_worker(workers);
510 573
511 if (!worker) { 574 if (!worker) {
512 if (workers->num_workers >= workers->max_workers) { 575 if (workers->num_workers + workers->num_workers_starting >=
576 workers->max_workers) {
513 goto fallback; 577 goto fallback;
514 } else if (workers->atomic_worker_start) { 578 } else if (workers->atomic_worker_start) {
515 workers->atomic_start_pending = 1; 579 workers->atomic_start_pending = 1;
516 goto fallback; 580 goto fallback;
517 } else { 581 } else {
582 workers->num_workers_starting++;
518 spin_unlock_irqrestore(&workers->lock, flags); 583 spin_unlock_irqrestore(&workers->lock, flags);
519 /* we're below the limit, start another worker */ 584 /* we're below the limit, start another worker */
520 btrfs_start_workers(workers, 1); 585 __btrfs_start_workers(workers, 1);
521 goto again; 586 goto again;
522 } 587 }
523 } 588 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index fc089b95ec14..5077746cf85e 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ struct btrfs_workers {
64 /* current number of running workers */ 64 /* current number of running workers */
65 int num_workers; 65 int num_workers;
66 66
67 int num_workers_starting;
68
67 /* max number of workers allowed. changed by btrfs_start_workers */ 69 /* max number of workers allowed. changed by btrfs_start_workers */
68 int max_workers; 70 int max_workers;
69 71
@@ -78,9 +80,10 @@ struct btrfs_workers {
78 80
79 /* 81 /*
80 * are we allowed to sleep while starting workers or are we required 82 * are we allowed to sleep while starting workers or are we required
81 * to start them at a later time? 83 * to start them at a later time? If we can't sleep, this indicates
84 * which queue we need to use to schedule thread creation.
82 */ 85 */
83 int atomic_worker_start; 86 struct btrfs_workers *atomic_worker_start;
84 87
85 /* list with all the work threads. The workers on the idle thread 88 /* list with all the work threads. The workers on the idle thread
86 * may be actively servicing jobs, but they haven't yet hit the 89 * may be actively servicing jobs, but they haven't yet hit the
@@ -109,7 +112,8 @@ struct btrfs_workers {
109int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); 112int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
110int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); 113int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
111int btrfs_stop_workers(struct btrfs_workers *workers); 114int btrfs_stop_workers(struct btrfs_workers *workers);
112void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); 115void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
116 struct btrfs_workers *async_starter);
113int btrfs_requeue_work(struct btrfs_work *work); 117int btrfs_requeue_work(struct btrfs_work *work);
114void btrfs_set_work_high_prio(struct btrfs_work *work); 118void btrfs_set_work_high_prio(struct btrfs_work *work);
115#endif 119#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a54d354cefcb..3f1f50d9d916 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,9 +44,6 @@ struct btrfs_inode {
44 */ 44 */
45 struct extent_io_tree io_failure_tree; 45 struct extent_io_tree io_failure_tree;
46 46
47 /* held while inserting or deleting extents from files */
48 struct mutex extent_mutex;
49
50 /* held while logging the inode in tree-log.c */ 47 /* held while logging the inode in tree-log.c */
51 struct mutex log_mutex; 48 struct mutex log_mutex;
52 49
@@ -86,6 +83,12 @@ struct btrfs_inode {
86 * transid of the trans_handle that last modified this inode 83 * transid of the trans_handle that last modified this inode
87 */ 84 */
88 u64 last_trans; 85 u64 last_trans;
86
87 /*
88 * log transid when this inode was last modified
89 */
90 u64 last_sub_trans;
91
89 /* 92 /*
90 * transid that last logged this inode 93 * transid that last logged this inode
91 */ 94 */
@@ -128,12 +131,14 @@ struct btrfs_inode {
128 u64 last_unlink_trans; 131 u64 last_unlink_trans;
129 132
130 /* 133 /*
131 * These two counters are for delalloc metadata reservations. We keep 134 * Counters to keep track of the number of extent items we may use due
132 * track of how many extents we've accounted for vs how many extents we 135 * to delalloc and such. outstanding_extents is the number of extent
133 * have. 136 * items we think we'll end up using, and reserved_extents is the number
137 * of extent items we've reserved metadata for.
134 */ 138 */
135 int delalloc_reserved_extents; 139 spinlock_t accounting_lock;
136 int delalloc_extents; 140 int reserved_extents;
141 int outstanding_extents;
137 142
138 /* 143 /*
139 * ordered_data_close is set by truncate when a file that used 144 * ordered_data_close is set by truncate when a file that used
@@ -158,7 +163,7 @@ static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
158 163
159static inline void btrfs_i_size_write(struct inode *inode, u64 size) 164static inline void btrfs_i_size_write(struct inode *inode, u64 size)
160{ 165{
161 inode->i_size = size; 166 i_size_write(inode, size);
162 BTRFS_I(inode)->disk_i_size = size; 167 BTRFS_I(inode)->disk_i_size = size;
163} 168}
164 169
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ec96f3a6d536..c4bc570a396e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -37,6 +37,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
37 struct extent_buffer *src_buf); 37 struct extent_buffer *src_buf);
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot); 39 struct btrfs_path *path, int level, int slot);
40static int setup_items_for_insert(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, struct btrfs_path *path,
42 struct btrfs_key *cpu_key, u32 *data_size,
43 u32 total_data, u32 total_size, int nr);
44
40 45
41struct btrfs_path *btrfs_alloc_path(void) 46struct btrfs_path *btrfs_alloc_path(void)
42{ 47{
@@ -451,9 +456,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
451 extent_buffer_get(cow); 456 extent_buffer_get(cow);
452 spin_unlock(&root->node_lock); 457 spin_unlock(&root->node_lock);
453 458
454 btrfs_free_extent(trans, root, buf->start, buf->len, 459 btrfs_free_tree_block(trans, root, buf->start, buf->len,
455 parent_start, root->root_key.objectid, 460 parent_start, root->root_key.objectid, level);
456 level, 0);
457 free_extent_buffer(buf); 461 free_extent_buffer(buf);
458 add_root_to_dirty_list(root); 462 add_root_to_dirty_list(root);
459 } else { 463 } else {
@@ -468,9 +472,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
468 btrfs_set_node_ptr_generation(parent, parent_slot, 472 btrfs_set_node_ptr_generation(parent, parent_slot,
469 trans->transid); 473 trans->transid);
470 btrfs_mark_buffer_dirty(parent); 474 btrfs_mark_buffer_dirty(parent);
471 btrfs_free_extent(trans, root, buf->start, buf->len, 475 btrfs_free_tree_block(trans, root, buf->start, buf->len,
472 parent_start, root->root_key.objectid, 476 parent_start, root->root_key.objectid, level);
473 level, 0);
474 } 477 }
475 if (unlock_orig) 478 if (unlock_orig)
476 btrfs_tree_unlock(buf); 479 btrfs_tree_unlock(buf);
@@ -1030,8 +1033,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1030 btrfs_tree_unlock(mid); 1033 btrfs_tree_unlock(mid);
1031 /* once for the path */ 1034 /* once for the path */
1032 free_extent_buffer(mid); 1035 free_extent_buffer(mid);
1033 ret = btrfs_free_extent(trans, root, mid->start, mid->len, 1036 ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
1034 0, root->root_key.objectid, level, 1); 1037 0, root->root_key.objectid, level);
1035 /* once for the root ptr */ 1038 /* once for the root ptr */
1036 free_extent_buffer(mid); 1039 free_extent_buffer(mid);
1037 return ret; 1040 return ret;
@@ -1095,10 +1098,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1095 1); 1098 1);
1096 if (wret) 1099 if (wret)
1097 ret = wret; 1100 ret = wret;
1098 wret = btrfs_free_extent(trans, root, bytenr, 1101 wret = btrfs_free_tree_block(trans, root,
1099 blocksize, 0, 1102 bytenr, blocksize, 0,
1100 root->root_key.objectid, 1103 root->root_key.objectid,
1101 level, 0); 1104 level);
1102 if (wret) 1105 if (wret)
1103 ret = wret; 1106 ret = wret;
1104 } else { 1107 } else {
@@ -1143,9 +1146,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1143 wret = del_ptr(trans, root, path, level + 1, pslot); 1146 wret = del_ptr(trans, root, path, level + 1, pslot);
1144 if (wret) 1147 if (wret)
1145 ret = wret; 1148 ret = wret;
1146 wret = btrfs_free_extent(trans, root, bytenr, blocksize, 1149 wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
1147 0, root->root_key.objectid, 1150 0, root->root_key.objectid, level);
1148 level, 0);
1149 if (wret) 1151 if (wret)
1150 ret = wret; 1152 ret = wret;
1151 } else { 1153 } else {
@@ -2997,75 +2999,85 @@ again:
2997 return ret; 2999 return ret;
2998} 3000}
2999 3001
3000/* 3002static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3001 * This function splits a single item into two items, 3003 struct btrfs_root *root,
3002 * giving 'new_key' to the new item and splitting the 3004 struct btrfs_path *path, int ins_len)
3003 * old one at split_offset (from the start of the item).
3004 *
3005 * The path may be released by this operation. After
3006 * the split, the path is pointing to the old item. The
3007 * new item is going to be in the same node as the old one.
3008 *
3009 * Note, the item being split must be small enough to live alone on
3010 * a tree block with room for one extra struct btrfs_item
3011 *
3012 * This allows us to split the item in place, keeping a lock on the
3013 * leaf the entire time.
3014 */
3015int btrfs_split_item(struct btrfs_trans_handle *trans,
3016 struct btrfs_root *root,
3017 struct btrfs_path *path,
3018 struct btrfs_key *new_key,
3019 unsigned long split_offset)
3020{ 3005{
3021 u32 item_size; 3006 struct btrfs_key key;
3022 struct extent_buffer *leaf; 3007 struct extent_buffer *leaf;
3023 struct btrfs_key orig_key; 3008 struct btrfs_file_extent_item *fi;
3024 struct btrfs_item *item; 3009 u64 extent_len = 0;
3025 struct btrfs_item *new_item; 3010 u32 item_size;
3026 int ret = 0; 3011 int ret;
3027 int slot;
3028 u32 nritems;
3029 u32 orig_offset;
3030 struct btrfs_disk_key disk_key;
3031 char *buf;
3032 3012
3033 leaf = path->nodes[0]; 3013 leaf = path->nodes[0];
3034 btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]); 3014 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3035 if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item)) 3015
3036 goto split; 3016 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3017 key.type != BTRFS_EXTENT_CSUM_KEY);
3018
3019 if (btrfs_leaf_free_space(root, leaf) >= ins_len)
3020 return 0;
3037 3021
3038 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3022 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3023 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3024 fi = btrfs_item_ptr(leaf, path->slots[0],
3025 struct btrfs_file_extent_item);
3026 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3027 }
3039 btrfs_release_path(root, path); 3028 btrfs_release_path(root, path);
3040 3029
3041 path->search_for_split = 1;
3042 path->keep_locks = 1; 3030 path->keep_locks = 1;
3043 3031 path->search_for_split = 1;
3044 ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1); 3032 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3045 path->search_for_split = 0; 3033 path->search_for_split = 0;
3034 if (ret < 0)
3035 goto err;
3046 3036
3037 ret = -EAGAIN;
3038 leaf = path->nodes[0];
3047 /* if our item isn't there or got smaller, return now */ 3039 /* if our item isn't there or got smaller, return now */
3048 if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0], 3040 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3049 path->slots[0])) { 3041 goto err;
3050 path->keep_locks = 0; 3042
3051 return -EAGAIN; 3043 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3044 fi = btrfs_item_ptr(leaf, path->slots[0],
3045 struct btrfs_file_extent_item);
3046 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3047 goto err;
3052 } 3048 }
3053 3049
3054 btrfs_set_path_blocking(path); 3050 btrfs_set_path_blocking(path);
3055 ret = split_leaf(trans, root, &orig_key, path, 3051 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3056 sizeof(struct btrfs_item), 1);
3057 path->keep_locks = 0;
3058 BUG_ON(ret); 3052 BUG_ON(ret);
3059 3053
3054 path->keep_locks = 0;
3060 btrfs_unlock_up_safe(path, 1); 3055 btrfs_unlock_up_safe(path, 1);
3056 return 0;
3057err:
3058 path->keep_locks = 0;
3059 return ret;
3060}
3061
3062static noinline int split_item(struct btrfs_trans_handle *trans,
3063 struct btrfs_root *root,
3064 struct btrfs_path *path,
3065 struct btrfs_key *new_key,
3066 unsigned long split_offset)
3067{
3068 struct extent_buffer *leaf;
3069 struct btrfs_item *item;
3070 struct btrfs_item *new_item;
3071 int slot;
3072 char *buf;
3073 u32 nritems;
3074 u32 item_size;
3075 u32 orig_offset;
3076 struct btrfs_disk_key disk_key;
3077
3061 leaf = path->nodes[0]; 3078 leaf = path->nodes[0];
3062 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); 3079 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
3063 3080
3064split:
3065 /*
3066 * make sure any changes to the path from split_leaf leave it
3067 * in a blocking state
3068 */
3069 btrfs_set_path_blocking(path); 3081 btrfs_set_path_blocking(path);
3070 3082
3071 item = btrfs_item_nr(leaf, path->slots[0]); 3083 item = btrfs_item_nr(leaf, path->slots[0]);
@@ -3073,19 +3085,19 @@ split:
3073 item_size = btrfs_item_size(leaf, item); 3085 item_size = btrfs_item_size(leaf, item);
3074 3086
3075 buf = kmalloc(item_size, GFP_NOFS); 3087 buf = kmalloc(item_size, GFP_NOFS);
3088 if (!buf)
3089 return -ENOMEM;
3090
3076 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, 3091 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3077 path->slots[0]), item_size); 3092 path->slots[0]), item_size);
3078 slot = path->slots[0] + 1;
3079 leaf = path->nodes[0];
3080 3093
3094 slot = path->slots[0] + 1;
3081 nritems = btrfs_header_nritems(leaf); 3095 nritems = btrfs_header_nritems(leaf);
3082
3083 if (slot != nritems) { 3096 if (slot != nritems) {
3084 /* shift the items */ 3097 /* shift the items */
3085 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), 3098 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
3086 btrfs_item_nr_offset(slot), 3099 btrfs_item_nr_offset(slot),
3087 (nritems - slot) * sizeof(struct btrfs_item)); 3100 (nritems - slot) * sizeof(struct btrfs_item));
3088
3089 } 3101 }
3090 3102
3091 btrfs_cpu_key_to_disk(&disk_key, new_key); 3103 btrfs_cpu_key_to_disk(&disk_key, new_key);
@@ -3113,16 +3125,81 @@ split:
3113 item_size - split_offset); 3125 item_size - split_offset);
3114 btrfs_mark_buffer_dirty(leaf); 3126 btrfs_mark_buffer_dirty(leaf);
3115 3127
3116 ret = 0; 3128 BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
3117 if (btrfs_leaf_free_space(root, leaf) < 0) {
3118 btrfs_print_leaf(root, leaf);
3119 BUG();
3120 }
3121 kfree(buf); 3129 kfree(buf);
3130 return 0;
3131}
3132
3133/*
3134 * This function splits a single item into two items,
3135 * giving 'new_key' to the new item and splitting the
3136 * old one at split_offset (from the start of the item).
3137 *
3138 * The path may be released by this operation. After
3139 * the split, the path is pointing to the old item. The
3140 * new item is going to be in the same node as the old one.
3141 *
3142 * Note, the item being split must be small enough to live alone on
3143 * a tree block with room for one extra struct btrfs_item
3144 *
3145 * This allows us to split the item in place, keeping a lock on the
3146 * leaf the entire time.
3147 */
3148int btrfs_split_item(struct btrfs_trans_handle *trans,
3149 struct btrfs_root *root,
3150 struct btrfs_path *path,
3151 struct btrfs_key *new_key,
3152 unsigned long split_offset)
3153{
3154 int ret;
3155 ret = setup_leaf_for_split(trans, root, path,
3156 sizeof(struct btrfs_item));
3157 if (ret)
3158 return ret;
3159
3160 ret = split_item(trans, root, path, new_key, split_offset);
3122 return ret; 3161 return ret;
3123} 3162}
3124 3163
3125/* 3164/*
3165 * This function duplicates an item, giving 'new_key' to the new item.
3166 * It guarantees both items live in the same tree leaf and the new item
3167 * is contiguous with the original item.
3168 *
3169 * This allows us to split file extent in place, keeping a lock on the
3170 * leaf the entire time.
3171 */
3172int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3173 struct btrfs_root *root,
3174 struct btrfs_path *path,
3175 struct btrfs_key *new_key)
3176{
3177 struct extent_buffer *leaf;
3178 int ret;
3179 u32 item_size;
3180
3181 leaf = path->nodes[0];
3182 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3183 ret = setup_leaf_for_split(trans, root, path,
3184 item_size + sizeof(struct btrfs_item));
3185 if (ret)
3186 return ret;
3187
3188 path->slots[0]++;
3189 ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
3190 item_size, item_size +
3191 sizeof(struct btrfs_item), 1);
3192 BUG_ON(ret);
3193
3194 leaf = path->nodes[0];
3195 memcpy_extent_buffer(leaf,
3196 btrfs_item_ptr_offset(leaf, path->slots[0]),
3197 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3198 item_size);
3199 return 0;
3200}
3201
3202/*
3126 * make the item pointed to by the path smaller. new_size indicates 3203 * make the item pointed to by the path smaller. new_size indicates
3127 * how small to make it, and from_end tells us if we just chop bytes 3204 * how small to make it, and from_end tells us if we just chop bytes
3128 * off the end of the item or if we shift the item to chop bytes off 3205 * off the end of the item or if we shift the item to chop bytes off
@@ -3714,8 +3791,8 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3714 */ 3791 */
3715 btrfs_unlock_up_safe(path, 0); 3792 btrfs_unlock_up_safe(path, 0);
3716 3793
3717 ret = btrfs_free_extent(trans, root, leaf->start, leaf->len, 3794 ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
3718 0, root->root_key.objectid, 0, 0); 3795 0, root->root_key.objectid, 0);
3719 return ret; 3796 return ret;
3720} 3797}
3721/* 3798/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dd8ced9814c4..9f806dd04c27 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -310,6 +310,9 @@ struct btrfs_header {
310#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ 310#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
311 sizeof(struct btrfs_item) - \ 311 sizeof(struct btrfs_item) - \
312 sizeof(struct btrfs_file_extent_item)) 312 sizeof(struct btrfs_file_extent_item))
313#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
314 sizeof(struct btrfs_item) -\
315 sizeof(struct btrfs_dir_item))
313 316
314 317
315/* 318/*
@@ -691,14 +694,17 @@ struct btrfs_space_info {
691 694
692 struct list_head list; 695 struct list_head list;
693 696
697 /* for controlling how we free up space for allocations */
698 wait_queue_head_t allocate_wait;
699 wait_queue_head_t flush_wait;
700 int allocating_chunk;
701 int flushing;
702
694 /* for block groups in our same type */ 703 /* for block groups in our same type */
695 struct list_head block_groups; 704 struct list_head block_groups;
696 spinlock_t lock; 705 spinlock_t lock;
697 struct rw_semaphore groups_sem; 706 struct rw_semaphore groups_sem;
698 atomic_t caching_threads; 707 atomic_t caching_threads;
699
700 int allocating_chunk;
701 wait_queue_head_t wait;
702}; 708};
703 709
704/* 710/*
@@ -856,8 +862,9 @@ struct btrfs_fs_info {
856 struct mutex ordered_operations_mutex; 862 struct mutex ordered_operations_mutex;
857 struct rw_semaphore extent_commit_sem; 863 struct rw_semaphore extent_commit_sem;
858 864
859 struct rw_semaphore subvol_sem; 865 struct rw_semaphore cleanup_work_sem;
860 866
867 struct rw_semaphore subvol_sem;
861 struct srcu_struct subvol_srcu; 868 struct srcu_struct subvol_srcu;
862 869
863 struct list_head trans_list; 870 struct list_head trans_list;
@@ -865,6 +872,9 @@ struct btrfs_fs_info {
865 struct list_head dead_roots; 872 struct list_head dead_roots;
866 struct list_head caching_block_groups; 873 struct list_head caching_block_groups;
867 874
875 spinlock_t delayed_iput_lock;
876 struct list_head delayed_iputs;
877
868 atomic_t nr_async_submits; 878 atomic_t nr_async_submits;
869 atomic_t async_submit_draining; 879 atomic_t async_submit_draining;
870 atomic_t nr_async_bios; 880 atomic_t nr_async_bios;
@@ -907,6 +917,7 @@ struct btrfs_fs_info {
907 * A third pool does submit_bio to avoid deadlocking with the other 917 * A third pool does submit_bio to avoid deadlocking with the other
908 * two 918 * two
909 */ 919 */
920 struct btrfs_workers generic_worker;
910 struct btrfs_workers workers; 921 struct btrfs_workers workers;
911 struct btrfs_workers delalloc_workers; 922 struct btrfs_workers delalloc_workers;
912 struct btrfs_workers endio_workers; 923 struct btrfs_workers endio_workers;
@@ -914,6 +925,7 @@ struct btrfs_fs_info {
914 struct btrfs_workers endio_meta_write_workers; 925 struct btrfs_workers endio_meta_write_workers;
915 struct btrfs_workers endio_write_workers; 926 struct btrfs_workers endio_write_workers;
916 struct btrfs_workers submit_workers; 927 struct btrfs_workers submit_workers;
928 struct btrfs_workers enospc_workers;
917 /* 929 /*
918 * fixup workers take dirty pages that didn't properly go through 930 * fixup workers take dirty pages that didn't properly go through
919 * the cow mechanism and make them safe to write. It happens 931 * the cow mechanism and make them safe to write. It happens
@@ -1004,7 +1016,10 @@ struct btrfs_root {
1004 atomic_t log_writers; 1016 atomic_t log_writers;
1005 atomic_t log_commit[2]; 1017 atomic_t log_commit[2];
1006 unsigned long log_transid; 1018 unsigned long log_transid;
1019 unsigned long last_log_commit;
1007 unsigned long log_batch; 1020 unsigned long log_batch;
1021 pid_t log_start_pid;
1022 bool log_multiple_pids;
1008 1023
1009 u64 objectid; 1024 u64 objectid;
1010 u64 last_trans; 1025 u64 last_trans;
@@ -1026,12 +1041,12 @@ struct btrfs_root {
1026 int ref_cows; 1041 int ref_cows;
1027 int track_dirty; 1042 int track_dirty;
1028 int in_radix; 1043 int in_radix;
1044 int clean_orphans;
1029 1045
1030 u64 defrag_trans_start; 1046 u64 defrag_trans_start;
1031 struct btrfs_key defrag_progress; 1047 struct btrfs_key defrag_progress;
1032 struct btrfs_key defrag_max; 1048 struct btrfs_key defrag_max;
1033 int defrag_running; 1049 int defrag_running;
1034 int defrag_level;
1035 char *name; 1050 char *name;
1036 int in_sysfs; 1051 int in_sysfs;
1037 1052
@@ -1145,6 +1160,7 @@ struct btrfs_root {
1145#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) 1160#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
1146#define BTRFS_MOUNT_SSD_SPREAD (1 << 8) 1161#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
1147#define BTRFS_MOUNT_NOSSD (1 << 9) 1162#define BTRFS_MOUNT_NOSSD (1 << 9)
1163#define BTRFS_MOUNT_DISCARD (1 << 10)
1148 1164
1149#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1165#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1150#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1166#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1966,6 +1982,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1966 u64 parent, u64 root_objectid, 1982 u64 parent, u64 root_objectid,
1967 struct btrfs_disk_key *key, int level, 1983 struct btrfs_disk_key *key, int level,
1968 u64 hint, u64 empty_size); 1984 u64 hint, u64 empty_size);
1985int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
1986 struct btrfs_root *root,
1987 u64 bytenr, u32 blocksize,
1988 u64 parent, u64 root_objectid, int level);
1969struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 1989struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1970 struct btrfs_root *root, 1990 struct btrfs_root *root,
1971 u64 bytenr, u32 blocksize, 1991 u64 bytenr, u32 blocksize,
@@ -2080,6 +2100,10 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
2080 struct btrfs_path *path, 2100 struct btrfs_path *path,
2081 struct btrfs_key *new_key, 2101 struct btrfs_key *new_key,
2082 unsigned long split_offset); 2102 unsigned long split_offset);
2103int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
2104 struct btrfs_root *root,
2105 struct btrfs_path *path,
2106 struct btrfs_key *new_key);
2083int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root 2107int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2084 *root, struct btrfs_key *key, struct btrfs_path *p, int 2108 *root, struct btrfs_key *key, struct btrfs_path *p, int
2085 ins_len, int cow); 2109 ins_len, int cow);
@@ -2187,9 +2211,10 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
2187 struct btrfs_path *path, 2211 struct btrfs_path *path,
2188 struct btrfs_dir_item *di); 2212 struct btrfs_dir_item *di);
2189int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, 2213int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
2190 struct btrfs_root *root, const char *name, 2214 struct btrfs_root *root,
2191 u16 name_len, const void *data, u16 data_len, 2215 struct btrfs_path *path, u64 objectid,
2192 u64 dir); 2216 const char *name, u16 name_len,
2217 const void *data, u16 data_len);
2193struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, 2218struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
2194 struct btrfs_root *root, 2219 struct btrfs_root *root,
2195 struct btrfs_path *path, u64 dir, 2220 struct btrfs_path *path, u64 dir,
@@ -2283,7 +2308,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2283 struct inode *inode, u64 new_size, 2308 struct inode *inode, u64 new_size,
2284 u32 min_type); 2309 u32 min_type);
2285 2310
2286int btrfs_start_delalloc_inodes(struct btrfs_root *root); 2311int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2287int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); 2312int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
2288int btrfs_writepages(struct address_space *mapping, 2313int btrfs_writepages(struct address_space *mapping,
2289 struct writeback_control *wbc); 2314 struct writeback_control *wbc);
@@ -2323,7 +2348,9 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2323void btrfs_orphan_cleanup(struct btrfs_root *root); 2348void btrfs_orphan_cleanup(struct btrfs_root *root);
2324int btrfs_cont_expand(struct inode *inode, loff_t size); 2349int btrfs_cont_expand(struct inode *inode, loff_t size);
2325int btrfs_invalidate_inodes(struct btrfs_root *root); 2350int btrfs_invalidate_inodes(struct btrfs_root *root);
2326extern struct dentry_operations btrfs_dentry_operations; 2351void btrfs_add_delayed_iput(struct inode *inode);
2352void btrfs_run_delayed_iputs(struct btrfs_root *root);
2353extern const struct dentry_operations btrfs_dentry_operations;
2327 2354
2328/* ioctl.c */ 2355/* ioctl.c */
2329long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 2356long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -2336,12 +2363,9 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2336 int skip_pinned); 2363 int skip_pinned);
2337int btrfs_check_file(struct btrfs_root *root, struct inode *inode); 2364int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2338extern const struct file_operations btrfs_file_operations; 2365extern const struct file_operations btrfs_file_operations;
2339int btrfs_drop_extents(struct btrfs_trans_handle *trans, 2366int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
2340 struct btrfs_root *root, struct inode *inode, 2367 u64 start, u64 end, u64 *hint_byte, int drop_cache);
2341 u64 start, u64 end, u64 locked_end,
2342 u64 inline_limit, u64 *hint_block, int drop_cache);
2343int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 2368int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2344 struct btrfs_root *root,
2345 struct inode *inode, u64 start, u64 end); 2369 struct inode *inode, u64 start, u64 end);
2346int btrfs_release_file(struct inode *inode, struct file *file); 2370int btrfs_release_file(struct inode *inode, struct file *file);
2347 2371
@@ -2366,12 +2390,13 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
2366int btrfs_sync_fs(struct super_block *sb, int wait); 2390int btrfs_sync_fs(struct super_block *sb, int wait);
2367 2391
2368/* acl.c */ 2392/* acl.c */
2369#ifdef CONFIG_BTRFS_POSIX_ACL 2393#ifdef CONFIG_BTRFS_FS_POSIX_ACL
2370int btrfs_check_acl(struct inode *inode, int mask); 2394int btrfs_check_acl(struct inode *inode, int mask);
2371#else 2395#else
2372#define btrfs_check_acl NULL 2396#define btrfs_check_acl NULL
2373#endif 2397#endif
2374int btrfs_init_acl(struct inode *inode, struct inode *dir); 2398int btrfs_init_acl(struct btrfs_trans_handle *trans,
2399 struct inode *inode, struct inode *dir);
2375int btrfs_acl_chmod(struct inode *inode); 2400int btrfs_acl_chmod(struct inode *inode);
2376 2401
2377/* relocation.c */ 2402/* relocation.c */
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f3a6075519cc..e9103b3baa49 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -68,12 +68,12 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
68 * into the tree 68 * into the tree
69 */ 69 */
70int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, 70int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
71 struct btrfs_root *root, const char *name, 71 struct btrfs_root *root,
72 u16 name_len, const void *data, u16 data_len, 72 struct btrfs_path *path, u64 objectid,
73 u64 dir) 73 const char *name, u16 name_len,
74 const void *data, u16 data_len)
74{ 75{
75 int ret = 0; 76 int ret = 0;
76 struct btrfs_path *path;
77 struct btrfs_dir_item *dir_item; 77 struct btrfs_dir_item *dir_item;
78 unsigned long name_ptr, data_ptr; 78 unsigned long name_ptr, data_ptr;
79 struct btrfs_key key, location; 79 struct btrfs_key key, location;
@@ -81,15 +81,11 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
81 struct extent_buffer *leaf; 81 struct extent_buffer *leaf;
82 u32 data_size; 82 u32 data_size;
83 83
84 key.objectid = dir; 84 BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
85
86 key.objectid = objectid;
85 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 87 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
86 key.offset = btrfs_name_hash(name, name_len); 88 key.offset = btrfs_name_hash(name, name_len);
87 path = btrfs_alloc_path();
88 if (!path)
89 return -ENOMEM;
90 if (name_len + data_len + sizeof(struct btrfs_dir_item) >
91 BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
92 return -ENOSPC;
93 89
94 data_size = sizeof(*dir_item) + name_len + data_len; 90 data_size = sizeof(*dir_item) + name_len + data_len;
95 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 91 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
@@ -117,7 +113,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
117 write_extent_buffer(leaf, data, data_ptr, data_len); 113 write_extent_buffer(leaf, data, data_ptr, data_len);
118 btrfs_mark_buffer_dirty(path->nodes[0]); 114 btrfs_mark_buffer_dirty(path->nodes[0]);
119 115
120 btrfs_free_path(path);
121 return ret; 116 return ret;
122} 117}
123 118
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index af0435f79fa6..009e3bd18f23 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -892,6 +892,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
892 root->stripesize = stripesize; 892 root->stripesize = stripesize;
893 root->ref_cows = 0; 893 root->ref_cows = 0;
894 root->track_dirty = 0; 894 root->track_dirty = 0;
895 root->in_radix = 0;
896 root->clean_orphans = 0;
895 897
896 root->fs_info = fs_info; 898 root->fs_info = fs_info;
897 root->objectid = objectid; 899 root->objectid = objectid;
@@ -917,6 +919,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
917 atomic_set(&root->log_writers, 0); 919 atomic_set(&root->log_writers, 0);
918 root->log_batch = 0; 920 root->log_batch = 0;
919 root->log_transid = 0; 921 root->log_transid = 0;
922 root->last_log_commit = 0;
920 extent_io_tree_init(&root->dirty_log_pages, 923 extent_io_tree_init(&root->dirty_log_pages,
921 fs_info->btree_inode->i_mapping, GFP_NOFS); 924 fs_info->btree_inode->i_mapping, GFP_NOFS);
922 925
@@ -927,7 +930,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
927 root->defrag_trans_start = fs_info->generation; 930 root->defrag_trans_start = fs_info->generation;
928 init_completion(&root->kobj_unregister); 931 init_completion(&root->kobj_unregister);
929 root->defrag_running = 0; 932 root->defrag_running = 0;
930 root->defrag_level = 0;
931 root->root_key.objectid = objectid; 933 root->root_key.objectid = objectid;
932 root->anon_super.s_root = NULL; 934 root->anon_super.s_root = NULL;
933 root->anon_super.s_dev = 0; 935 root->anon_super.s_dev = 0;
@@ -979,12 +981,12 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
979 981
980 while (1) { 982 while (1) {
981 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, 983 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
982 0, &start, &end, EXTENT_DIRTY); 984 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
983 if (ret) 985 if (ret)
984 break; 986 break;
985 987
986 clear_extent_dirty(&log_root_tree->dirty_log_pages, 988 clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
987 start, end, GFP_NOFS); 989 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
988 } 990 }
989 eb = fs_info->log_root_tree->node; 991 eb = fs_info->log_root_tree->node;
990 992
@@ -1087,6 +1089,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1087 WARN_ON(root->log_root); 1089 WARN_ON(root->log_root);
1088 root->log_root = log_root; 1090 root->log_root = log_root;
1089 root->log_transid = 0; 1091 root->log_transid = 0;
1092 root->last_log_commit = 0;
1090 return 0; 1093 return 0;
1091} 1094}
1092 1095
@@ -1208,8 +1211,10 @@ again:
1208 ret = radix_tree_insert(&fs_info->fs_roots_radix, 1211 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1209 (unsigned long)root->root_key.objectid, 1212 (unsigned long)root->root_key.objectid,
1210 root); 1213 root);
1211 if (ret == 0) 1214 if (ret == 0) {
1212 root->in_radix = 1; 1215 root->in_radix = 1;
1216 root->clean_orphans = 1;
1217 }
1213 spin_unlock(&fs_info->fs_roots_radix_lock); 1218 spin_unlock(&fs_info->fs_roots_radix_lock);
1214 radix_tree_preload_end(); 1219 radix_tree_preload_end();
1215 if (ret) { 1220 if (ret) {
@@ -1223,10 +1228,6 @@ again:
1223 ret = btrfs_find_dead_roots(fs_info->tree_root, 1228 ret = btrfs_find_dead_roots(fs_info->tree_root,
1224 root->root_key.objectid); 1229 root->root_key.objectid);
1225 WARN_ON(ret); 1230 WARN_ON(ret);
1226
1227 if (!(fs_info->sb->s_flags & MS_RDONLY))
1228 btrfs_orphan_cleanup(root);
1229
1230 return root; 1231 return root;
1231fail: 1232fail:
1232 free_fs_root(root); 1233 free_fs_root(root);
@@ -1475,6 +1476,7 @@ static int cleaner_kthread(void *arg)
1475 1476
1476 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1477 if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
1477 mutex_trylock(&root->fs_info->cleaner_mutex)) { 1478 mutex_trylock(&root->fs_info->cleaner_mutex)) {
1479 btrfs_run_delayed_iputs(root);
1478 btrfs_clean_old_snapshots(root); 1480 btrfs_clean_old_snapshots(root);
1479 mutex_unlock(&root->fs_info->cleaner_mutex); 1481 mutex_unlock(&root->fs_info->cleaner_mutex);
1480 } 1482 }
@@ -1604,6 +1606,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1604 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 1606 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1605 INIT_LIST_HEAD(&fs_info->trans_list); 1607 INIT_LIST_HEAD(&fs_info->trans_list);
1606 INIT_LIST_HEAD(&fs_info->dead_roots); 1608 INIT_LIST_HEAD(&fs_info->dead_roots);
1609 INIT_LIST_HEAD(&fs_info->delayed_iputs);
1607 INIT_LIST_HEAD(&fs_info->hashers); 1610 INIT_LIST_HEAD(&fs_info->hashers);
1608 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 1611 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1609 INIT_LIST_HEAD(&fs_info->ordered_operations); 1612 INIT_LIST_HEAD(&fs_info->ordered_operations);
@@ -1612,6 +1615,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1612 spin_lock_init(&fs_info->new_trans_lock); 1615 spin_lock_init(&fs_info->new_trans_lock);
1613 spin_lock_init(&fs_info->ref_cache_lock); 1616 spin_lock_init(&fs_info->ref_cache_lock);
1614 spin_lock_init(&fs_info->fs_roots_radix_lock); 1617 spin_lock_init(&fs_info->fs_roots_radix_lock);
1618 spin_lock_init(&fs_info->delayed_iput_lock);
1615 1619
1616 init_completion(&fs_info->kobj_unregister); 1620 init_completion(&fs_info->kobj_unregister);
1617 fs_info->tree_root = tree_root; 1621 fs_info->tree_root = tree_root;
@@ -1687,6 +1691,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1687 mutex_init(&fs_info->cleaner_mutex); 1691 mutex_init(&fs_info->cleaner_mutex);
1688 mutex_init(&fs_info->volume_mutex); 1692 mutex_init(&fs_info->volume_mutex);
1689 init_rwsem(&fs_info->extent_commit_sem); 1693 init_rwsem(&fs_info->extent_commit_sem);
1694 init_rwsem(&fs_info->cleanup_work_sem);
1690 init_rwsem(&fs_info->subvol_sem); 1695 init_rwsem(&fs_info->subvol_sem);
1691 1696
1692 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 1697 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@ -1746,21 +1751,25 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1746 err = -EINVAL; 1751 err = -EINVAL;
1747 goto fail_iput; 1752 goto fail_iput;
1748 } 1753 }
1749printk("thread pool is %d\n", fs_info->thread_pool_size); 1754
1750 /* 1755 btrfs_init_workers(&fs_info->generic_worker,
1751 * we need to start all the end_io workers up front because the 1756 "genwork", 1, NULL);
1752 * queue work function gets called at interrupt time, and so it 1757
1753 * cannot dynamically grow.
1754 */
1755 btrfs_init_workers(&fs_info->workers, "worker", 1758 btrfs_init_workers(&fs_info->workers, "worker",
1756 fs_info->thread_pool_size); 1759 fs_info->thread_pool_size,
1760 &fs_info->generic_worker);
1757 1761
1758 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", 1762 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1759 fs_info->thread_pool_size); 1763 fs_info->thread_pool_size,
1764 &fs_info->generic_worker);
1760 1765
1761 btrfs_init_workers(&fs_info->submit_workers, "submit", 1766 btrfs_init_workers(&fs_info->submit_workers, "submit",
1762 min_t(u64, fs_devices->num_devices, 1767 min_t(u64, fs_devices->num_devices,
1763 fs_info->thread_pool_size)); 1768 fs_info->thread_pool_size),
1769 &fs_info->generic_worker);
1770 btrfs_init_workers(&fs_info->enospc_workers, "enospc",
1771 fs_info->thread_pool_size,
1772 &fs_info->generic_worker);
1764 1773
1765 /* a higher idle thresh on the submit workers makes it much more 1774 /* a higher idle thresh on the submit workers makes it much more
1766 * likely that bios will be send down in a sane order to the 1775 * likely that bios will be send down in a sane order to the
@@ -1774,15 +1783,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
1774 fs_info->delalloc_workers.idle_thresh = 2; 1783 fs_info->delalloc_workers.idle_thresh = 2;
1775 fs_info->delalloc_workers.ordered = 1; 1784 fs_info->delalloc_workers.ordered = 1;
1776 1785
1777 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); 1786 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
1787 &fs_info->generic_worker);
1778 btrfs_init_workers(&fs_info->endio_workers, "endio", 1788 btrfs_init_workers(&fs_info->endio_workers, "endio",
1779 fs_info->thread_pool_size); 1789 fs_info->thread_pool_size,
1790 &fs_info->generic_worker);
1780 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", 1791 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
1781 fs_info->thread_pool_size); 1792 fs_info->thread_pool_size,
1793 &fs_info->generic_worker);
1782 btrfs_init_workers(&fs_info->endio_meta_write_workers, 1794 btrfs_init_workers(&fs_info->endio_meta_write_workers,
1783 "endio-meta-write", fs_info->thread_pool_size); 1795 "endio-meta-write", fs_info->thread_pool_size,
1796 &fs_info->generic_worker);
1784 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 1797 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1785 fs_info->thread_pool_size); 1798 fs_info->thread_pool_size,
1799 &fs_info->generic_worker);
1786 1800
1787 /* 1801 /*
1788 * endios are largely parallel and should have a very 1802 * endios are largely parallel and should have a very
@@ -1794,12 +1808,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
1794 fs_info->endio_write_workers.idle_thresh = 2; 1808 fs_info->endio_write_workers.idle_thresh = 2;
1795 fs_info->endio_meta_write_workers.idle_thresh = 2; 1809 fs_info->endio_meta_write_workers.idle_thresh = 2;
1796 1810
1797 fs_info->endio_workers.atomic_worker_start = 1;
1798 fs_info->endio_meta_workers.atomic_worker_start = 1;
1799 fs_info->endio_write_workers.atomic_worker_start = 1;
1800 fs_info->endio_meta_write_workers.atomic_worker_start = 1;
1801
1802 btrfs_start_workers(&fs_info->workers, 1); 1811 btrfs_start_workers(&fs_info->workers, 1);
1812 btrfs_start_workers(&fs_info->generic_worker, 1);
1803 btrfs_start_workers(&fs_info->submit_workers, 1); 1813 btrfs_start_workers(&fs_info->submit_workers, 1);
1804 btrfs_start_workers(&fs_info->delalloc_workers, 1); 1814 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1805 btrfs_start_workers(&fs_info->fixup_workers, 1); 1815 btrfs_start_workers(&fs_info->fixup_workers, 1);
@@ -1807,6 +1817,7 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
1807 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 1817 btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1808 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 1818 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1809 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1819 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1820 btrfs_start_workers(&fs_info->enospc_workers, 1);
1810 1821
1811 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1822 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1812 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1823 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2012,6 +2023,7 @@ fail_chunk_root:
2012 free_extent_buffer(chunk_root->node); 2023 free_extent_buffer(chunk_root->node);
2013 free_extent_buffer(chunk_root->commit_root); 2024 free_extent_buffer(chunk_root->commit_root);
2014fail_sb_buffer: 2025fail_sb_buffer:
2026 btrfs_stop_workers(&fs_info->generic_worker);
2015 btrfs_stop_workers(&fs_info->fixup_workers); 2027 btrfs_stop_workers(&fs_info->fixup_workers);
2016 btrfs_stop_workers(&fs_info->delalloc_workers); 2028 btrfs_stop_workers(&fs_info->delalloc_workers);
2017 btrfs_stop_workers(&fs_info->workers); 2029 btrfs_stop_workers(&fs_info->workers);
@@ -2020,6 +2032,7 @@ fail_sb_buffer:
2020 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2032 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2021 btrfs_stop_workers(&fs_info->endio_write_workers); 2033 btrfs_stop_workers(&fs_info->endio_write_workers);
2022 btrfs_stop_workers(&fs_info->submit_workers); 2034 btrfs_stop_workers(&fs_info->submit_workers);
2035 btrfs_stop_workers(&fs_info->enospc_workers);
2023fail_iput: 2036fail_iput:
2024 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2037 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2025 iput(fs_info->btree_inode); 2038 iput(fs_info->btree_inode);
@@ -2376,8 +2389,14 @@ int btrfs_commit_super(struct btrfs_root *root)
2376 int ret; 2389 int ret;
2377 2390
2378 mutex_lock(&root->fs_info->cleaner_mutex); 2391 mutex_lock(&root->fs_info->cleaner_mutex);
2392 btrfs_run_delayed_iputs(root);
2379 btrfs_clean_old_snapshots(root); 2393 btrfs_clean_old_snapshots(root);
2380 mutex_unlock(&root->fs_info->cleaner_mutex); 2394 mutex_unlock(&root->fs_info->cleaner_mutex);
2395
2396 /* wait until ongoing cleanup work done */
2397 down_write(&root->fs_info->cleanup_work_sem);
2398 up_write(&root->fs_info->cleanup_work_sem);
2399
2381 trans = btrfs_start_transaction(root, 1); 2400 trans = btrfs_start_transaction(root, 1);
2382 ret = btrfs_commit_transaction(trans, root); 2401 ret = btrfs_commit_transaction(trans, root);
2383 BUG_ON(ret); 2402 BUG_ON(ret);
@@ -2437,6 +2456,7 @@ int close_ctree(struct btrfs_root *root)
2437 2456
2438 iput(fs_info->btree_inode); 2457 iput(fs_info->btree_inode);
2439 2458
2459 btrfs_stop_workers(&fs_info->generic_worker);
2440 btrfs_stop_workers(&fs_info->fixup_workers); 2460 btrfs_stop_workers(&fs_info->fixup_workers);
2441 btrfs_stop_workers(&fs_info->delalloc_workers); 2461 btrfs_stop_workers(&fs_info->delalloc_workers);
2442 btrfs_stop_workers(&fs_info->workers); 2462 btrfs_stop_workers(&fs_info->workers);
@@ -2445,6 +2465,7 @@ int close_ctree(struct btrfs_root *root)
2445 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2465 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2446 btrfs_stop_workers(&fs_info->endio_write_workers); 2466 btrfs_stop_workers(&fs_info->endio_write_workers);
2447 btrfs_stop_workers(&fs_info->submit_workers); 2467 btrfs_stop_workers(&fs_info->submit_workers);
2468 btrfs_stop_workers(&fs_info->enospc_workers);
2448 2469
2449 btrfs_close_devices(fs_info->fs_devices); 2470 btrfs_close_devices(fs_info->fs_devices);
2450 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2471 btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 359a754c782c..56e50137d0e6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -195,6 +195,14 @@ static int exclude_super_stripes(struct btrfs_root *root,
195 int stripe_len; 195 int stripe_len;
196 int i, nr, ret; 196 int i, nr, ret;
197 197
198 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
199 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
200 cache->bytes_super += stripe_len;
201 ret = add_excluded_extent(root, cache->key.objectid,
202 stripe_len);
203 BUG_ON(ret);
204 }
205
198 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 206 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
199 bytenr = btrfs_sb_offset(i); 207 bytenr = btrfs_sb_offset(i);
200 ret = btrfs_rmap_block(&root->fs_info->mapping_tree, 208 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
@@ -255,7 +263,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
255 if (ret) 263 if (ret)
256 break; 264 break;
257 265
258 if (extent_start == start) { 266 if (extent_start <= start) {
259 start = extent_end + 1; 267 start = extent_end + 1;
260 } else if (extent_start > start && extent_start < end) { 268 } else if (extent_start > start && extent_start < end) {
261 size = extent_start - start; 269 size = extent_start - start;
@@ -1568,23 +1576,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1568 return ret; 1576 return ret;
1569} 1577}
1570 1578
1571#ifdef BIO_RW_DISCARD
1572static void btrfs_issue_discard(struct block_device *bdev, 1579static void btrfs_issue_discard(struct block_device *bdev,
1573 u64 start, u64 len) 1580 u64 start, u64 len)
1574{ 1581{
1575 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1582 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1576 DISCARD_FL_BARRIER); 1583 DISCARD_FL_BARRIER);
1577} 1584}
1578#endif
1579 1585
1580static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1586static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1581 u64 num_bytes) 1587 u64 num_bytes)
1582{ 1588{
1583#ifdef BIO_RW_DISCARD
1584 int ret; 1589 int ret;
1585 u64 map_length = num_bytes; 1590 u64 map_length = num_bytes;
1586 struct btrfs_multi_bio *multi = NULL; 1591 struct btrfs_multi_bio *multi = NULL;
1587 1592
1593 if (!btrfs_test_opt(root, DISCARD))
1594 return 0;
1595
1588 /* Tell the block device(s) that the sectors can be discarded */ 1596 /* Tell the block device(s) that the sectors can be discarded */
1589 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, 1597 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1590 bytenr, &map_length, &multi, 0); 1598 bytenr, &map_length, &multi, 0);
@@ -1604,9 +1612,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1604 } 1612 }
1605 1613
1606 return ret; 1614 return ret;
1607#else
1608 return 0;
1609#endif
1610} 1615}
1611 1616
1612int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1617int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -2824,14 +2829,17 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2824 num_items); 2829 num_items);
2825 2830
2826 spin_lock(&meta_sinfo->lock); 2831 spin_lock(&meta_sinfo->lock);
2827 if (BTRFS_I(inode)->delalloc_reserved_extents <= 2832 spin_lock(&BTRFS_I(inode)->accounting_lock);
2828 BTRFS_I(inode)->delalloc_extents) { 2833 if (BTRFS_I(inode)->reserved_extents <=
2834 BTRFS_I(inode)->outstanding_extents) {
2835 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2829 spin_unlock(&meta_sinfo->lock); 2836 spin_unlock(&meta_sinfo->lock);
2830 return 0; 2837 return 0;
2831 } 2838 }
2839 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2832 2840
2833 BTRFS_I(inode)->delalloc_reserved_extents--; 2841 BTRFS_I(inode)->reserved_extents--;
2834 BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0); 2842 BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2835 2843
2836 if (meta_sinfo->bytes_delalloc < num_bytes) { 2844 if (meta_sinfo->bytes_delalloc < num_bytes) {
2837 bug = true; 2845 bug = true;
@@ -2864,6 +2872,107 @@ static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2864 meta_sinfo->force_delalloc = 0; 2872 meta_sinfo->force_delalloc = 0;
2865} 2873}
2866 2874
2875struct async_flush {
2876 struct btrfs_root *root;
2877 struct btrfs_space_info *info;
2878 struct btrfs_work work;
2879};
2880
2881static noinline void flush_delalloc_async(struct btrfs_work *work)
2882{
2883 struct async_flush *async;
2884 struct btrfs_root *root;
2885 struct btrfs_space_info *info;
2886
2887 async = container_of(work, struct async_flush, work);
2888 root = async->root;
2889 info = async->info;
2890
2891 btrfs_start_delalloc_inodes(root, 0);
2892 wake_up(&info->flush_wait);
2893 btrfs_wait_ordered_extents(root, 0, 0);
2894
2895 spin_lock(&info->lock);
2896 info->flushing = 0;
2897 spin_unlock(&info->lock);
2898 wake_up(&info->flush_wait);
2899
2900 kfree(async);
2901}
2902
2903static void wait_on_flush(struct btrfs_space_info *info)
2904{
2905 DEFINE_WAIT(wait);
2906 u64 used;
2907
2908 while (1) {
2909 prepare_to_wait(&info->flush_wait, &wait,
2910 TASK_UNINTERRUPTIBLE);
2911 spin_lock(&info->lock);
2912 if (!info->flushing) {
2913 spin_unlock(&info->lock);
2914 break;
2915 }
2916
2917 used = info->bytes_used + info->bytes_reserved +
2918 info->bytes_pinned + info->bytes_readonly +
2919 info->bytes_super + info->bytes_root +
2920 info->bytes_may_use + info->bytes_delalloc;
2921 if (used < info->total_bytes) {
2922 spin_unlock(&info->lock);
2923 break;
2924 }
2925 spin_unlock(&info->lock);
2926 schedule();
2927 }
2928 finish_wait(&info->flush_wait, &wait);
2929}
2930
2931static void flush_delalloc(struct btrfs_root *root,
2932 struct btrfs_space_info *info)
2933{
2934 struct async_flush *async;
2935 bool wait = false;
2936
2937 spin_lock(&info->lock);
2938
2939 if (!info->flushing) {
2940 info->flushing = 1;
2941 init_waitqueue_head(&info->flush_wait);
2942 } else {
2943 wait = true;
2944 }
2945
2946 spin_unlock(&info->lock);
2947
2948 if (wait) {
2949 wait_on_flush(info);
2950 return;
2951 }
2952
2953 async = kzalloc(sizeof(*async), GFP_NOFS);
2954 if (!async)
2955 goto flush;
2956
2957 async->root = root;
2958 async->info = info;
2959 async->work.func = flush_delalloc_async;
2960
2961 btrfs_queue_worker(&root->fs_info->enospc_workers,
2962 &async->work);
2963 wait_on_flush(info);
2964 return;
2965
2966flush:
2967 btrfs_start_delalloc_inodes(root, 0);
2968 btrfs_wait_ordered_extents(root, 0, 0);
2969
2970 spin_lock(&info->lock);
2971 info->flushing = 0;
2972 spin_unlock(&info->lock);
2973 wake_up(&info->flush_wait);
2974}
2975
2867static int maybe_allocate_chunk(struct btrfs_root *root, 2976static int maybe_allocate_chunk(struct btrfs_root *root,
2868 struct btrfs_space_info *info) 2977 struct btrfs_space_info *info)
2869{ 2978{
@@ -2876,10 +2985,10 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
2876 2985
2877 free_space = btrfs_super_total_bytes(disk_super); 2986 free_space = btrfs_super_total_bytes(disk_super);
2878 /* 2987 /*
2879 * we allow the metadata to grow to a max of either 5gb or 5% of the 2988 * we allow the metadata to grow to a max of either 10gb or 5% of the
2880 * space in the volume. 2989 * space in the volume.
2881 */ 2990 */
2882 min_metadata = min((u64)5 * 1024 * 1024 * 1024, 2991 min_metadata = min((u64)10 * 1024 * 1024 * 1024,
2883 div64_u64(free_space * 5, 100)); 2992 div64_u64(free_space * 5, 100));
2884 if (info->total_bytes >= min_metadata) { 2993 if (info->total_bytes >= min_metadata) {
2885 spin_unlock(&info->lock); 2994 spin_unlock(&info->lock);
@@ -2894,7 +3003,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
2894 if (!info->allocating_chunk) { 3003 if (!info->allocating_chunk) {
2895 info->force_alloc = 1; 3004 info->force_alloc = 1;
2896 info->allocating_chunk = 1; 3005 info->allocating_chunk = 1;
2897 init_waitqueue_head(&info->wait); 3006 init_waitqueue_head(&info->allocate_wait);
2898 } else { 3007 } else {
2899 wait = true; 3008 wait = true;
2900 } 3009 }
@@ -2902,7 +3011,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
2902 spin_unlock(&info->lock); 3011 spin_unlock(&info->lock);
2903 3012
2904 if (wait) { 3013 if (wait) {
2905 wait_event(info->wait, 3014 wait_event(info->allocate_wait,
2906 !info->allocating_chunk); 3015 !info->allocating_chunk);
2907 return 1; 3016 return 1;
2908 } 3017 }
@@ -2923,7 +3032,7 @@ out:
2923 spin_lock(&info->lock); 3032 spin_lock(&info->lock);
2924 info->allocating_chunk = 0; 3033 info->allocating_chunk = 0;
2925 spin_unlock(&info->lock); 3034 spin_unlock(&info->lock);
2926 wake_up(&info->wait); 3035 wake_up(&info->allocate_wait);
2927 3036
2928 if (ret) 3037 if (ret)
2929 return 0; 3038 return 0;
@@ -2981,21 +3090,20 @@ again:
2981 filemap_flush(inode->i_mapping); 3090 filemap_flush(inode->i_mapping);
2982 goto again; 3091 goto again;
2983 } else if (flushed == 3) { 3092 } else if (flushed == 3) {
2984 btrfs_start_delalloc_inodes(root); 3093 flush_delalloc(root, meta_sinfo);
2985 btrfs_wait_ordered_extents(root, 0);
2986 goto again; 3094 goto again;
2987 } 3095 }
2988 spin_lock(&meta_sinfo->lock); 3096 spin_lock(&meta_sinfo->lock);
2989 meta_sinfo->bytes_delalloc -= num_bytes; 3097 meta_sinfo->bytes_delalloc -= num_bytes;
2990 spin_unlock(&meta_sinfo->lock); 3098 spin_unlock(&meta_sinfo->lock);
2991 printk(KERN_ERR "enospc, has %d, reserved %d\n", 3099 printk(KERN_ERR "enospc, has %d, reserved %d\n",
2992 BTRFS_I(inode)->delalloc_extents, 3100 BTRFS_I(inode)->outstanding_extents,
2993 BTRFS_I(inode)->delalloc_reserved_extents); 3101 BTRFS_I(inode)->reserved_extents);
2994 dump_space_info(meta_sinfo, 0, 0); 3102 dump_space_info(meta_sinfo, 0, 0);
2995 return -ENOSPC; 3103 return -ENOSPC;
2996 } 3104 }
2997 3105
2998 BTRFS_I(inode)->delalloc_reserved_extents++; 3106 BTRFS_I(inode)->reserved_extents++;
2999 check_force_delalloc(meta_sinfo); 3107 check_force_delalloc(meta_sinfo);
3000 spin_unlock(&meta_sinfo->lock); 3108 spin_unlock(&meta_sinfo->lock);
3001 3109
@@ -3094,8 +3202,7 @@ again:
3094 } 3202 }
3095 3203
3096 if (retries == 2) { 3204 if (retries == 2) {
3097 btrfs_start_delalloc_inodes(root); 3205 flush_delalloc(root, meta_sinfo);
3098 btrfs_wait_ordered_extents(root, 0);
3099 goto again; 3206 goto again;
3100 } 3207 }
3101 spin_lock(&meta_sinfo->lock); 3208 spin_lock(&meta_sinfo->lock);
@@ -3355,14 +3462,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3355 else 3462 else
3356 old_val -= num_bytes; 3463 old_val -= num_bytes;
3357 btrfs_set_super_bytes_used(&info->super_copy, old_val); 3464 btrfs_set_super_bytes_used(&info->super_copy, old_val);
3358
3359 /* block accounting for root item */
3360 old_val = btrfs_root_used(&root->root_item);
3361 if (alloc)
3362 old_val += num_bytes;
3363 else
3364 old_val -= num_bytes;
3365 btrfs_set_root_used(&root->root_item, old_val);
3366 spin_unlock(&info->delalloc_lock); 3465 spin_unlock(&info->delalloc_lock);
3367 3466
3368 while (total) { 3467 while (total) {
@@ -3588,6 +3687,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
3588 if (is_data) 3687 if (is_data)
3589 goto pinit; 3688 goto pinit;
3590 3689
3690 /*
3691 * discard is sloooow, and so triggering discards on
3692 * individual btree blocks isn't a good plan. Just
3693 * pin everything in discard mode.
3694 */
3695 if (btrfs_test_opt(root, DISCARD))
3696 goto pinit;
3697
3591 buf = btrfs_find_tree_block(root, bytenr, num_bytes); 3698 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3592 if (!buf) 3699 if (!buf)
3593 goto pinit; 3700 goto pinit;
@@ -3942,6 +4049,21 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
3942 return ret; 4049 return ret;
3943} 4050}
3944 4051
4052int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4053 struct btrfs_root *root,
4054 u64 bytenr, u32 blocksize,
4055 u64 parent, u64 root_objectid, int level)
4056{
4057 u64 used;
4058 spin_lock(&root->node_lock);
4059 used = btrfs_root_used(&root->root_item) - blocksize;
4060 btrfs_set_root_used(&root->root_item, used);
4061 spin_unlock(&root->node_lock);
4062
4063 return btrfs_free_extent(trans, root, bytenr, blocksize,
4064 parent, root_objectid, level, 0);
4065}
4066
3945static u64 stripe_align(struct btrfs_root *root, u64 val) 4067static u64 stripe_align(struct btrfs_root *root, u64 val)
3946{ 4068{
3947 u64 mask = ((u64)root->stripesize - 1); 4069 u64 mask = ((u64)root->stripesize - 1);
@@ -3995,7 +4117,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
3995} 4117}
3996 4118
3997enum btrfs_loop_type { 4119enum btrfs_loop_type {
3998 LOOP_CACHED_ONLY = 0, 4120 LOOP_FIND_IDEAL = 0,
3999 LOOP_CACHING_NOWAIT = 1, 4121 LOOP_CACHING_NOWAIT = 1,
4000 LOOP_CACHING_WAIT = 2, 4122 LOOP_CACHING_WAIT = 2,
4001 LOOP_ALLOC_CHUNK = 3, 4123 LOOP_ALLOC_CHUNK = 3,
@@ -4024,11 +4146,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4024 struct btrfs_block_group_cache *block_group = NULL; 4146 struct btrfs_block_group_cache *block_group = NULL;
4025 int empty_cluster = 2 * 1024 * 1024; 4147 int empty_cluster = 2 * 1024 * 1024;
4026 int allowed_chunk_alloc = 0; 4148 int allowed_chunk_alloc = 0;
4149 int done_chunk_alloc = 0;
4027 struct btrfs_space_info *space_info; 4150 struct btrfs_space_info *space_info;
4028 int last_ptr_loop = 0; 4151 int last_ptr_loop = 0;
4029 int loop = 0; 4152 int loop = 0;
4030 bool found_uncached_bg = false; 4153 bool found_uncached_bg = false;
4031 bool failed_cluster_refill = false; 4154 bool failed_cluster_refill = false;
4155 bool failed_alloc = false;
4156 u64 ideal_cache_percent = 0;
4157 u64 ideal_cache_offset = 0;
4032 4158
4033 WARN_ON(num_bytes < root->sectorsize); 4159 WARN_ON(num_bytes < root->sectorsize);
4034 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 4160 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -4064,14 +4190,19 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4064 empty_cluster = 0; 4190 empty_cluster = 0;
4065 4191
4066 if (search_start == hint_byte) { 4192 if (search_start == hint_byte) {
4193ideal_cache:
4067 block_group = btrfs_lookup_block_group(root->fs_info, 4194 block_group = btrfs_lookup_block_group(root->fs_info,
4068 search_start); 4195 search_start);
4069 /* 4196 /*
4070 * we don't want to use the block group if it doesn't match our 4197 * we don't want to use the block group if it doesn't match our
4071 * allocation bits, or if its not cached. 4198 * allocation bits, or if its not cached.
4199 *
4200 * However if we are re-searching with an ideal block group
4201 * picked out then we don't care that the block group is cached.
4072 */ 4202 */
4073 if (block_group && block_group_bits(block_group, data) && 4203 if (block_group && block_group_bits(block_group, data) &&
4074 block_group_cache_done(block_group)) { 4204 (block_group->cached != BTRFS_CACHE_NO ||
4205 search_start == ideal_cache_offset)) {
4075 down_read(&space_info->groups_sem); 4206 down_read(&space_info->groups_sem);
4076 if (list_empty(&block_group->list) || 4207 if (list_empty(&block_group->list) ||
4077 block_group->ro) { 4208 block_group->ro) {
@@ -4083,13 +4214,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4083 */ 4214 */
4084 btrfs_put_block_group(block_group); 4215 btrfs_put_block_group(block_group);
4085 up_read(&space_info->groups_sem); 4216 up_read(&space_info->groups_sem);
4086 } else 4217 } else {
4087 goto have_block_group; 4218 goto have_block_group;
4219 }
4088 } else if (block_group) { 4220 } else if (block_group) {
4089 btrfs_put_block_group(block_group); 4221 btrfs_put_block_group(block_group);
4090 } 4222 }
4091 } 4223 }
4092
4093search: 4224search:
4094 down_read(&space_info->groups_sem); 4225 down_read(&space_info->groups_sem);
4095 list_for_each_entry(block_group, &space_info->block_groups, list) { 4226 list_for_each_entry(block_group, &space_info->block_groups, list) {
@@ -4101,28 +4232,45 @@ search:
4101 4232
4102have_block_group: 4233have_block_group:
4103 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4234 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4235 u64 free_percent;
4236
4237 free_percent = btrfs_block_group_used(&block_group->item);
4238 free_percent *= 100;
4239 free_percent = div64_u64(free_percent,
4240 block_group->key.offset);
4241 free_percent = 100 - free_percent;
4242 if (free_percent > ideal_cache_percent &&
4243 likely(!block_group->ro)) {
4244 ideal_cache_offset = block_group->key.objectid;
4245 ideal_cache_percent = free_percent;
4246 }
4247
4104 /* 4248 /*
4105 * we want to start caching kthreads, but not too many 4249 * We only want to start kthread caching if we are at
4106 * right off the bat so we don't overwhelm the system, 4250 * the point where we will wait for caching to make
4107 * so only start them if there are less than 2 and we're 4251 * progress, or if our ideal search is over and we've
4108 * in the initial allocation phase. 4252 * found somebody to start caching.
4109 */ 4253 */
4110 if (loop > LOOP_CACHING_NOWAIT || 4254 if (loop > LOOP_CACHING_NOWAIT ||
4111 atomic_read(&space_info->caching_threads) < 2) { 4255 (loop > LOOP_FIND_IDEAL &&
4256 atomic_read(&space_info->caching_threads) < 2)) {
4112 ret = cache_block_group(block_group); 4257 ret = cache_block_group(block_group);
4113 BUG_ON(ret); 4258 BUG_ON(ret);
4114 } 4259 }
4115 }
4116
4117 cached = block_group_cache_done(block_group);
4118 if (unlikely(!cached)) {
4119 found_uncached_bg = true; 4260 found_uncached_bg = true;
4120 4261
4121 /* if we only want cached bgs, loop */ 4262 /*
4122 if (loop == LOOP_CACHED_ONLY) 4263 * If loop is set for cached only, try the next block
4264 * group.
4265 */
4266 if (loop == LOOP_FIND_IDEAL)
4123 goto loop; 4267 goto loop;
4124 } 4268 }
4125 4269
4270 cached = block_group_cache_done(block_group);
4271 if (unlikely(!cached))
4272 found_uncached_bg = true;
4273
4126 if (unlikely(block_group->ro)) 4274 if (unlikely(block_group->ro))
4127 goto loop; 4275 goto loop;
4128 4276
@@ -4233,14 +4381,23 @@ refill_cluster:
4233 4381
4234 offset = btrfs_find_space_for_alloc(block_group, search_start, 4382 offset = btrfs_find_space_for_alloc(block_group, search_start,
4235 num_bytes, empty_size); 4383 num_bytes, empty_size);
4236 if (!offset && (cached || (!cached && 4384 /*
4237 loop == LOOP_CACHING_NOWAIT))) { 4385 * If we didn't find a chunk, and we haven't failed on this
4238 goto loop; 4386 * block group before, and this block group is in the middle of
4239 } else if (!offset && (!cached && 4387 * caching and we are ok with waiting, then go ahead and wait
4240 loop > LOOP_CACHING_NOWAIT)) { 4388 * for progress to be made, and set failed_alloc to true.
4389 *
4390 * If failed_alloc is true then we've already waited on this
4391 * block group once and should move on to the next block group.
4392 */
4393 if (!offset && !failed_alloc && !cached &&
4394 loop > LOOP_CACHING_NOWAIT) {
4241 wait_block_group_cache_progress(block_group, 4395 wait_block_group_cache_progress(block_group,
4242 num_bytes + empty_size); 4396 num_bytes + empty_size);
4397 failed_alloc = true;
4243 goto have_block_group; 4398 goto have_block_group;
4399 } else if (!offset) {
4400 goto loop;
4244 } 4401 }
4245checks: 4402checks:
4246 search_start = stripe_align(root, offset); 4403 search_start = stripe_align(root, offset);
@@ -4288,13 +4445,16 @@ checks:
4288 break; 4445 break;
4289loop: 4446loop:
4290 failed_cluster_refill = false; 4447 failed_cluster_refill = false;
4448 failed_alloc = false;
4291 btrfs_put_block_group(block_group); 4449 btrfs_put_block_group(block_group);
4292 } 4450 }
4293 up_read(&space_info->groups_sem); 4451 up_read(&space_info->groups_sem);
4294 4452
4295 /* LOOP_CACHED_ONLY, only search fully cached block groups 4453 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4296 * LOOP_CACHING_NOWAIT, search partially cached block groups, but 4454 * for them to make caching progress. Also
4297 * dont wait foR them to finish caching 4455 * determine the best possible bg to cache
4456 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
4457 * caching kthreads as we move along
4298 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 4458 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
4299 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 4459 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
4300 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 4460 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
@@ -4303,12 +4463,47 @@ loop:
4303 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4463 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4304 (found_uncached_bg || empty_size || empty_cluster || 4464 (found_uncached_bg || empty_size || empty_cluster ||
4305 allowed_chunk_alloc)) { 4465 allowed_chunk_alloc)) {
4306 if (found_uncached_bg) { 4466 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4307 found_uncached_bg = false; 4467 found_uncached_bg = false;
4308 if (loop < LOOP_CACHING_WAIT) { 4468 loop++;
4309 loop++; 4469 if (!ideal_cache_percent &&
4470 atomic_read(&space_info->caching_threads))
4310 goto search; 4471 goto search;
4311 } 4472
4473 /*
4474 * 1 of the following 2 things have happened so far
4475 *
4476 * 1) We found an ideal block group for caching that
4477 * is mostly full and will cache quickly, so we might
4478 * as well wait for it.
4479 *
4480 * 2) We searched for cached only and we didn't find
4481 * anything, and we didn't start any caching kthreads
4482 * either, so chances are we will loop through and
4483 * start a couple caching kthreads, and then come back
4484 * around and just wait for them. This will be slower
4485 * because we will have 2 caching kthreads reading at
4486 * the same time when we could have just started one
4487 * and waited for it to get far enough to give us an
4488 * allocation, so go ahead and go to the wait caching
4489 * loop.
4490 */
4491 loop = LOOP_CACHING_WAIT;
4492 search_start = ideal_cache_offset;
4493 ideal_cache_percent = 0;
4494 goto ideal_cache;
4495 } else if (loop == LOOP_FIND_IDEAL) {
4496 /*
4497 * Didn't find a uncached bg, wait on anything we find
4498 * next.
4499 */
4500 loop = LOOP_CACHING_WAIT;
4501 goto search;
4502 }
4503
4504 if (loop < LOOP_CACHING_WAIT) {
4505 loop++;
4506 goto search;
4312 } 4507 }
4313 4508
4314 if (loop == LOOP_ALLOC_CHUNK) { 4509 if (loop == LOOP_ALLOC_CHUNK) {
@@ -4320,7 +4515,8 @@ loop:
4320 ret = do_chunk_alloc(trans, root, num_bytes + 4515 ret = do_chunk_alloc(trans, root, num_bytes +
4321 2 * 1024 * 1024, data, 1); 4516 2 * 1024 * 1024, data, 1);
4322 allowed_chunk_alloc = 0; 4517 allowed_chunk_alloc = 0;
4323 } else { 4518 done_chunk_alloc = 1;
4519 } else if (!done_chunk_alloc) {
4324 space_info->force_alloc = 1; 4520 space_info->force_alloc = 1;
4325 } 4521 }
4326 4522
@@ -4397,7 +4593,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
4397{ 4593{
4398 int ret; 4594 int ret;
4399 u64 search_start = 0; 4595 u64 search_start = 0;
4400 struct btrfs_fs_info *info = root->fs_info;
4401 4596
4402 data = btrfs_get_alloc_profile(root, data); 4597 data = btrfs_get_alloc_profile(root, data);
4403again: 4598again:
@@ -4405,17 +4600,9 @@ again:
4405 * the only place that sets empty_size is btrfs_realloc_node, which 4600 * the only place that sets empty_size is btrfs_realloc_node, which
4406 * is not called recursively on allocations 4601 * is not called recursively on allocations
4407 */ 4602 */
4408 if (empty_size || root->ref_cows) { 4603 if (empty_size || root->ref_cows)
4409 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
4410 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4411 2 * 1024 * 1024,
4412 BTRFS_BLOCK_GROUP_METADATA |
4413 (info->metadata_alloc_profile &
4414 info->avail_metadata_alloc_bits), 0);
4415 }
4416 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4604 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4417 num_bytes + 2 * 1024 * 1024, data, 0); 4605 num_bytes + 2 * 1024 * 1024, data, 0);
4418 }
4419 4606
4420 WARN_ON(num_bytes < root->sectorsize); 4607 WARN_ON(num_bytes < root->sectorsize);
4421 ret = find_free_extent(trans, root, num_bytes, empty_size, 4608 ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -4716,6 +4903,14 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
4716 extent_op); 4903 extent_op);
4717 BUG_ON(ret); 4904 BUG_ON(ret);
4718 } 4905 }
4906
4907 if (root_objectid == root->root_key.objectid) {
4908 u64 used;
4909 spin_lock(&root->node_lock);
4910 used = btrfs_root_used(&root->root_item) + num_bytes;
4911 btrfs_set_root_used(&root->root_item, used);
4912 spin_unlock(&root->node_lock);
4913 }
4719 return ret; 4914 return ret;
4720} 4915}
4721 4916
@@ -4738,8 +4933,16 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4738 btrfs_set_buffer_uptodate(buf); 4933 btrfs_set_buffer_uptodate(buf);
4739 4934
4740 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 4935 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4741 set_extent_dirty(&root->dirty_log_pages, buf->start, 4936 /*
4742 buf->start + buf->len - 1, GFP_NOFS); 4937 * we allow two log transactions at a time, use different
4938 * EXENT bit to differentiate dirty pages.
4939 */
4940 if (root->log_transid % 2 == 0)
4941 set_extent_dirty(&root->dirty_log_pages, buf->start,
4942 buf->start + buf->len - 1, GFP_NOFS);
4943 else
4944 set_extent_new(&root->dirty_log_pages, buf->start,
4945 buf->start + buf->len - 1, GFP_NOFS);
4743 } else { 4946 } else {
4744 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 4947 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
4745 buf->start + buf->len - 1, GFP_NOFS); 4948 buf->start + buf->len - 1, GFP_NOFS);
@@ -4799,6 +5002,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4799 u64 bytenr; 5002 u64 bytenr;
4800 u64 generation; 5003 u64 generation;
4801 u64 refs; 5004 u64 refs;
5005 u64 flags;
4802 u64 last = 0; 5006 u64 last = 0;
4803 u32 nritems; 5007 u32 nritems;
4804 u32 blocksize; 5008 u32 blocksize;
@@ -4836,15 +5040,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4836 generation <= root->root_key.offset) 5040 generation <= root->root_key.offset)
4837 continue; 5041 continue;
4838 5042
5043 /* We don't lock the tree block, it's OK to be racy here */
5044 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5045 &refs, &flags);
5046 BUG_ON(ret);
5047 BUG_ON(refs == 0);
5048
4839 if (wc->stage == DROP_REFERENCE) { 5049 if (wc->stage == DROP_REFERENCE) {
4840 ret = btrfs_lookup_extent_info(trans, root,
4841 bytenr, blocksize,
4842 &refs, NULL);
4843 BUG_ON(ret);
4844 BUG_ON(refs == 0);
4845 if (refs == 1) 5050 if (refs == 1)
4846 goto reada; 5051 goto reada;
4847 5052
5053 if (wc->level == 1 &&
5054 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5055 continue;
4848 if (!wc->update_ref || 5056 if (!wc->update_ref ||
4849 generation <= root->root_key.offset) 5057 generation <= root->root_key.offset)
4850 continue; 5058 continue;
@@ -4853,6 +5061,10 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4853 &wc->update_progress); 5061 &wc->update_progress);
4854 if (ret < 0) 5062 if (ret < 0)
4855 continue; 5063 continue;
5064 } else {
5065 if (wc->level == 1 &&
5066 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5067 continue;
4856 } 5068 }
4857reada: 5069reada:
4858 ret = readahead_tree_block(root, bytenr, blocksize, 5070 ret = readahead_tree_block(root, bytenr, blocksize,
@@ -4876,7 +5088,7 @@ reada:
4876static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 5088static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4877 struct btrfs_root *root, 5089 struct btrfs_root *root,
4878 struct btrfs_path *path, 5090 struct btrfs_path *path,
4879 struct walk_control *wc) 5091 struct walk_control *wc, int lookup_info)
4880{ 5092{
4881 int level = wc->level; 5093 int level = wc->level;
4882 struct extent_buffer *eb = path->nodes[level]; 5094 struct extent_buffer *eb = path->nodes[level];
@@ -4891,8 +5103,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4891 * when reference count of tree block is 1, it won't increase 5103 * when reference count of tree block is 1, it won't increase
4892 * again. once full backref flag is set, we never clear it. 5104 * again. once full backref flag is set, we never clear it.
4893 */ 5105 */
4894 if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 5106 if (lookup_info &&
4895 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { 5107 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5108 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
4896 BUG_ON(!path->locks[level]); 5109 BUG_ON(!path->locks[level]);
4897 ret = btrfs_lookup_extent_info(trans, root, 5110 ret = btrfs_lookup_extent_info(trans, root,
4898 eb->start, eb->len, 5111 eb->start, eb->len,
@@ -4953,7 +5166,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4953static noinline int do_walk_down(struct btrfs_trans_handle *trans, 5166static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4954 struct btrfs_root *root, 5167 struct btrfs_root *root,
4955 struct btrfs_path *path, 5168 struct btrfs_path *path,
4956 struct walk_control *wc) 5169 struct walk_control *wc, int *lookup_info)
4957{ 5170{
4958 u64 bytenr; 5171 u64 bytenr;
4959 u64 generation; 5172 u64 generation;
@@ -4973,8 +5186,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4973 * for the subtree 5186 * for the subtree
4974 */ 5187 */
4975 if (wc->stage == UPDATE_BACKREF && 5188 if (wc->stage == UPDATE_BACKREF &&
4976 generation <= root->root_key.offset) 5189 generation <= root->root_key.offset) {
5190 *lookup_info = 1;
4977 return 1; 5191 return 1;
5192 }
4978 5193
4979 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 5194 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
4980 blocksize = btrfs_level_size(root, level - 1); 5195 blocksize = btrfs_level_size(root, level - 1);
@@ -4987,14 +5202,19 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4987 btrfs_tree_lock(next); 5202 btrfs_tree_lock(next);
4988 btrfs_set_lock_blocking(next); 5203 btrfs_set_lock_blocking(next);
4989 5204
4990 if (wc->stage == DROP_REFERENCE) { 5205 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
4991 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, 5206 &wc->refs[level - 1],
4992 &wc->refs[level - 1], 5207 &wc->flags[level - 1]);
4993 &wc->flags[level - 1]); 5208 BUG_ON(ret);
4994 BUG_ON(ret); 5209 BUG_ON(wc->refs[level - 1] == 0);
4995 BUG_ON(wc->refs[level - 1] == 0); 5210 *lookup_info = 0;
4996 5211
5212 if (wc->stage == DROP_REFERENCE) {
4997 if (wc->refs[level - 1] > 1) { 5213 if (wc->refs[level - 1] > 1) {
5214 if (level == 1 &&
5215 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5216 goto skip;
5217
4998 if (!wc->update_ref || 5218 if (!wc->update_ref ||
4999 generation <= root->root_key.offset) 5219 generation <= root->root_key.offset)
5000 goto skip; 5220 goto skip;
@@ -5008,12 +5228,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5008 wc->stage = UPDATE_BACKREF; 5228 wc->stage = UPDATE_BACKREF;
5009 wc->shared_level = level - 1; 5229 wc->shared_level = level - 1;
5010 } 5230 }
5231 } else {
5232 if (level == 1 &&
5233 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5234 goto skip;
5011 } 5235 }
5012 5236
5013 if (!btrfs_buffer_uptodate(next, generation)) { 5237 if (!btrfs_buffer_uptodate(next, generation)) {
5014 btrfs_tree_unlock(next); 5238 btrfs_tree_unlock(next);
5015 free_extent_buffer(next); 5239 free_extent_buffer(next);
5016 next = NULL; 5240 next = NULL;
5241 *lookup_info = 1;
5017 } 5242 }
5018 5243
5019 if (!next) { 5244 if (!next) {
@@ -5036,21 +5261,22 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5036skip: 5261skip:
5037 wc->refs[level - 1] = 0; 5262 wc->refs[level - 1] = 0;
5038 wc->flags[level - 1] = 0; 5263 wc->flags[level - 1] = 0;
5264 if (wc->stage == DROP_REFERENCE) {
5265 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5266 parent = path->nodes[level]->start;
5267 } else {
5268 BUG_ON(root->root_key.objectid !=
5269 btrfs_header_owner(path->nodes[level]));
5270 parent = 0;
5271 }
5039 5272
5040 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 5273 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
5041 parent = path->nodes[level]->start; 5274 root->root_key.objectid, level - 1, 0);
5042 } else { 5275 BUG_ON(ret);
5043 BUG_ON(root->root_key.objectid !=
5044 btrfs_header_owner(path->nodes[level]));
5045 parent = 0;
5046 } 5276 }
5047
5048 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
5049 root->root_key.objectid, level - 1, 0);
5050 BUG_ON(ret);
5051
5052 btrfs_tree_unlock(next); 5277 btrfs_tree_unlock(next);
5053 free_extent_buffer(next); 5278 free_extent_buffer(next);
5279 *lookup_info = 1;
5054 return 1; 5280 return 1;
5055} 5281}
5056 5282
@@ -5164,6 +5390,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5164 struct walk_control *wc) 5390 struct walk_control *wc)
5165{ 5391{
5166 int level = wc->level; 5392 int level = wc->level;
5393 int lookup_info = 1;
5167 int ret; 5394 int ret;
5168 5395
5169 while (level >= 0) { 5396 while (level >= 0) {
@@ -5171,14 +5398,14 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5171 btrfs_header_nritems(path->nodes[level])) 5398 btrfs_header_nritems(path->nodes[level]))
5172 break; 5399 break;
5173 5400
5174 ret = walk_down_proc(trans, root, path, wc); 5401 ret = walk_down_proc(trans, root, path, wc, lookup_info);
5175 if (ret > 0) 5402 if (ret > 0)
5176 break; 5403 break;
5177 5404
5178 if (level == 0) 5405 if (level == 0)
5179 break; 5406 break;
5180 5407
5181 ret = do_walk_down(trans, root, path, wc); 5408 ret = do_walk_down(trans, root, path, wc, &lookup_info);
5182 if (ret > 0) { 5409 if (ret > 0) {
5183 path->slots[level]++; 5410 path->slots[level]++;
5184 continue; 5411 continue;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index de1793ba004a..96577e8bf9fd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -460,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
460 struct extent_state *state, int bits, int wake, 460 struct extent_state *state, int bits, int wake,
461 int delete) 461 int delete)
462{ 462{
463 int ret = state->state & bits; 463 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
464 int ret = state->state & bits_to_clear;
464 465
465 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 466 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
466 u64 range = state->end - state->start + 1; 467 u64 range = state->end - state->start + 1;
@@ -468,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
468 tree->dirty_bytes -= range; 469 tree->dirty_bytes -= range;
469 } 470 }
470 clear_state_cb(tree, state, bits); 471 clear_state_cb(tree, state, bits);
471 state->state &= ~bits; 472 state->state &= ~bits_to_clear;
472 if (wake) 473 if (wake)
473 wake_up(&state->wq); 474 wake_up(&state->wq);
474 if (delete || state->state == 0) { 475 if (delete || state->state == 0) {
@@ -956,7 +957,8 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
956 gfp_t mask) 957 gfp_t mask)
957{ 958{
958 return clear_extent_bit(tree, start, end, 959 return clear_extent_bit(tree, start, end,
959 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, 960 EXTENT_DIRTY | EXTENT_DELALLOC |
961 EXTENT_DO_ACCOUNTING, 0, 0,
960 NULL, mask); 962 NULL, mask);
961} 963}
962 964
@@ -1401,12 +1403,7 @@ out_failed:
1401int extent_clear_unlock_delalloc(struct inode *inode, 1403int extent_clear_unlock_delalloc(struct inode *inode,
1402 struct extent_io_tree *tree, 1404 struct extent_io_tree *tree,
1403 u64 start, u64 end, struct page *locked_page, 1405 u64 start, u64 end, struct page *locked_page,
1404 int unlock_pages, 1406 unsigned long op)
1405 int clear_unlock,
1406 int clear_delalloc, int clear_dirty,
1407 int set_writeback,
1408 int end_writeback,
1409 int set_private2)
1410{ 1407{
1411 int ret; 1408 int ret;
1412 struct page *pages[16]; 1409 struct page *pages[16];
@@ -1416,17 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1416 int i; 1413 int i;
1417 int clear_bits = 0; 1414 int clear_bits = 0;
1418 1415
1419 if (clear_unlock) 1416 if (op & EXTENT_CLEAR_UNLOCK)
1420 clear_bits |= EXTENT_LOCKED; 1417 clear_bits |= EXTENT_LOCKED;
1421 if (clear_dirty) 1418 if (op & EXTENT_CLEAR_DIRTY)
1422 clear_bits |= EXTENT_DIRTY; 1419 clear_bits |= EXTENT_DIRTY;
1423 1420
1424 if (clear_delalloc) 1421 if (op & EXTENT_CLEAR_DELALLOC)
1425 clear_bits |= EXTENT_DELALLOC; 1422 clear_bits |= EXTENT_DELALLOC;
1426 1423
1424 if (op & EXTENT_CLEAR_ACCOUNTING)
1425 clear_bits |= EXTENT_DO_ACCOUNTING;
1426
1427 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1427 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1428 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback || 1428 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1429 set_private2)) 1429 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1430 EXTENT_SET_PRIVATE2)))
1430 return 0; 1431 return 0;
1431 1432
1432 while (nr_pages > 0) { 1433 while (nr_pages > 0) {
@@ -1435,20 +1436,20 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1435 nr_pages, ARRAY_SIZE(pages)), pages); 1436 nr_pages, ARRAY_SIZE(pages)), pages);
1436 for (i = 0; i < ret; i++) { 1437 for (i = 0; i < ret; i++) {
1437 1438
1438 if (set_private2) 1439 if (op & EXTENT_SET_PRIVATE2)
1439 SetPagePrivate2(pages[i]); 1440 SetPagePrivate2(pages[i]);
1440 1441
1441 if (pages[i] == locked_page) { 1442 if (pages[i] == locked_page) {
1442 page_cache_release(pages[i]); 1443 page_cache_release(pages[i]);
1443 continue; 1444 continue;
1444 } 1445 }
1445 if (clear_dirty) 1446 if (op & EXTENT_CLEAR_DIRTY)
1446 clear_page_dirty_for_io(pages[i]); 1447 clear_page_dirty_for_io(pages[i]);
1447 if (set_writeback) 1448 if (op & EXTENT_SET_WRITEBACK)
1448 set_page_writeback(pages[i]); 1449 set_page_writeback(pages[i]);
1449 if (end_writeback) 1450 if (op & EXTENT_END_WRITEBACK)
1450 end_page_writeback(pages[i]); 1451 end_page_writeback(pages[i]);
1451 if (unlock_pages) 1452 if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1452 unlock_page(pages[i]); 1453 unlock_page(pages[i]);
1453 page_cache_release(pages[i]); 1454 page_cache_release(pages[i]);
1454 } 1455 }
@@ -2714,7 +2715,8 @@ int extent_invalidatepage(struct extent_io_tree *tree,
2714 lock_extent(tree, start, end, GFP_NOFS); 2715 lock_extent(tree, start, end, GFP_NOFS);
2715 wait_on_page_writeback(page); 2716 wait_on_page_writeback(page);
2716 clear_extent_bit(tree, start, end, 2717 clear_extent_bit(tree, start, end,
2717 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 2718 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2719 EXTENT_DO_ACCOUNTING,
2718 1, 1, NULL, GFP_NOFS); 2720 1, 1, NULL, GFP_NOFS);
2719 return 0; 2721 return 0;
2720} 2722}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4794ec891fed..36de250a7b2b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -15,6 +15,7 @@
15#define EXTENT_BUFFER_FILLED (1 << 8) 15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11)
18#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
19 20
20/* flags for bio submission */ 21/* flags for bio submission */
@@ -25,6 +26,16 @@
25#define EXTENT_BUFFER_BLOCKING 1 26#define EXTENT_BUFFER_BLOCKING 1
26#define EXTENT_BUFFER_DIRTY 2 27#define EXTENT_BUFFER_DIRTY 2
27 28
29/* these are flags for extent_clear_unlock_delalloc */
30#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
31#define EXTENT_CLEAR_UNLOCK 0x2
32#define EXTENT_CLEAR_DELALLOC 0x4
33#define EXTENT_CLEAR_DIRTY 0x8
34#define EXTENT_SET_WRITEBACK 0x10
35#define EXTENT_END_WRITEBACK 0x20
36#define EXTENT_SET_PRIVATE2 0x40
37#define EXTENT_CLEAR_ACCOUNTING 0x80
38
28/* 39/*
29 * page->private values. Every page that is controlled by the extent 40 * page->private values. Every page that is controlled by the extent
30 * map has page->private set to one. 41 * map has page->private set to one.
@@ -288,10 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree,
288int extent_clear_unlock_delalloc(struct inode *inode, 299int extent_clear_unlock_delalloc(struct inode *inode,
289 struct extent_io_tree *tree, 300 struct extent_io_tree *tree,
290 u64 start, u64 end, struct page *locked_page, 301 u64 start, u64 end, struct page *locked_page,
291 int unlock_page, 302 unsigned long op);
292 int clear_unlock,
293 int clear_delalloc, int clear_dirty,
294 int set_writeback,
295 int end_writeback,
296 int set_private2);
297#endif 303#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2c726b7b9faa..46bea0f4dc7b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -208,7 +208,7 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
208 write_lock(&tree->lock); 208 write_lock(&tree->lock);
209 em = lookup_extent_mapping(tree, start, len); 209 em = lookup_extent_mapping(tree, start, len);
210 210
211 WARN_ON(em->start != start || !em); 211 WARN_ON(!em || em->start != start);
212 212
213 if (!em) 213 if (!em)
214 goto out; 214 goto out;
@@ -256,7 +256,7 @@ out:
256 * Insert @em into @tree or perform a simple forward/backward merge with 256 * Insert @em into @tree or perform a simple forward/backward merge with
257 * existing mappings. The extent_map struct passed in will be inserted 257 * existing mappings. The extent_map struct passed in will be inserted
258 * into the tree directly, with an additional reference taken, or a 258 * into the tree directly, with an additional reference taken, or a
259 * reference dropped if the merge attempt was sucessfull. 259 * reference dropped if the merge attempt was successfull.
260 */ 260 */
261int add_extent_mapping(struct extent_map_tree *tree, 261int add_extent_mapping(struct extent_map_tree *tree,
262 struct extent_map *em) 262 struct extent_map *em)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f19e1259a971..feaa13b105d9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -179,18 +179,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
179 } 179 }
180 flags = em->flags; 180 flags = em->flags;
181 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { 181 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
182 if (em->start <= start && 182 if (testend && em->start + em->len >= start + len) {
183 (!testend || em->start + em->len >= start + len)) {
184 free_extent_map(em); 183 free_extent_map(em);
185 write_unlock(&em_tree->lock); 184 write_unlock(&em_tree->lock);
186 break; 185 break;
187 } 186 }
188 if (start < em->start) { 187 start = em->start + em->len;
189 len = em->start - start; 188 if (testend)
190 } else {
191 len = start + len - (em->start + em->len); 189 len = start + len - (em->start + em->len);
192 start = em->start + em->len;
193 }
194 free_extent_map(em); 190 free_extent_map(em);
195 write_unlock(&em_tree->lock); 191 write_unlock(&em_tree->lock);
196 continue; 192 continue;
@@ -265,319 +261,247 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
265 * If an extent intersects the range but is not entirely inside the range 261 * If an extent intersects the range but is not entirely inside the range
266 * it is either truncated or split. Anything entirely inside the range 262 * it is either truncated or split. Anything entirely inside the range
267 * is deleted from the tree. 263 * is deleted from the tree.
268 *
269 * inline_limit is used to tell this code which offsets in the file to keep
270 * if they contain inline extents.
271 */ 264 */
272noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, 265int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
273 struct btrfs_root *root, struct inode *inode, 266 u64 start, u64 end, u64 *hint_byte, int drop_cache)
274 u64 start, u64 end, u64 locked_end,
275 u64 inline_limit, u64 *hint_byte, int drop_cache)
276{ 267{
277 u64 extent_end = 0; 268 struct btrfs_root *root = BTRFS_I(inode)->root;
278 u64 search_start = start;
279 u64 ram_bytes = 0;
280 u64 disk_bytenr = 0;
281 u64 orig_locked_end = locked_end;
282 u8 compression;
283 u8 encryption;
284 u16 other_encoding = 0;
285 struct extent_buffer *leaf; 269 struct extent_buffer *leaf;
286 struct btrfs_file_extent_item *extent; 270 struct btrfs_file_extent_item *fi;
287 struct btrfs_path *path; 271 struct btrfs_path *path;
288 struct btrfs_key key; 272 struct btrfs_key key;
289 struct btrfs_file_extent_item old; 273 struct btrfs_key new_key;
290 int keep; 274 u64 search_start = start;
291 int slot; 275 u64 disk_bytenr = 0;
292 int bookend; 276 u64 num_bytes = 0;
293 int found_type = 0; 277 u64 extent_offset = 0;
294 int found_extent; 278 u64 extent_end = 0;
295 int found_inline; 279 int del_nr = 0;
280 int del_slot = 0;
281 int extent_type;
296 int recow; 282 int recow;
297 int ret; 283 int ret;
298 284
299 inline_limit = 0;
300 if (drop_cache) 285 if (drop_cache)
301 btrfs_drop_extent_cache(inode, start, end - 1, 0); 286 btrfs_drop_extent_cache(inode, start, end - 1, 0);
302 287
303 path = btrfs_alloc_path(); 288 path = btrfs_alloc_path();
304 if (!path) 289 if (!path)
305 return -ENOMEM; 290 return -ENOMEM;
291
306 while (1) { 292 while (1) {
307 recow = 0; 293 recow = 0;
308 btrfs_release_path(root, path);
309 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 294 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
310 search_start, -1); 295 search_start, -1);
311 if (ret < 0) 296 if (ret < 0)
312 goto out; 297 break;
313 if (ret > 0) { 298 if (ret > 0 && path->slots[0] > 0 && search_start == start) {
314 if (path->slots[0] == 0) { 299 leaf = path->nodes[0];
315 ret = 0; 300 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
316 goto out; 301 if (key.objectid == inode->i_ino &&
317 } 302 key.type == BTRFS_EXTENT_DATA_KEY)
318 path->slots[0]--; 303 path->slots[0]--;
319 } 304 }
305 ret = 0;
320next_slot: 306next_slot:
321 keep = 0;
322 bookend = 0;
323 found_extent = 0;
324 found_inline = 0;
325 compression = 0;
326 encryption = 0;
327 extent = NULL;
328 leaf = path->nodes[0]; 307 leaf = path->nodes[0];
329 slot = path->slots[0]; 308 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
330 ret = 0; 309 BUG_ON(del_nr > 0);
331 btrfs_item_key_to_cpu(leaf, &key, slot); 310 ret = btrfs_next_leaf(root, path);
332 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY && 311 if (ret < 0)
333 key.offset >= end) { 312 break;
334 goto out; 313 if (ret > 0) {
335 } 314 ret = 0;
336 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || 315 break;
337 key.objectid != inode->i_ino) {
338 goto out;
339 }
340 if (recow) {
341 search_start = max(key.offset, start);
342 continue;
343 }
344 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
345 extent = btrfs_item_ptr(leaf, slot,
346 struct btrfs_file_extent_item);
347 found_type = btrfs_file_extent_type(leaf, extent);
348 compression = btrfs_file_extent_compression(leaf,
349 extent);
350 encryption = btrfs_file_extent_encryption(leaf,
351 extent);
352 other_encoding = btrfs_file_extent_other_encoding(leaf,
353 extent);
354 if (found_type == BTRFS_FILE_EXTENT_REG ||
355 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
356 extent_end =
357 btrfs_file_extent_disk_bytenr(leaf,
358 extent);
359 if (extent_end)
360 *hint_byte = extent_end;
361
362 extent_end = key.offset +
363 btrfs_file_extent_num_bytes(leaf, extent);
364 ram_bytes = btrfs_file_extent_ram_bytes(leaf,
365 extent);
366 found_extent = 1;
367 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
368 found_inline = 1;
369 extent_end = key.offset +
370 btrfs_file_extent_inline_len(leaf, extent);
371 } 316 }
317 leaf = path->nodes[0];
318 recow = 1;
319 }
320
321 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
322 if (key.objectid > inode->i_ino ||
323 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
324 break;
325
326 fi = btrfs_item_ptr(leaf, path->slots[0],
327 struct btrfs_file_extent_item);
328 extent_type = btrfs_file_extent_type(leaf, fi);
329
330 if (extent_type == BTRFS_FILE_EXTENT_REG ||
331 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
332 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
333 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
334 extent_offset = btrfs_file_extent_offset(leaf, fi);
335 extent_end = key.offset +
336 btrfs_file_extent_num_bytes(leaf, fi);
337 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
338 extent_end = key.offset +
339 btrfs_file_extent_inline_len(leaf, fi);
372 } else { 340 } else {
341 WARN_ON(1);
373 extent_end = search_start; 342 extent_end = search_start;
374 } 343 }
375 344
376 /* we found nothing we can drop */ 345 if (extent_end <= search_start) {
377 if ((!found_extent && !found_inline) || 346 path->slots[0]++;
378 search_start >= extent_end) {
379 int nextret;
380 u32 nritems;
381 nritems = btrfs_header_nritems(leaf);
382 if (slot >= nritems - 1) {
383 nextret = btrfs_next_leaf(root, path);
384 if (nextret)
385 goto out;
386 recow = 1;
387 } else {
388 path->slots[0]++;
389 }
390 goto next_slot; 347 goto next_slot;
391 } 348 }
392 349
393 if (end <= extent_end && start >= key.offset && found_inline) 350 search_start = max(key.offset, start);
394 *hint_byte = EXTENT_MAP_INLINE; 351 if (recow) {
395 352 btrfs_release_path(root, path);
396 if (found_extent) { 353 continue;
397 read_extent_buffer(leaf, &old, (unsigned long)extent,
398 sizeof(old));
399 }
400
401 if (end < extent_end && end >= key.offset) {
402 bookend = 1;
403 if (found_inline && start <= key.offset)
404 keep = 1;
405 } 354 }
406 355
407 if (bookend && found_extent) { 356 /*
408 if (locked_end < extent_end) { 357 * | - range to drop - |
409 ret = try_lock_extent(&BTRFS_I(inode)->io_tree, 358 * | -------- extent -------- |
410 locked_end, extent_end - 1, 359 */
411 GFP_NOFS); 360 if (start > key.offset && end < extent_end) {
412 if (!ret) { 361 BUG_ON(del_nr > 0);
413 btrfs_release_path(root, path); 362 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
414 lock_extent(&BTRFS_I(inode)->io_tree, 363
415 locked_end, extent_end - 1, 364 memcpy(&new_key, &key, sizeof(new_key));
416 GFP_NOFS); 365 new_key.offset = start;
417 locked_end = extent_end; 366 ret = btrfs_duplicate_item(trans, root, path,
418 continue; 367 &new_key);
419 } 368 if (ret == -EAGAIN) {
420 locked_end = extent_end; 369 btrfs_release_path(root, path);
370 continue;
421 } 371 }
422 disk_bytenr = le64_to_cpu(old.disk_bytenr); 372 if (ret < 0)
423 if (disk_bytenr != 0) { 373 break;
374
375 leaf = path->nodes[0];
376 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
377 struct btrfs_file_extent_item);
378 btrfs_set_file_extent_num_bytes(leaf, fi,
379 start - key.offset);
380
381 fi = btrfs_item_ptr(leaf, path->slots[0],
382 struct btrfs_file_extent_item);
383
384 extent_offset += start - key.offset;
385 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
386 btrfs_set_file_extent_num_bytes(leaf, fi,
387 extent_end - start);
388 btrfs_mark_buffer_dirty(leaf);
389
390 if (disk_bytenr > 0) {
424 ret = btrfs_inc_extent_ref(trans, root, 391 ret = btrfs_inc_extent_ref(trans, root,
425 disk_bytenr, 392 disk_bytenr, num_bytes, 0,
426 le64_to_cpu(old.disk_num_bytes), 0, 393 root->root_key.objectid,
427 root->root_key.objectid, 394 new_key.objectid,
428 key.objectid, key.offset - 395 start - extent_offset);
429 le64_to_cpu(old.offset));
430 BUG_ON(ret); 396 BUG_ON(ret);
397 *hint_byte = disk_bytenr;
431 } 398 }
399 key.offset = start;
432 } 400 }
401 /*
402 * | ---- range to drop ----- |
403 * | -------- extent -------- |
404 */
405 if (start <= key.offset && end < extent_end) {
406 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
433 407
434 if (found_inline) { 408 memcpy(&new_key, &key, sizeof(new_key));
435 u64 mask = root->sectorsize - 1; 409 new_key.offset = end;
436 search_start = (extent_end + mask) & ~mask; 410 btrfs_set_item_key_safe(trans, root, path, &new_key);
437 } else 411
438 search_start = extent_end; 412 extent_offset += end - key.offset;
439 413 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
440 /* truncate existing extent */ 414 btrfs_set_file_extent_num_bytes(leaf, fi,
441 if (start > key.offset) { 415 extent_end - end);
442 u64 new_num; 416 btrfs_mark_buffer_dirty(leaf);
443 u64 old_num; 417 if (disk_bytenr > 0) {
444 keep = 1; 418 inode_sub_bytes(inode, end - key.offset);
445 WARN_ON(start & (root->sectorsize - 1)); 419 *hint_byte = disk_bytenr;
446 if (found_extent) {
447 new_num = start - key.offset;
448 old_num = btrfs_file_extent_num_bytes(leaf,
449 extent);
450 *hint_byte =
451 btrfs_file_extent_disk_bytenr(leaf,
452 extent);
453 if (btrfs_file_extent_disk_bytenr(leaf,
454 extent)) {
455 inode_sub_bytes(inode, old_num -
456 new_num);
457 }
458 btrfs_set_file_extent_num_bytes(leaf,
459 extent, new_num);
460 btrfs_mark_buffer_dirty(leaf);
461 } else if (key.offset < inline_limit &&
462 (end > extent_end) &&
463 (inline_limit < extent_end)) {
464 u32 new_size;
465 new_size = btrfs_file_extent_calc_inline_size(
466 inline_limit - key.offset);
467 inode_sub_bytes(inode, extent_end -
468 inline_limit);
469 btrfs_set_file_extent_ram_bytes(leaf, extent,
470 new_size);
471 if (!compression && !encryption) {
472 btrfs_truncate_item(trans, root, path,
473 new_size, 1);
474 }
475 } 420 }
421 break;
476 } 422 }
477 /* delete the entire extent */
478 if (!keep) {
479 if (found_inline)
480 inode_sub_bytes(inode, extent_end -
481 key.offset);
482 ret = btrfs_del_item(trans, root, path);
483 /* TODO update progress marker and return */
484 BUG_ON(ret);
485 extent = NULL;
486 btrfs_release_path(root, path);
487 /* the extent will be freed later */
488 }
489 if (bookend && found_inline && start <= key.offset) {
490 u32 new_size;
491 new_size = btrfs_file_extent_calc_inline_size(
492 extent_end - end);
493 inode_sub_bytes(inode, end - key.offset);
494 btrfs_set_file_extent_ram_bytes(leaf, extent,
495 new_size);
496 if (!compression && !encryption)
497 ret = btrfs_truncate_item(trans, root, path,
498 new_size, 0);
499 BUG_ON(ret);
500 }
501 /* create bookend, splitting the extent in two */
502 if (bookend && found_extent) {
503 struct btrfs_key ins;
504 ins.objectid = inode->i_ino;
505 ins.offset = end;
506 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
507 423
508 btrfs_release_path(root, path); 424 search_start = extent_end;
509 path->leave_spinning = 1; 425 /*
510 ret = btrfs_insert_empty_item(trans, root, path, &ins, 426 * | ---- range to drop ----- |
511 sizeof(*extent)); 427 * | -------- extent -------- |
512 BUG_ON(ret); 428 */
429 if (start > key.offset && end >= extent_end) {
430 BUG_ON(del_nr > 0);
431 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
513 432
514 leaf = path->nodes[0]; 433 btrfs_set_file_extent_num_bytes(leaf, fi,
515 extent = btrfs_item_ptr(leaf, path->slots[0], 434 start - key.offset);
516 struct btrfs_file_extent_item); 435 btrfs_mark_buffer_dirty(leaf);
517 write_extent_buffer(leaf, &old, 436 if (disk_bytenr > 0) {
518 (unsigned long)extent, sizeof(old)); 437 inode_sub_bytes(inode, extent_end - start);
519 438 *hint_byte = disk_bytenr;
520 btrfs_set_file_extent_compression(leaf, extent, 439 }
521 compression); 440 if (end == extent_end)
522 btrfs_set_file_extent_encryption(leaf, extent, 441 break;
523 encryption);
524 btrfs_set_file_extent_other_encoding(leaf, extent,
525 other_encoding);
526 btrfs_set_file_extent_offset(leaf, extent,
527 le64_to_cpu(old.offset) + end - key.offset);
528 WARN_ON(le64_to_cpu(old.num_bytes) <
529 (extent_end - end));
530 btrfs_set_file_extent_num_bytes(leaf, extent,
531 extent_end - end);
532 442
533 /* 443 path->slots[0]++;
534 * set the ram bytes to the size of the full extent 444 goto next_slot;
535 * before splitting. This is a worst case flag,
536 * but its the best we can do because we don't know
537 * how splitting affects compression
538 */
539 btrfs_set_file_extent_ram_bytes(leaf, extent,
540 ram_bytes);
541 btrfs_set_file_extent_type(leaf, extent, found_type);
542
543 btrfs_unlock_up_safe(path, 1);
544 btrfs_mark_buffer_dirty(path->nodes[0]);
545 btrfs_set_lock_blocking(path->nodes[0]);
546
547 path->leave_spinning = 0;
548 btrfs_release_path(root, path);
549 if (disk_bytenr != 0)
550 inode_add_bytes(inode, extent_end - end);
551 } 445 }
552 446
553 if (found_extent && !keep) { 447 /*
554 u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr); 448 * | ---- range to drop ----- |
449 * | ------ extent ------ |
450 */
451 if (start <= key.offset && end >= extent_end) {
452 if (del_nr == 0) {
453 del_slot = path->slots[0];
454 del_nr = 1;
455 } else {
456 BUG_ON(del_slot + del_nr != path->slots[0]);
457 del_nr++;
458 }
555 459
556 if (old_disk_bytenr != 0) { 460 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
557 inode_sub_bytes(inode, 461 inode_sub_bytes(inode,
558 le64_to_cpu(old.num_bytes)); 462 extent_end - key.offset);
463 extent_end = ALIGN(extent_end,
464 root->sectorsize);
465 } else if (disk_bytenr > 0) {
559 ret = btrfs_free_extent(trans, root, 466 ret = btrfs_free_extent(trans, root,
560 old_disk_bytenr, 467 disk_bytenr, num_bytes, 0,
561 le64_to_cpu(old.disk_num_bytes), 468 root->root_key.objectid,
562 0, root->root_key.objectid,
563 key.objectid, key.offset - 469 key.objectid, key.offset -
564 le64_to_cpu(old.offset)); 470 extent_offset);
565 BUG_ON(ret); 471 BUG_ON(ret);
566 *hint_byte = old_disk_bytenr; 472 inode_sub_bytes(inode,
473 extent_end - key.offset);
474 *hint_byte = disk_bytenr;
567 } 475 }
568 }
569 476
570 if (search_start >= end) { 477 if (end == extent_end)
571 ret = 0; 478 break;
572 goto out; 479
480 if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
481 path->slots[0]++;
482 goto next_slot;
483 }
484
485 ret = btrfs_del_items(trans, root, path, del_slot,
486 del_nr);
487 BUG_ON(ret);
488
489 del_nr = 0;
490 del_slot = 0;
491
492 btrfs_release_path(root, path);
493 continue;
573 } 494 }
495
496 BUG_ON(1);
574 } 497 }
575out: 498
576 btrfs_free_path(path); 499 if (del_nr > 0) {
577 if (locked_end > orig_locked_end) { 500 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
578 unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, 501 BUG_ON(ret);
579 locked_end - 1, GFP_NOFS);
580 } 502 }
503
504 btrfs_free_path(path);
581 return ret; 505 return ret;
582} 506}
583 507
@@ -620,23 +544,23 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
620 * two or three. 544 * two or three.
621 */ 545 */
622int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 546int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
623 struct btrfs_root *root,
624 struct inode *inode, u64 start, u64 end) 547 struct inode *inode, u64 start, u64 end)
625{ 548{
549 struct btrfs_root *root = BTRFS_I(inode)->root;
626 struct extent_buffer *leaf; 550 struct extent_buffer *leaf;
627 struct btrfs_path *path; 551 struct btrfs_path *path;
628 struct btrfs_file_extent_item *fi; 552 struct btrfs_file_extent_item *fi;
629 struct btrfs_key key; 553 struct btrfs_key key;
554 struct btrfs_key new_key;
630 u64 bytenr; 555 u64 bytenr;
631 u64 num_bytes; 556 u64 num_bytes;
632 u64 extent_end; 557 u64 extent_end;
633 u64 orig_offset; 558 u64 orig_offset;
634 u64 other_start; 559 u64 other_start;
635 u64 other_end; 560 u64 other_end;
636 u64 split = start; 561 u64 split;
637 u64 locked_end = end; 562 int del_nr = 0;
638 int extent_type; 563 int del_slot = 0;
639 int split_end = 1;
640 int ret; 564 int ret;
641 565
642 btrfs_drop_extent_cache(inode, start, end - 1, 0); 566 btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -644,12 +568,10 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
644 path = btrfs_alloc_path(); 568 path = btrfs_alloc_path();
645 BUG_ON(!path); 569 BUG_ON(!path);
646again: 570again:
571 split = start;
647 key.objectid = inode->i_ino; 572 key.objectid = inode->i_ino;
648 key.type = BTRFS_EXTENT_DATA_KEY; 573 key.type = BTRFS_EXTENT_DATA_KEY;
649 if (split == start) 574 key.offset = split;
650 key.offset = split;
651 else
652 key.offset = split - 1;
653 575
654 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 576 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
655 if (ret > 0 && path->slots[0] > 0) 577 if (ret > 0 && path->slots[0] > 0)
@@ -661,8 +583,8 @@ again:
661 key.type != BTRFS_EXTENT_DATA_KEY); 583 key.type != BTRFS_EXTENT_DATA_KEY);
662 fi = btrfs_item_ptr(leaf, path->slots[0], 584 fi = btrfs_item_ptr(leaf, path->slots[0],
663 struct btrfs_file_extent_item); 585 struct btrfs_file_extent_item);
664 extent_type = btrfs_file_extent_type(leaf, fi); 586 BUG_ON(btrfs_file_extent_type(leaf, fi) !=
665 BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC); 587 BTRFS_FILE_EXTENT_PREALLOC);
666 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 588 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
667 BUG_ON(key.offset > start || extent_end < end); 589 BUG_ON(key.offset > start || extent_end < end);
668 590
@@ -670,150 +592,91 @@ again:
670 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 592 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
671 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); 593 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
672 594
673 if (key.offset == start) 595 while (start > key.offset || end < extent_end) {
674 split = end; 596 if (key.offset == start)
675 597 split = end;
676 if (key.offset == start && extent_end == end) { 598
677 int del_nr = 0; 599 memcpy(&new_key, &key, sizeof(new_key));
678 int del_slot = 0; 600 new_key.offset = split;
679 other_start = end; 601 ret = btrfs_duplicate_item(trans, root, path, &new_key);
680 other_end = 0; 602 if (ret == -EAGAIN) {
681 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, 603 btrfs_release_path(root, path);
682 bytenr, &other_start, &other_end)) { 604 goto again;
683 extent_end = other_end;
684 del_slot = path->slots[0] + 1;
685 del_nr++;
686 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
687 0, root->root_key.objectid,
688 inode->i_ino, orig_offset);
689 BUG_ON(ret);
690 }
691 other_start = 0;
692 other_end = start;
693 if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
694 bytenr, &other_start, &other_end)) {
695 key.offset = other_start;
696 del_slot = path->slots[0];
697 del_nr++;
698 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
699 0, root->root_key.objectid,
700 inode->i_ino, orig_offset);
701 BUG_ON(ret);
702 }
703 split_end = 0;
704 if (del_nr == 0) {
705 btrfs_set_file_extent_type(leaf, fi,
706 BTRFS_FILE_EXTENT_REG);
707 goto done;
708 } 605 }
606 BUG_ON(ret < 0);
709 607
710 fi = btrfs_item_ptr(leaf, del_slot - 1, 608 leaf = path->nodes[0];
609 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
711 struct btrfs_file_extent_item); 610 struct btrfs_file_extent_item);
712 btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
713 btrfs_set_file_extent_num_bytes(leaf, fi, 611 btrfs_set_file_extent_num_bytes(leaf, fi,
714 extent_end - key.offset); 612 split - key.offset);
613
614 fi = btrfs_item_ptr(leaf, path->slots[0],
615 struct btrfs_file_extent_item);
616
617 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
618 btrfs_set_file_extent_num_bytes(leaf, fi,
619 extent_end - split);
715 btrfs_mark_buffer_dirty(leaf); 620 btrfs_mark_buffer_dirty(leaf);
716 621
717 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 622 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
623 root->root_key.objectid,
624 inode->i_ino, orig_offset);
718 BUG_ON(ret); 625 BUG_ON(ret);
719 goto release;
720 } else if (split == start) {
721 if (locked_end < extent_end) {
722 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
723 locked_end, extent_end - 1, GFP_NOFS);
724 if (!ret) {
725 btrfs_release_path(root, path);
726 lock_extent(&BTRFS_I(inode)->io_tree,
727 locked_end, extent_end - 1, GFP_NOFS);
728 locked_end = extent_end;
729 goto again;
730 }
731 locked_end = extent_end;
732 }
733 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
734 } else {
735 BUG_ON(key.offset != start);
736 key.offset = split;
737 btrfs_set_file_extent_offset(leaf, fi, key.offset -
738 orig_offset);
739 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
740 btrfs_set_item_key_safe(trans, root, path, &key);
741 extent_end = split;
742 }
743 626
744 if (extent_end == end) { 627 if (split == start) {
745 split_end = 0; 628 key.offset = start;
746 extent_type = BTRFS_FILE_EXTENT_REG; 629 } else {
747 } 630 BUG_ON(start != key.offset);
748 if (extent_end == end && split == start) {
749 other_start = end;
750 other_end = 0;
751 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
752 bytenr, &other_start, &other_end)) {
753 path->slots[0]++;
754 fi = btrfs_item_ptr(leaf, path->slots[0],
755 struct btrfs_file_extent_item);
756 key.offset = split;
757 btrfs_set_item_key_safe(trans, root, path, &key);
758 btrfs_set_file_extent_offset(leaf, fi, key.offset -
759 orig_offset);
760 btrfs_set_file_extent_num_bytes(leaf, fi,
761 other_end - split);
762 goto done;
763 }
764 }
765 if (extent_end == end && split == end) {
766 other_start = 0;
767 other_end = start;
768 if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
769 bytenr, &other_start, &other_end)) {
770 path->slots[0]--; 631 path->slots[0]--;
771 fi = btrfs_item_ptr(leaf, path->slots[0], 632 extent_end = end;
772 struct btrfs_file_extent_item);
773 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
774 other_start);
775 goto done;
776 } 633 }
777 } 634 }
778 635
779 btrfs_mark_buffer_dirty(leaf);
780
781 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
782 root->root_key.objectid,
783 inode->i_ino, orig_offset);
784 BUG_ON(ret);
785 btrfs_release_path(root, path);
786
787 key.offset = start;
788 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
789 BUG_ON(ret);
790
791 leaf = path->nodes[0];
792 fi = btrfs_item_ptr(leaf, path->slots[0], 636 fi = btrfs_item_ptr(leaf, path->slots[0],
793 struct btrfs_file_extent_item); 637 struct btrfs_file_extent_item);
794 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
795 btrfs_set_file_extent_type(leaf, fi, extent_type);
796 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
797 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
798 btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
799 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
800 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
801 btrfs_set_file_extent_compression(leaf, fi, 0);
802 btrfs_set_file_extent_encryption(leaf, fi, 0);
803 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
804done:
805 btrfs_mark_buffer_dirty(leaf);
806 638
807release: 639 other_start = end;
808 btrfs_release_path(root, path); 640 other_end = 0;
809 if (split_end && split == start) { 641 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
810 split = end; 642 bytenr, &other_start, &other_end)) {
811 goto again; 643 extent_end = other_end;
644 del_slot = path->slots[0] + 1;
645 del_nr++;
646 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
647 0, root->root_key.objectid,
648 inode->i_ino, orig_offset);
649 BUG_ON(ret);
812 } 650 }
813 if (locked_end > end) { 651 other_start = 0;
814 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, 652 other_end = start;
815 GFP_NOFS); 653 if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
654 bytenr, &other_start, &other_end)) {
655 key.offset = other_start;
656 del_slot = path->slots[0];
657 del_nr++;
658 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
659 0, root->root_key.objectid,
660 inode->i_ino, orig_offset);
661 BUG_ON(ret);
816 } 662 }
663 if (del_nr == 0) {
664 btrfs_set_file_extent_type(leaf, fi,
665 BTRFS_FILE_EXTENT_REG);
666 btrfs_mark_buffer_dirty(leaf);
667 goto out;
668 }
669
670 fi = btrfs_item_ptr(leaf, del_slot - 1,
671 struct btrfs_file_extent_item);
672 btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
673 btrfs_set_file_extent_num_bytes(leaf, fi,
674 extent_end - key.offset);
675 btrfs_mark_buffer_dirty(leaf);
676
677 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
678 BUG_ON(ret);
679out:
817 btrfs_free_path(path); 680 btrfs_free_path(path);
818 return 0; 681 return 0;
819} 682}
@@ -878,7 +741,8 @@ again:
878 btrfs_put_ordered_extent(ordered); 741 btrfs_put_ordered_extent(ordered);
879 742
880 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, 743 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
881 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, 744 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
745 EXTENT_DO_ACCOUNTING,
882 GFP_NOFS); 746 GFP_NOFS);
883 unlock_extent(&BTRFS_I(inode)->io_tree, 747 unlock_extent(&BTRFS_I(inode)->io_tree,
884 start_pos, last_pos - 1, GFP_NOFS); 748 start_pos, last_pos - 1, GFP_NOFS);
@@ -908,7 +772,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
908 unsigned long last_index; 772 unsigned long last_index;
909 int will_write; 773 int will_write;
910 774
911 will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) || 775 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
912 (file->f_flags & O_DIRECT)); 776 (file->f_flags & O_DIRECT));
913 777
914 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, 778 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
@@ -1075,7 +939,7 @@ out_nolock:
1075 if (err) 939 if (err)
1076 num_written = err; 940 num_written = err;
1077 941
1078 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { 942 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1079 trans = btrfs_start_transaction(root, 1); 943 trans = btrfs_start_transaction(root, 1);
1080 ret = btrfs_log_dentry_safe(trans, root, 944 ret = btrfs_log_dentry_safe(trans, root,
1081 file->f_dentry); 945 file->f_dentry);
@@ -1085,8 +949,10 @@ out_nolock:
1085 btrfs_end_transaction(trans, root); 949 btrfs_end_transaction(trans, root);
1086 else 950 else
1087 btrfs_commit_transaction(trans, root); 951 btrfs_commit_transaction(trans, root);
1088 } else { 952 } else if (ret != BTRFS_NO_LOG_SYNC) {
1089 btrfs_commit_transaction(trans, root); 953 btrfs_commit_transaction(trans, root);
954 } else {
955 btrfs_end_transaction(trans, root);
1090 } 956 }
1091 } 957 }
1092 if (file->f_flags & O_DIRECT) { 958 if (file->f_flags & O_DIRECT) {
@@ -1136,6 +1002,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1136 int ret = 0; 1002 int ret = 0;
1137 struct btrfs_trans_handle *trans; 1003 struct btrfs_trans_handle *trans;
1138 1004
1005
1006 /* we wait first, since the writeback may change the inode */
1007 root->log_batch++;
1008 /* the VFS called filemap_fdatawrite for us */
1009 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1010 root->log_batch++;
1011
1139 /* 1012 /*
1140 * check the transaction that last modified this inode 1013 * check the transaction that last modified this inode
1141 * and see if its already been committed 1014 * and see if its already been committed
@@ -1143,6 +1016,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1143 if (!BTRFS_I(inode)->last_trans) 1016 if (!BTRFS_I(inode)->last_trans)
1144 goto out; 1017 goto out;
1145 1018
1019 /*
1020 * if the last transaction that changed this file was before
1021 * the current transaction, we can bail out now without any
1022 * syncing
1023 */
1146 mutex_lock(&root->fs_info->trans_mutex); 1024 mutex_lock(&root->fs_info->trans_mutex);
1147 if (BTRFS_I(inode)->last_trans <= 1025 if (BTRFS_I(inode)->last_trans <=
1148 root->fs_info->last_trans_committed) { 1026 root->fs_info->last_trans_committed) {
@@ -1152,13 +1030,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1152 } 1030 }
1153 mutex_unlock(&root->fs_info->trans_mutex); 1031 mutex_unlock(&root->fs_info->trans_mutex);
1154 1032
1155 root->log_batch++;
1156 filemap_fdatawrite(inode->i_mapping);
1157 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1158 root->log_batch++;
1159
1160 if (datasync && !(inode->i_state & I_DIRTY_PAGES))
1161 goto out;
1162 /* 1033 /*
1163 * ok we haven't committed the transaction yet, lets do a commit 1034 * ok we haven't committed the transaction yet, lets do a commit
1164 */ 1035 */
@@ -1187,14 +1058,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1187 */ 1058 */
1188 mutex_unlock(&dentry->d_inode->i_mutex); 1059 mutex_unlock(&dentry->d_inode->i_mutex);
1189 1060
1190 if (ret > 0) { 1061 if (ret != BTRFS_NO_LOG_SYNC) {
1191 ret = btrfs_commit_transaction(trans, root); 1062 if (ret > 0) {
1192 } else {
1193 ret = btrfs_sync_log(trans, root);
1194 if (ret == 0)
1195 ret = btrfs_end_transaction(trans, root);
1196 else
1197 ret = btrfs_commit_transaction(trans, root); 1063 ret = btrfs_commit_transaction(trans, root);
1064 } else {
1065 ret = btrfs_sync_log(trans, root);
1066 if (ret == 0)
1067 ret = btrfs_end_transaction(trans, root);
1068 else
1069 ret = btrfs_commit_transaction(trans, root);
1070 }
1071 } else {
1072 ret = btrfs_end_transaction(trans, root);
1198 } 1073 }
1199 mutex_lock(&dentry->d_inode->i_mutex); 1074 mutex_lock(&dentry->d_inode->i_mutex);
1200out: 1075out:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5c2caad76212..cb2849f03251 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1296,7 +1296,7 @@ again:
1296 window_start = entry->offset; 1296 window_start = entry->offset;
1297 window_free = entry->bytes; 1297 window_free = entry->bytes;
1298 last = entry; 1298 last = entry;
1299 max_extent = 0; 1299 max_extent = entry->bytes;
1300 } else { 1300 } else {
1301 last = next; 1301 last = next;
1302 window_free += next->bytes; 1302 window_free += next->bytes;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 112e5aa85892..5440bab23635 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -88,13 +88,14 @@ static noinline int cow_file_range(struct inode *inode,
88 u64 start, u64 end, int *page_started, 88 u64 start, u64 end, int *page_started,
89 unsigned long *nr_written, int unlock); 89 unsigned long *nr_written, int unlock);
90 90
91static int btrfs_init_inode_security(struct inode *inode, struct inode *dir) 91static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
92 struct inode *inode, struct inode *dir)
92{ 93{
93 int err; 94 int err;
94 95
95 err = btrfs_init_acl(inode, dir); 96 err = btrfs_init_acl(trans, inode, dir);
96 if (!err) 97 if (!err)
97 err = btrfs_xattr_security_init(inode, dir); 98 err = btrfs_xattr_security_init(trans, inode, dir);
98 return err; 99 return err;
99} 100}
100 101
@@ -188,8 +189,18 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
188 btrfs_mark_buffer_dirty(leaf); 189 btrfs_mark_buffer_dirty(leaf);
189 btrfs_free_path(path); 190 btrfs_free_path(path);
190 191
192 /*
193 * we're an inline extent, so nobody can
194 * extend the file past i_size without locking
195 * a page we already have locked.
196 *
197 * We must do any isize and inode updates
198 * before we unlock the pages. Otherwise we
199 * could end up racing with unlink.
200 */
191 BTRFS_I(inode)->disk_i_size = inode->i_size; 201 BTRFS_I(inode)->disk_i_size = inode->i_size;
192 btrfs_update_inode(trans, root, inode); 202 btrfs_update_inode(trans, root, inode);
203
193 return 0; 204 return 0;
194fail: 205fail:
195 btrfs_free_path(path); 206 btrfs_free_path(path);
@@ -230,8 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
230 return 1; 241 return 1;
231 } 242 }
232 243
233 ret = btrfs_drop_extents(trans, root, inode, start, 244 ret = btrfs_drop_extents(trans, inode, start, aligned_end,
234 aligned_end, aligned_end, start,
235 &hint_byte, 1); 245 &hint_byte, 1);
236 BUG_ON(ret); 246 BUG_ON(ret);
237 247
@@ -416,7 +426,6 @@ again:
416 start, end, 426 start, end,
417 total_compressed, pages); 427 total_compressed, pages);
418 } 428 }
419 btrfs_end_transaction(trans, root);
420 if (ret == 0) { 429 if (ret == 0) {
421 /* 430 /*
422 * inline extent creation worked, we don't need 431 * inline extent creation worked, we don't need
@@ -424,12 +433,17 @@ again:
424 * and free up our temp pages. 433 * and free up our temp pages.
425 */ 434 */
426 extent_clear_unlock_delalloc(inode, 435 extent_clear_unlock_delalloc(inode,
427 &BTRFS_I(inode)->io_tree, 436 &BTRFS_I(inode)->io_tree,
428 start, end, NULL, 1, 0, 437 start, end, NULL,
429 0, 1, 1, 1, 0); 438 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
430 ret = 0; 439 EXTENT_CLEAR_DELALLOC |
440 EXTENT_CLEAR_ACCOUNTING |
441 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
442
443 btrfs_end_transaction(trans, root);
431 goto free_pages_out; 444 goto free_pages_out;
432 } 445 }
446 btrfs_end_transaction(trans, root);
433 } 447 }
434 448
435 if (will_compress) { 449 if (will_compress) {
@@ -535,12 +549,11 @@ static noinline int submit_compressed_extents(struct inode *inode,
535 struct btrfs_root *root = BTRFS_I(inode)->root; 549 struct btrfs_root *root = BTRFS_I(inode)->root;
536 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 550 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
537 struct extent_io_tree *io_tree; 551 struct extent_io_tree *io_tree;
538 int ret; 552 int ret = 0;
539 553
540 if (list_empty(&async_cow->extents)) 554 if (list_empty(&async_cow->extents))
541 return 0; 555 return 0;
542 556
543 trans = btrfs_join_transaction(root, 1);
544 557
545 while (!list_empty(&async_cow->extents)) { 558 while (!list_empty(&async_cow->extents)) {
546 async_extent = list_entry(async_cow->extents.next, 559 async_extent = list_entry(async_cow->extents.next,
@@ -549,6 +562,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
549 562
550 io_tree = &BTRFS_I(inode)->io_tree; 563 io_tree = &BTRFS_I(inode)->io_tree;
551 564
565retry:
552 /* did the compression code fall back to uncompressed IO? */ 566 /* did the compression code fall back to uncompressed IO? */
553 if (!async_extent->pages) { 567 if (!async_extent->pages) {
554 int page_started = 0; 568 int page_started = 0;
@@ -559,11 +573,11 @@ static noinline int submit_compressed_extents(struct inode *inode,
559 async_extent->ram_size - 1, GFP_NOFS); 573 async_extent->ram_size - 1, GFP_NOFS);
560 574
561 /* allocate blocks */ 575 /* allocate blocks */
562 cow_file_range(inode, async_cow->locked_page, 576 ret = cow_file_range(inode, async_cow->locked_page,
563 async_extent->start, 577 async_extent->start,
564 async_extent->start + 578 async_extent->start +
565 async_extent->ram_size - 1, 579 async_extent->ram_size - 1,
566 &page_started, &nr_written, 0); 580 &page_started, &nr_written, 0);
567 581
568 /* 582 /*
569 * if page_started, cow_file_range inserted an 583 * if page_started, cow_file_range inserted an
@@ -571,7 +585,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
571 * and IO for us. Otherwise, we need to submit 585 * and IO for us. Otherwise, we need to submit
572 * all those pages down to the drive. 586 * all those pages down to the drive.
573 */ 587 */
574 if (!page_started) 588 if (!page_started && !ret)
575 extent_write_locked_range(io_tree, 589 extent_write_locked_range(io_tree,
576 inode, async_extent->start, 590 inode, async_extent->start,
577 async_extent->start + 591 async_extent->start +
@@ -586,6 +600,30 @@ static noinline int submit_compressed_extents(struct inode *inode,
586 lock_extent(io_tree, async_extent->start, 600 lock_extent(io_tree, async_extent->start,
587 async_extent->start + async_extent->ram_size - 1, 601 async_extent->start + async_extent->ram_size - 1,
588 GFP_NOFS); 602 GFP_NOFS);
603
604 trans = btrfs_join_transaction(root, 1);
605 ret = btrfs_reserve_extent(trans, root,
606 async_extent->compressed_size,
607 async_extent->compressed_size,
608 0, alloc_hint,
609 (u64)-1, &ins, 1);
610 btrfs_end_transaction(trans, root);
611
612 if (ret) {
613 int i;
614 for (i = 0; i < async_extent->nr_pages; i++) {
615 WARN_ON(async_extent->pages[i]->mapping);
616 page_cache_release(async_extent->pages[i]);
617 }
618 kfree(async_extent->pages);
619 async_extent->nr_pages = 0;
620 async_extent->pages = NULL;
621 unlock_extent(io_tree, async_extent->start,
622 async_extent->start +
623 async_extent->ram_size - 1, GFP_NOFS);
624 goto retry;
625 }
626
589 /* 627 /*
590 * here we're doing allocation and writeback of the 628 * here we're doing allocation and writeback of the
591 * compressed pages 629 * compressed pages
@@ -594,12 +632,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
594 async_extent->start + 632 async_extent->start +
595 async_extent->ram_size - 1, 0); 633 async_extent->ram_size - 1, 0);
596 634
597 ret = btrfs_reserve_extent(trans, root,
598 async_extent->compressed_size,
599 async_extent->compressed_size,
600 0, alloc_hint,
601 (u64)-1, &ins, 1);
602 BUG_ON(ret);
603 em = alloc_extent_map(GFP_NOFS); 635 em = alloc_extent_map(GFP_NOFS);
604 em->start = async_extent->start; 636 em->start = async_extent->start;
605 em->len = async_extent->ram_size; 637 em->len = async_extent->ram_size;
@@ -631,17 +663,18 @@ static noinline int submit_compressed_extents(struct inode *inode,
631 BTRFS_ORDERED_COMPRESSED); 663 BTRFS_ORDERED_COMPRESSED);
632 BUG_ON(ret); 664 BUG_ON(ret);
633 665
634 btrfs_end_transaction(trans, root);
635
636 /* 666 /*
637 * clear dirty, set writeback and unlock the pages. 667 * clear dirty, set writeback and unlock the pages.
638 */ 668 */
639 extent_clear_unlock_delalloc(inode, 669 extent_clear_unlock_delalloc(inode,
640 &BTRFS_I(inode)->io_tree, 670 &BTRFS_I(inode)->io_tree,
641 async_extent->start, 671 async_extent->start,
642 async_extent->start + 672 async_extent->start +
643 async_extent->ram_size - 1, 673 async_extent->ram_size - 1,
644 NULL, 1, 1, 0, 1, 1, 0, 0); 674 NULL, EXTENT_CLEAR_UNLOCK_PAGE |
675 EXTENT_CLEAR_UNLOCK |
676 EXTENT_CLEAR_DELALLOC |
677 EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
645 678
646 ret = btrfs_submit_compressed_write(inode, 679 ret = btrfs_submit_compressed_write(inode,
647 async_extent->start, 680 async_extent->start,
@@ -651,13 +684,11 @@ static noinline int submit_compressed_extents(struct inode *inode,
651 async_extent->nr_pages); 684 async_extent->nr_pages);
652 685
653 BUG_ON(ret); 686 BUG_ON(ret);
654 trans = btrfs_join_transaction(root, 1);
655 alloc_hint = ins.objectid + ins.offset; 687 alloc_hint = ins.objectid + ins.offset;
656 kfree(async_extent); 688 kfree(async_extent);
657 cond_resched(); 689 cond_resched();
658 } 690 }
659 691
660 btrfs_end_transaction(trans, root);
661 return 0; 692 return 0;
662} 693}
663 694
@@ -712,9 +743,16 @@ static noinline int cow_file_range(struct inode *inode,
712 start, end, 0, NULL); 743 start, end, 0, NULL);
713 if (ret == 0) { 744 if (ret == 0) {
714 extent_clear_unlock_delalloc(inode, 745 extent_clear_unlock_delalloc(inode,
715 &BTRFS_I(inode)->io_tree, 746 &BTRFS_I(inode)->io_tree,
716 start, end, NULL, 1, 1, 747 start, end, NULL,
717 1, 1, 1, 1, 0); 748 EXTENT_CLEAR_UNLOCK_PAGE |
749 EXTENT_CLEAR_UNLOCK |
750 EXTENT_CLEAR_DELALLOC |
751 EXTENT_CLEAR_ACCOUNTING |
752 EXTENT_CLEAR_DIRTY |
753 EXTENT_SET_WRITEBACK |
754 EXTENT_END_WRITEBACK);
755
718 *nr_written = *nr_written + 756 *nr_written = *nr_written +
719 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; 757 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
720 *page_started = 1; 758 *page_started = 1;
@@ -731,13 +769,29 @@ static noinline int cow_file_range(struct inode *inode,
731 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, 769 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
732 start, num_bytes); 770 start, num_bytes);
733 if (em) { 771 if (em) {
734 alloc_hint = em->block_start; 772 /*
735 free_extent_map(em); 773 * if block start isn't an actual block number then find the
774 * first block in this inode and use that as a hint. If that
775 * block is also bogus then just don't worry about it.
776 */
777 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
778 free_extent_map(em);
779 em = search_extent_mapping(em_tree, 0, 0);
780 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
781 alloc_hint = em->block_start;
782 if (em)
783 free_extent_map(em);
784 } else {
785 alloc_hint = em->block_start;
786 free_extent_map(em);
787 }
736 } 788 }
737 read_unlock(&BTRFS_I(inode)->extent_tree.lock); 789 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
738 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 790 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
739 791
740 while (disk_num_bytes > 0) { 792 while (disk_num_bytes > 0) {
793 unsigned long op;
794
741 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 795 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
742 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 796 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
743 root->sectorsize, 0, alloc_hint, 797 root->sectorsize, 0, alloc_hint,
@@ -789,10 +843,13 @@ static noinline int cow_file_range(struct inode *inode,
789 * Do set the Private2 bit so we know this page was properly 843 * Do set the Private2 bit so we know this page was properly
790 * setup for writepage 844 * setup for writepage
791 */ 845 */
846 op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
847 op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
848 EXTENT_SET_PRIVATE2;
849
792 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 850 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
793 start, start + ram_size - 1, 851 start, start + ram_size - 1,
794 locked_page, unlock, 1, 852 locked_page, op);
795 1, 0, 0, 0, 1);
796 disk_num_bytes -= cur_alloc_size; 853 disk_num_bytes -= cur_alloc_size;
797 num_bytes -= cur_alloc_size; 854 num_bytes -= cur_alloc_size;
798 alloc_hint = ins.objectid + ins.offset; 855 alloc_hint = ins.objectid + ins.offset;
@@ -864,8 +921,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
864 u64 cur_end; 921 u64 cur_end;
865 int limit = 10 * 1024 * 1042; 922 int limit = 10 * 1024 * 1042;
866 923
867 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | 924 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
868 EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS); 925 1, 0, NULL, GFP_NOFS);
869 while (start < end) { 926 while (start < end) {
870 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); 927 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
871 async_cow->inode = inode; 928 async_cow->inode = inode;
@@ -1006,6 +1063,7 @@ next_slot:
1006 1063
1007 if (found_key.offset > cur_offset) { 1064 if (found_key.offset > cur_offset) {
1008 extent_end = found_key.offset; 1065 extent_end = found_key.offset;
1066 extent_type = 0;
1009 goto out_check; 1067 goto out_check;
1010 } 1068 }
1011 1069
@@ -1112,8 +1170,10 @@ out_check:
1112 BUG_ON(ret); 1170 BUG_ON(ret);
1113 1171
1114 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1172 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1115 cur_offset, cur_offset + num_bytes - 1, 1173 cur_offset, cur_offset + num_bytes - 1,
1116 locked_page, 1, 1, 1, 0, 0, 0, 1); 1174 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
1175 EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
1176 EXTENT_SET_PRIVATE2);
1117 cur_offset = extent_end; 1177 cur_offset = extent_end;
1118 if (cur_offset > end) 1178 if (cur_offset > end)
1119 break; 1179 break;
@@ -1178,15 +1238,17 @@ static int btrfs_split_extent_hook(struct inode *inode,
1178 root->fs_info->max_extent); 1238 root->fs_info->max_extent);
1179 1239
1180 /* 1240 /*
1181 * if we break a large extent up then leave delalloc_extents be, 1241 * if we break a large extent up then leave oustanding_extents
1182 * since we've already accounted for the large extent. 1242 * be, since we've already accounted for the large extent.
1183 */ 1243 */
1184 if (div64_u64(new_size + root->fs_info->max_extent - 1, 1244 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1185 root->fs_info->max_extent) < num_extents) 1245 root->fs_info->max_extent) < num_extents)
1186 return 0; 1246 return 0;
1187 } 1247 }
1188 1248
1189 BTRFS_I(inode)->delalloc_extents++; 1249 spin_lock(&BTRFS_I(inode)->accounting_lock);
1250 BTRFS_I(inode)->outstanding_extents++;
1251 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1190 1252
1191 return 0; 1253 return 0;
1192} 1254}
@@ -1217,7 +1279,9 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1217 1279
1218 /* we're not bigger than the max, unreserve the space and go */ 1280 /* we're not bigger than the max, unreserve the space and go */
1219 if (new_size <= root->fs_info->max_extent) { 1281 if (new_size <= root->fs_info->max_extent) {
1220 BTRFS_I(inode)->delalloc_extents--; 1282 spin_lock(&BTRFS_I(inode)->accounting_lock);
1283 BTRFS_I(inode)->outstanding_extents--;
1284 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1221 return 0; 1285 return 0;
1222 } 1286 }
1223 1287
@@ -1231,7 +1295,9 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1231 root->fs_info->max_extent) > num_extents) 1295 root->fs_info->max_extent) > num_extents)
1232 return 0; 1296 return 0;
1233 1297
1234 BTRFS_I(inode)->delalloc_extents--; 1298 spin_lock(&BTRFS_I(inode)->accounting_lock);
1299 BTRFS_I(inode)->outstanding_extents--;
1300 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1235 1301
1236 return 0; 1302 return 0;
1237} 1303}
@@ -1253,7 +1319,9 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1253 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1319 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1254 struct btrfs_root *root = BTRFS_I(inode)->root; 1320 struct btrfs_root *root = BTRFS_I(inode)->root;
1255 1321
1256 BTRFS_I(inode)->delalloc_extents++; 1322 spin_lock(&BTRFS_I(inode)->accounting_lock);
1323 BTRFS_I(inode)->outstanding_extents++;
1324 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1257 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1325 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1258 spin_lock(&root->fs_info->delalloc_lock); 1326 spin_lock(&root->fs_info->delalloc_lock);
1259 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1327 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1281,8 +1349,12 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1281 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1349 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1282 struct btrfs_root *root = BTRFS_I(inode)->root; 1350 struct btrfs_root *root = BTRFS_I(inode)->root;
1283 1351
1284 BTRFS_I(inode)->delalloc_extents--; 1352 if (bits & EXTENT_DO_ACCOUNTING) {
1285 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1353 spin_lock(&BTRFS_I(inode)->accounting_lock);
1354 BTRFS_I(inode)->outstanding_extents--;
1355 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1356 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1357 }
1286 1358
1287 spin_lock(&root->fs_info->delalloc_lock); 1359 spin_lock(&root->fs_info->delalloc_lock);
1288 if (state->end - state->start + 1 > 1360 if (state->end - state->start + 1 >
@@ -1535,7 +1607,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1535 struct inode *inode, u64 file_pos, 1607 struct inode *inode, u64 file_pos,
1536 u64 disk_bytenr, u64 disk_num_bytes, 1608 u64 disk_bytenr, u64 disk_num_bytes,
1537 u64 num_bytes, u64 ram_bytes, 1609 u64 num_bytes, u64 ram_bytes,
1538 u64 locked_end,
1539 u8 compression, u8 encryption, 1610 u8 compression, u8 encryption,
1540 u16 other_encoding, int extent_type) 1611 u16 other_encoding, int extent_type)
1541{ 1612{
@@ -1561,9 +1632,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1561 * the caller is expected to unpin it and allow it to be merged 1632 * the caller is expected to unpin it and allow it to be merged
1562 * with the others. 1633 * with the others.
1563 */ 1634 */
1564 ret = btrfs_drop_extents(trans, root, inode, file_pos, 1635 ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
1565 file_pos + num_bytes, locked_end, 1636 &hint, 0);
1566 file_pos, &hint, 0);
1567 BUG_ON(ret); 1637 BUG_ON(ret);
1568 1638
1569 ins.objectid = inode->i_ino; 1639 ins.objectid = inode->i_ino;
@@ -1669,23 +1739,32 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1669 } 1739 }
1670 } 1740 }
1671 1741
1672 trans = btrfs_join_transaction(root, 1);
1673
1674 if (!ordered_extent) 1742 if (!ordered_extent)
1675 ordered_extent = btrfs_lookup_ordered_extent(inode, start); 1743 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1676 BUG_ON(!ordered_extent); 1744 BUG_ON(!ordered_extent);
1677 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) 1745 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1678 goto nocow; 1746 BUG_ON(!list_empty(&ordered_extent->list));
1747 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1748 if (!ret) {
1749 trans = btrfs_join_transaction(root, 1);
1750 ret = btrfs_update_inode(trans, root, inode);
1751 BUG_ON(ret);
1752 btrfs_end_transaction(trans, root);
1753 }
1754 goto out;
1755 }
1679 1756
1680 lock_extent(io_tree, ordered_extent->file_offset, 1757 lock_extent(io_tree, ordered_extent->file_offset,
1681 ordered_extent->file_offset + ordered_extent->len - 1, 1758 ordered_extent->file_offset + ordered_extent->len - 1,
1682 GFP_NOFS); 1759 GFP_NOFS);
1683 1760
1761 trans = btrfs_join_transaction(root, 1);
1762
1684 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1763 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1685 compressed = 1; 1764 compressed = 1;
1686 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1765 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1687 BUG_ON(compressed); 1766 BUG_ON(compressed);
1688 ret = btrfs_mark_extent_written(trans, root, inode, 1767 ret = btrfs_mark_extent_written(trans, inode,
1689 ordered_extent->file_offset, 1768 ordered_extent->file_offset,
1690 ordered_extent->file_offset + 1769 ordered_extent->file_offset +
1691 ordered_extent->len); 1770 ordered_extent->len);
@@ -1697,8 +1776,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1697 ordered_extent->disk_len, 1776 ordered_extent->disk_len,
1698 ordered_extent->len, 1777 ordered_extent->len,
1699 ordered_extent->len, 1778 ordered_extent->len,
1700 ordered_extent->file_offset +
1701 ordered_extent->len,
1702 compressed, 0, 0, 1779 compressed, 0, 0,
1703 BTRFS_FILE_EXTENT_REG); 1780 BTRFS_FILE_EXTENT_REG);
1704 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1781 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
@@ -1709,22 +1786,20 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1709 unlock_extent(io_tree, ordered_extent->file_offset, 1786 unlock_extent(io_tree, ordered_extent->file_offset,
1710 ordered_extent->file_offset + ordered_extent->len - 1, 1787 ordered_extent->file_offset + ordered_extent->len - 1,
1711 GFP_NOFS); 1788 GFP_NOFS);
1712nocow:
1713 add_pending_csums(trans, inode, ordered_extent->file_offset, 1789 add_pending_csums(trans, inode, ordered_extent->file_offset,
1714 &ordered_extent->list); 1790 &ordered_extent->list);
1715 1791
1716 mutex_lock(&BTRFS_I(inode)->extent_mutex); 1792 /* this also removes the ordered extent from the tree */
1717 btrfs_ordered_update_i_size(inode, ordered_extent); 1793 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1718 btrfs_update_inode(trans, root, inode); 1794 ret = btrfs_update_inode(trans, root, inode);
1719 btrfs_remove_ordered_extent(inode, ordered_extent); 1795 BUG_ON(ret);
1720 mutex_unlock(&BTRFS_I(inode)->extent_mutex); 1796 btrfs_end_transaction(trans, root);
1721 1797out:
1722 /* once for us */ 1798 /* once for us */
1723 btrfs_put_ordered_extent(ordered_extent); 1799 btrfs_put_ordered_extent(ordered_extent);
1724 /* once for the tree */ 1800 /* once for the tree */
1725 btrfs_put_ordered_extent(ordered_extent); 1801 btrfs_put_ordered_extent(ordered_extent);
1726 1802
1727 btrfs_end_transaction(trans, root);
1728 return 0; 1803 return 0;
1729} 1804}
1730 1805
@@ -1947,6 +2022,54 @@ zeroit:
1947 return -EIO; 2022 return -EIO;
1948} 2023}
1949 2024
2025struct delayed_iput {
2026 struct list_head list;
2027 struct inode *inode;
2028};
2029
2030void btrfs_add_delayed_iput(struct inode *inode)
2031{
2032 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2033 struct delayed_iput *delayed;
2034
2035 if (atomic_add_unless(&inode->i_count, -1, 1))
2036 return;
2037
2038 delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
2039 delayed->inode = inode;
2040
2041 spin_lock(&fs_info->delayed_iput_lock);
2042 list_add_tail(&delayed->list, &fs_info->delayed_iputs);
2043 spin_unlock(&fs_info->delayed_iput_lock);
2044}
2045
2046void btrfs_run_delayed_iputs(struct btrfs_root *root)
2047{
2048 LIST_HEAD(list);
2049 struct btrfs_fs_info *fs_info = root->fs_info;
2050 struct delayed_iput *delayed;
2051 int empty;
2052
2053 spin_lock(&fs_info->delayed_iput_lock);
2054 empty = list_empty(&fs_info->delayed_iputs);
2055 spin_unlock(&fs_info->delayed_iput_lock);
2056 if (empty)
2057 return;
2058
2059 down_read(&root->fs_info->cleanup_work_sem);
2060 spin_lock(&fs_info->delayed_iput_lock);
2061 list_splice_init(&fs_info->delayed_iputs, &list);
2062 spin_unlock(&fs_info->delayed_iput_lock);
2063
2064 while (!list_empty(&list)) {
2065 delayed = list_entry(list.next, struct delayed_iput, list);
2066 list_del(&delayed->list);
2067 iput(delayed->inode);
2068 kfree(delayed);
2069 }
2070 up_read(&root->fs_info->cleanup_work_sem);
2071}
2072
1950/* 2073/*
1951 * This creates an orphan entry for the given inode in case something goes 2074 * This creates an orphan entry for the given inode in case something goes
1952 * wrong in the middle of an unlink/truncate. 2075 * wrong in the middle of an unlink/truncate.
@@ -2019,16 +2142,17 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2019 struct inode *inode; 2142 struct inode *inode;
2020 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2143 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2021 2144
2022 path = btrfs_alloc_path(); 2145 if (!xchg(&root->clean_orphans, 0))
2023 if (!path)
2024 return; 2146 return;
2147
2148 path = btrfs_alloc_path();
2149 BUG_ON(!path);
2025 path->reada = -1; 2150 path->reada = -1;
2026 2151
2027 key.objectid = BTRFS_ORPHAN_OBJECTID; 2152 key.objectid = BTRFS_ORPHAN_OBJECTID;
2028 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 2153 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
2029 key.offset = (u64)-1; 2154 key.offset = (u64)-1;
2030 2155
2031
2032 while (1) { 2156 while (1) {
2033 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2157 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2034 if (ret < 0) { 2158 if (ret < 0) {
@@ -2442,7 +2566,19 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2442 2566
2443 root = BTRFS_I(dir)->root; 2567 root = BTRFS_I(dir)->root;
2444 2568
2569 /*
2570 * 5 items for unlink inode
2571 * 1 for orphan
2572 */
2573 ret = btrfs_reserve_metadata_space(root, 6);
2574 if (ret)
2575 return ret;
2576
2445 trans = btrfs_start_transaction(root, 1); 2577 trans = btrfs_start_transaction(root, 1);
2578 if (IS_ERR(trans)) {
2579 btrfs_unreserve_metadata_space(root, 6);
2580 return PTR_ERR(trans);
2581 }
2446 2582
2447 btrfs_set_trans_block_group(trans, dir); 2583 btrfs_set_trans_block_group(trans, dir);
2448 2584
@@ -2457,6 +2593,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2457 nr = trans->blocks_used; 2593 nr = trans->blocks_used;
2458 2594
2459 btrfs_end_transaction_throttle(trans, root); 2595 btrfs_end_transaction_throttle(trans, root);
2596 btrfs_unreserve_metadata_space(root, 6);
2460 btrfs_btree_balance_dirty(root, nr); 2597 btrfs_btree_balance_dirty(root, nr);
2461 return ret; 2598 return ret;
2462} 2599}
@@ -2537,7 +2674,16 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2537 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2674 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2538 return -ENOTEMPTY; 2675 return -ENOTEMPTY;
2539 2676
2677 ret = btrfs_reserve_metadata_space(root, 5);
2678 if (ret)
2679 return ret;
2680
2540 trans = btrfs_start_transaction(root, 1); 2681 trans = btrfs_start_transaction(root, 1);
2682 if (IS_ERR(trans)) {
2683 btrfs_unreserve_metadata_space(root, 5);
2684 return PTR_ERR(trans);
2685 }
2686
2541 btrfs_set_trans_block_group(trans, dir); 2687 btrfs_set_trans_block_group(trans, dir);
2542 2688
2543 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 2689 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@ -2560,6 +2706,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2560out: 2706out:
2561 nr = trans->blocks_used; 2707 nr = trans->blocks_used;
2562 ret = btrfs_end_transaction_throttle(trans, root); 2708 ret = btrfs_end_transaction_throttle(trans, root);
2709 btrfs_unreserve_metadata_space(root, 5);
2563 btrfs_btree_balance_dirty(root, nr); 2710 btrfs_btree_balance_dirty(root, nr);
2564 2711
2565 if (ret && !err) 2712 if (ret && !err)
@@ -2750,37 +2897,40 @@ out:
2750 * min_type is the minimum key type to truncate down to. If set to 0, this 2897 * min_type is the minimum key type to truncate down to. If set to 0, this
2751 * will kill all the items on this inode, including the INODE_ITEM_KEY. 2898 * will kill all the items on this inode, including the INODE_ITEM_KEY.
2752 */ 2899 */
2753noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 2900int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2754 struct btrfs_root *root, 2901 struct btrfs_root *root,
2755 struct inode *inode, 2902 struct inode *inode,
2756 u64 new_size, u32 min_type) 2903 u64 new_size, u32 min_type)
2757{ 2904{
2758 int ret;
2759 struct btrfs_path *path; 2905 struct btrfs_path *path;
2760 struct btrfs_key key;
2761 struct btrfs_key found_key;
2762 u32 found_type = (u8)-1;
2763 struct extent_buffer *leaf; 2906 struct extent_buffer *leaf;
2764 struct btrfs_file_extent_item *fi; 2907 struct btrfs_file_extent_item *fi;
2908 struct btrfs_key key;
2909 struct btrfs_key found_key;
2765 u64 extent_start = 0; 2910 u64 extent_start = 0;
2766 u64 extent_num_bytes = 0; 2911 u64 extent_num_bytes = 0;
2767 u64 extent_offset = 0; 2912 u64 extent_offset = 0;
2768 u64 item_end = 0; 2913 u64 item_end = 0;
2914 u64 mask = root->sectorsize - 1;
2915 u32 found_type = (u8)-1;
2769 int found_extent; 2916 int found_extent;
2770 int del_item; 2917 int del_item;
2771 int pending_del_nr = 0; 2918 int pending_del_nr = 0;
2772 int pending_del_slot = 0; 2919 int pending_del_slot = 0;
2773 int extent_type = -1; 2920 int extent_type = -1;
2774 int encoding; 2921 int encoding;
2775 u64 mask = root->sectorsize - 1; 2922 int ret;
2923 int err = 0;
2924
2925 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
2776 2926
2777 if (root->ref_cows) 2927 if (root->ref_cows)
2778 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 2928 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2929
2779 path = btrfs_alloc_path(); 2930 path = btrfs_alloc_path();
2780 BUG_ON(!path); 2931 BUG_ON(!path);
2781 path->reada = -1; 2932 path->reada = -1;
2782 2933
2783 /* FIXME, add redo link to tree so we don't leak on crash */
2784 key.objectid = inode->i_ino; 2934 key.objectid = inode->i_ino;
2785 key.offset = (u64)-1; 2935 key.offset = (u64)-1;
2786 key.type = (u8)-1; 2936 key.type = (u8)-1;
@@ -2788,17 +2938,17 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2788search_again: 2938search_again:
2789 path->leave_spinning = 1; 2939 path->leave_spinning = 1;
2790 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2940 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2791 if (ret < 0) 2941 if (ret < 0) {
2792 goto error; 2942 err = ret;
2943 goto out;
2944 }
2793 2945
2794 if (ret > 0) { 2946 if (ret > 0) {
2795 /* there are no items in the tree for us to truncate, we're 2947 /* there are no items in the tree for us to truncate, we're
2796 * done 2948 * done
2797 */ 2949 */
2798 if (path->slots[0] == 0) { 2950 if (path->slots[0] == 0)
2799 ret = 0; 2951 goto out;
2800 goto error;
2801 }
2802 path->slots[0]--; 2952 path->slots[0]--;
2803 } 2953 }
2804 2954
@@ -2833,28 +2983,17 @@ search_again:
2833 } 2983 }
2834 item_end--; 2984 item_end--;
2835 } 2985 }
2836 if (item_end < new_size) { 2986 if (found_type > min_type) {
2837 if (found_type == BTRFS_DIR_ITEM_KEY) 2987 del_item = 1;
2838 found_type = BTRFS_INODE_ITEM_KEY; 2988 } else {
2839 else if (found_type == BTRFS_EXTENT_ITEM_KEY) 2989 if (item_end < new_size)
2840 found_type = BTRFS_EXTENT_DATA_KEY;
2841 else if (found_type == BTRFS_EXTENT_DATA_KEY)
2842 found_type = BTRFS_XATTR_ITEM_KEY;
2843 else if (found_type == BTRFS_XATTR_ITEM_KEY)
2844 found_type = BTRFS_INODE_REF_KEY;
2845 else if (found_type)
2846 found_type--;
2847 else
2848 break; 2990 break;
2849 btrfs_set_key_type(&key, found_type); 2991 if (found_key.offset >= new_size)
2850 goto next; 2992 del_item = 1;
2993 else
2994 del_item = 0;
2851 } 2995 }
2852 if (found_key.offset >= new_size)
2853 del_item = 1;
2854 else
2855 del_item = 0;
2856 found_extent = 0; 2996 found_extent = 0;
2857
2858 /* FIXME, shrink the extent if the ref count is only 1 */ 2997 /* FIXME, shrink the extent if the ref count is only 1 */
2859 if (found_type != BTRFS_EXTENT_DATA_KEY) 2998 if (found_type != BTRFS_EXTENT_DATA_KEY)
2860 goto delete; 2999 goto delete;
@@ -2941,42 +3080,36 @@ delete:
2941 inode->i_ino, extent_offset); 3080 inode->i_ino, extent_offset);
2942 BUG_ON(ret); 3081 BUG_ON(ret);
2943 } 3082 }
2944next:
2945 if (path->slots[0] == 0) {
2946 if (pending_del_nr)
2947 goto del_pending;
2948 btrfs_release_path(root, path);
2949 if (found_type == BTRFS_INODE_ITEM_KEY)
2950 break;
2951 goto search_again;
2952 }
2953 3083
2954 path->slots[0]--; 3084 if (found_type == BTRFS_INODE_ITEM_KEY)
2955 if (pending_del_nr && 3085 break;
2956 path->slots[0] + 1 != pending_del_slot) { 3086
2957 struct btrfs_key debug; 3087 if (path->slots[0] == 0 ||
2958del_pending: 3088 path->slots[0] != pending_del_slot) {
2959 btrfs_item_key_to_cpu(path->nodes[0], &debug, 3089 if (root->ref_cows) {
2960 pending_del_slot); 3090 err = -EAGAIN;
2961 ret = btrfs_del_items(trans, root, path, 3091 goto out;
2962 pending_del_slot, 3092 }
2963 pending_del_nr); 3093 if (pending_del_nr) {
2964 BUG_ON(ret); 3094 ret = btrfs_del_items(trans, root, path,
2965 pending_del_nr = 0; 3095 pending_del_slot,
3096 pending_del_nr);
3097 BUG_ON(ret);
3098 pending_del_nr = 0;
3099 }
2966 btrfs_release_path(root, path); 3100 btrfs_release_path(root, path);
2967 if (found_type == BTRFS_INODE_ITEM_KEY)
2968 break;
2969 goto search_again; 3101 goto search_again;
3102 } else {
3103 path->slots[0]--;
2970 } 3104 }
2971 } 3105 }
2972 ret = 0; 3106out:
2973error:
2974 if (pending_del_nr) { 3107 if (pending_del_nr) {
2975 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3108 ret = btrfs_del_items(trans, root, path, pending_del_slot,
2976 pending_del_nr); 3109 pending_del_nr);
2977 } 3110 }
2978 btrfs_free_path(path); 3111 btrfs_free_path(path);
2979 return ret; 3112 return err;
2980} 3113}
2981 3114
2982/* 3115/*
@@ -3000,12 +3133,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3000 3133
3001 if ((offset & (blocksize - 1)) == 0) 3134 if ((offset & (blocksize - 1)) == 0)
3002 goto out; 3135 goto out;
3136 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
3137 if (ret)
3138 goto out;
3139
3140 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3141 if (ret)
3142 goto out;
3003 3143
3004 ret = -ENOMEM; 3144 ret = -ENOMEM;
3005again: 3145again:
3006 page = grab_cache_page(mapping, index); 3146 page = grab_cache_page(mapping, index);
3007 if (!page) 3147 if (!page) {
3148 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
3149 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3008 goto out; 3150 goto out;
3151 }
3009 3152
3010 page_start = page_offset(page); 3153 page_start = page_offset(page);
3011 page_end = page_start + PAGE_CACHE_SIZE - 1; 3154 page_end = page_start + PAGE_CACHE_SIZE - 1;
@@ -3038,6 +3181,10 @@ again:
3038 goto again; 3181 goto again;
3039 } 3182 }
3040 3183
3184 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
3185 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
3186 GFP_NOFS);
3187
3041 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 3188 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
3042 if (ret) { 3189 if (ret) {
3043 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3190 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -3056,6 +3203,9 @@ again:
3056 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3203 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3057 3204
3058out_unlock: 3205out_unlock:
3206 if (ret)
3207 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
3208 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3059 unlock_page(page); 3209 unlock_page(page);
3060 page_cache_release(page); 3210 page_cache_release(page);
3061out: 3211out:
@@ -3079,8 +3229,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3079 if (size <= hole_start) 3229 if (size <= hole_start)
3080 return 0; 3230 return 0;
3081 3231
3082 btrfs_truncate_page(inode->i_mapping, inode->i_size);
3083
3084 while (1) { 3232 while (1) {
3085 struct btrfs_ordered_extent *ordered; 3233 struct btrfs_ordered_extent *ordered;
3086 btrfs_wait_ordered_range(inode, hole_start, 3234 btrfs_wait_ordered_range(inode, hole_start,
@@ -3093,9 +3241,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3093 btrfs_put_ordered_extent(ordered); 3241 btrfs_put_ordered_extent(ordered);
3094 } 3242 }
3095 3243
3096 trans = btrfs_start_transaction(root, 1);
3097 btrfs_set_trans_block_group(trans, inode);
3098
3099 cur_offset = hole_start; 3244 cur_offset = hole_start;
3100 while (1) { 3245 while (1) {
3101 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 3246 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
@@ -3103,40 +3248,120 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3103 BUG_ON(IS_ERR(em) || !em); 3248 BUG_ON(IS_ERR(em) || !em);
3104 last_byte = min(extent_map_end(em), block_end); 3249 last_byte = min(extent_map_end(em), block_end);
3105 last_byte = (last_byte + mask) & ~mask; 3250 last_byte = (last_byte + mask) & ~mask;
3106 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { 3251 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3107 u64 hint_byte = 0; 3252 u64 hint_byte = 0;
3108 hole_size = last_byte - cur_offset; 3253 hole_size = last_byte - cur_offset;
3109 err = btrfs_drop_extents(trans, root, inode,
3110 cur_offset,
3111 cur_offset + hole_size,
3112 block_end,
3113 cur_offset, &hint_byte, 1);
3114 if (err)
3115 break;
3116 3254
3117 err = btrfs_reserve_metadata_space(root, 1); 3255 err = btrfs_reserve_metadata_space(root, 2);
3118 if (err) 3256 if (err)
3119 break; 3257 break;
3120 3258
3259 trans = btrfs_start_transaction(root, 1);
3260 btrfs_set_trans_block_group(trans, inode);
3261
3262 err = btrfs_drop_extents(trans, inode, cur_offset,
3263 cur_offset + hole_size,
3264 &hint_byte, 1);
3265 BUG_ON(err);
3266
3121 err = btrfs_insert_file_extent(trans, root, 3267 err = btrfs_insert_file_extent(trans, root,
3122 inode->i_ino, cur_offset, 0, 3268 inode->i_ino, cur_offset, 0,
3123 0, hole_size, 0, hole_size, 3269 0, hole_size, 0, hole_size,
3124 0, 0, 0); 3270 0, 0, 0);
3271 BUG_ON(err);
3272
3125 btrfs_drop_extent_cache(inode, hole_start, 3273 btrfs_drop_extent_cache(inode, hole_start,
3126 last_byte - 1, 0); 3274 last_byte - 1, 0);
3127 btrfs_unreserve_metadata_space(root, 1); 3275
3276 btrfs_end_transaction(trans, root);
3277 btrfs_unreserve_metadata_space(root, 2);
3128 } 3278 }
3129 free_extent_map(em); 3279 free_extent_map(em);
3130 cur_offset = last_byte; 3280 cur_offset = last_byte;
3131 if (err || cur_offset >= block_end) 3281 if (cur_offset >= block_end)
3132 break; 3282 break;
3133 } 3283 }
3134 3284
3135 btrfs_end_transaction(trans, root);
3136 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3285 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
3137 return err; 3286 return err;
3138} 3287}
3139 3288
3289static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3290{
3291 struct btrfs_root *root = BTRFS_I(inode)->root;
3292 struct btrfs_trans_handle *trans;
3293 unsigned long nr;
3294 int ret;
3295
3296 if (attr->ia_size == inode->i_size)
3297 return 0;
3298
3299 if (attr->ia_size > inode->i_size) {
3300 unsigned long limit;
3301 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
3302 if (attr->ia_size > inode->i_sb->s_maxbytes)
3303 return -EFBIG;
3304 if (limit != RLIM_INFINITY && attr->ia_size > limit) {
3305 send_sig(SIGXFSZ, current, 0);
3306 return -EFBIG;
3307 }
3308 }
3309
3310 ret = btrfs_reserve_metadata_space(root, 1);
3311 if (ret)
3312 return ret;
3313
3314 trans = btrfs_start_transaction(root, 1);
3315 btrfs_set_trans_block_group(trans, inode);
3316
3317 ret = btrfs_orphan_add(trans, inode);
3318 BUG_ON(ret);
3319
3320 nr = trans->blocks_used;
3321 btrfs_end_transaction(trans, root);
3322 btrfs_unreserve_metadata_space(root, 1);
3323 btrfs_btree_balance_dirty(root, nr);
3324
3325 if (attr->ia_size > inode->i_size) {
3326 ret = btrfs_cont_expand(inode, attr->ia_size);
3327 if (ret) {
3328 btrfs_truncate(inode);
3329 return ret;
3330 }
3331
3332 i_size_write(inode, attr->ia_size);
3333 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
3334
3335 trans = btrfs_start_transaction(root, 1);
3336 btrfs_set_trans_block_group(trans, inode);
3337
3338 ret = btrfs_update_inode(trans, root, inode);
3339 BUG_ON(ret);
3340 if (inode->i_nlink > 0) {
3341 ret = btrfs_orphan_del(trans, inode);
3342 BUG_ON(ret);
3343 }
3344 nr = trans->blocks_used;
3345 btrfs_end_transaction(trans, root);
3346 btrfs_btree_balance_dirty(root, nr);
3347 return 0;
3348 }
3349
3350 /*
3351 * We're truncating a file that used to have good data down to
3352 * zero. Make sure it gets into the ordered flush list so that
3353 * any new writes get down to disk quickly.
3354 */
3355 if (attr->ia_size == 0)
3356 BTRFS_I(inode)->ordered_data_close = 1;
3357
3358 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3359 ret = vmtruncate(inode, attr->ia_size);
3360 BUG_ON(ret);
3361
3362 return 0;
3363}
3364
3140static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3365static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3141{ 3366{
3142 struct inode *inode = dentry->d_inode; 3367 struct inode *inode = dentry->d_inode;
@@ -3147,23 +3372,14 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3147 return err; 3372 return err;
3148 3373
3149 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3374 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3150 if (attr->ia_size > inode->i_size) { 3375 err = btrfs_setattr_size(inode, attr);
3151 err = btrfs_cont_expand(inode, attr->ia_size); 3376 if (err)
3152 if (err) 3377 return err;
3153 return err;
3154 } else if (inode->i_size > 0 &&
3155 attr->ia_size == 0) {
3156
3157 /* we're truncating a file that used to have good
3158 * data down to zero. Make sure it gets into
3159 * the ordered flush list so that any new writes
3160 * get down to disk quickly.
3161 */
3162 BTRFS_I(inode)->ordered_data_close = 1;
3163 }
3164 } 3378 }
3379 attr->ia_valid &= ~ATTR_SIZE;
3165 3380
3166 err = inode_setattr(inode, attr); 3381 if (attr->ia_valid)
3382 err = inode_setattr(inode, attr);
3167 3383
3168 if (!err && ((attr->ia_valid & ATTR_MODE))) 3384 if (!err && ((attr->ia_valid & ATTR_MODE)))
3169 err = btrfs_acl_chmod(inode); 3385 err = btrfs_acl_chmod(inode);
@@ -3184,36 +3400,43 @@ void btrfs_delete_inode(struct inode *inode)
3184 } 3400 }
3185 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3401 btrfs_wait_ordered_range(inode, 0, (u64)-1);
3186 3402
3403 if (root->fs_info->log_root_recovering) {
3404 BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
3405 goto no_delete;
3406 }
3407
3187 if (inode->i_nlink > 0) { 3408 if (inode->i_nlink > 0) {
3188 BUG_ON(btrfs_root_refs(&root->root_item) != 0); 3409 BUG_ON(btrfs_root_refs(&root->root_item) != 0);
3189 goto no_delete; 3410 goto no_delete;
3190 } 3411 }
3191 3412
3192 btrfs_i_size_write(inode, 0); 3413 btrfs_i_size_write(inode, 0);
3193 trans = btrfs_join_transaction(root, 1);
3194 3414
3195 btrfs_set_trans_block_group(trans, inode); 3415 while (1) {
3196 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0); 3416 trans = btrfs_start_transaction(root, 1);
3197 if (ret) { 3417 btrfs_set_trans_block_group(trans, inode);
3198 btrfs_orphan_del(NULL, inode); 3418 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3199 goto no_delete_lock;
3200 }
3201 3419
3202 btrfs_orphan_del(trans, inode); 3420 if (ret != -EAGAIN)
3421 break;
3203 3422
3204 nr = trans->blocks_used; 3423 nr = trans->blocks_used;
3205 clear_inode(inode); 3424 btrfs_end_transaction(trans, root);
3425 trans = NULL;
3426 btrfs_btree_balance_dirty(root, nr);
3427 }
3206 3428
3207 btrfs_end_transaction(trans, root); 3429 if (ret == 0) {
3208 btrfs_btree_balance_dirty(root, nr); 3430 ret = btrfs_orphan_del(trans, inode);
3209 return; 3431 BUG_ON(ret);
3432 }
3210 3433
3211no_delete_lock:
3212 nr = trans->blocks_used; 3434 nr = trans->blocks_used;
3213 btrfs_end_transaction(trans, root); 3435 btrfs_end_transaction(trans, root);
3214 btrfs_btree_balance_dirty(root, nr); 3436 btrfs_btree_balance_dirty(root, nr);
3215no_delete: 3437no_delete:
3216 clear_inode(inode); 3438 clear_inode(inode);
3439 return;
3217} 3440}
3218 3441
3219/* 3442/*
@@ -3448,6 +3671,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3448 bi->generation = 0; 3671 bi->generation = 0;
3449 bi->sequence = 0; 3672 bi->sequence = 0;
3450 bi->last_trans = 0; 3673 bi->last_trans = 0;
3674 bi->last_sub_trans = 0;
3451 bi->logged_trans = 0; 3675 bi->logged_trans = 0;
3452 bi->delalloc_bytes = 0; 3676 bi->delalloc_bytes = 0;
3453 bi->reserved_bytes = 0; 3677 bi->reserved_bytes = 0;
@@ -3465,7 +3689,6 @@ static noinline void init_btrfs_i(struct inode *inode)
3465 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); 3689 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3466 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 3690 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3467 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); 3691 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3468 mutex_init(&BTRFS_I(inode)->extent_mutex);
3469 mutex_init(&BTRFS_I(inode)->log_mutex); 3692 mutex_init(&BTRFS_I(inode)->log_mutex);
3470} 3693}
3471 3694
@@ -3591,6 +3814,13 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3591 } 3814 }
3592 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 3815 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
3593 3816
3817 if (root != sub_root) {
3818 down_read(&root->fs_info->cleanup_work_sem);
3819 if (!(inode->i_sb->s_flags & MS_RDONLY))
3820 btrfs_orphan_cleanup(sub_root);
3821 up_read(&root->fs_info->cleanup_work_sem);
3822 }
3823
3594 return inode; 3824 return inode;
3595} 3825}
3596 3826
@@ -3598,12 +3828,14 @@ static int btrfs_dentry_delete(struct dentry *dentry)
3598{ 3828{
3599 struct btrfs_root *root; 3829 struct btrfs_root *root;
3600 3830
3601 if (!dentry->d_inode) 3831 if (!dentry->d_inode && !IS_ROOT(dentry))
3602 return 0; 3832 dentry = dentry->d_parent;
3603 3833
3604 root = BTRFS_I(dentry->d_inode)->root; 3834 if (dentry->d_inode) {
3605 if (btrfs_root_refs(&root->root_item) == 0) 3835 root = BTRFS_I(dentry->d_inode)->root;
3606 return 1; 3836 if (btrfs_root_refs(&root->root_item) == 0)
3837 return 1;
3838 }
3607 return 0; 3839 return 0;
3608} 3840}
3609 3841
@@ -4113,7 +4345,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4113 if (IS_ERR(inode)) 4345 if (IS_ERR(inode))
4114 goto out_unlock; 4346 goto out_unlock;
4115 4347
4116 err = btrfs_init_inode_security(inode, dir); 4348 err = btrfs_init_inode_security(trans, inode, dir);
4117 if (err) { 4349 if (err) {
4118 drop_inode = 1; 4350 drop_inode = 1;
4119 goto out_unlock; 4351 goto out_unlock;
@@ -4184,7 +4416,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4184 if (IS_ERR(inode)) 4416 if (IS_ERR(inode))
4185 goto out_unlock; 4417 goto out_unlock;
4186 4418
4187 err = btrfs_init_inode_security(inode, dir); 4419 err = btrfs_init_inode_security(trans, inode, dir);
4188 if (err) { 4420 if (err) {
4189 drop_inode = 1; 4421 drop_inode = 1;
4190 goto out_unlock; 4422 goto out_unlock;
@@ -4230,6 +4462,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4230 if (inode->i_nlink == 0) 4462 if (inode->i_nlink == 0)
4231 return -ENOENT; 4463 return -ENOENT;
4232 4464
4465 /* do not allow sys_link's with other subvols of the same device */
4466 if (root->objectid != BTRFS_I(inode)->root->objectid)
4467 return -EPERM;
4468
4233 /* 4469 /*
4234 * 1 item for inode ref 4470 * 1 item for inode ref
4235 * 2 items for dir items 4471 * 2 items for dir items
@@ -4317,7 +4553,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4317 4553
4318 drop_on_err = 1; 4554 drop_on_err = 1;
4319 4555
4320 err = btrfs_init_inode_security(inode, dir); 4556 err = btrfs_init_inode_security(trans, inode, dir);
4321 if (err) 4557 if (err)
4322 goto out_fail; 4558 goto out_fail;
4323 4559
@@ -4808,7 +5044,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4808 */ 5044 */
4809 clear_extent_bit(tree, page_start, page_end, 5045 clear_extent_bit(tree, page_start, page_end,
4810 EXTENT_DIRTY | EXTENT_DELALLOC | 5046 EXTENT_DIRTY | EXTENT_DELALLOC |
4811 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 5047 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
5048 NULL, GFP_NOFS);
4812 /* 5049 /*
4813 * whoever cleared the private bit is responsible 5050 * whoever cleared the private bit is responsible
4814 * for the finish_ordered_io 5051 * for the finish_ordered_io
@@ -4821,8 +5058,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4821 lock_extent(tree, page_start, page_end, GFP_NOFS); 5058 lock_extent(tree, page_start, page_end, GFP_NOFS);
4822 } 5059 }
4823 clear_extent_bit(tree, page_start, page_end, 5060 clear_extent_bit(tree, page_start, page_end,
4824 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 5061 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4825 1, 1, NULL, GFP_NOFS); 5062 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
4826 __btrfs_releasepage(page, GFP_NOFS); 5063 __btrfs_releasepage(page, GFP_NOFS);
4827 5064
4828 ClearPageChecked(page); 5065 ClearPageChecked(page);
@@ -4917,7 +5154,8 @@ again:
4917 * prepare_pages in the normal write path. 5154 * prepare_pages in the normal write path.
4918 */ 5155 */
4919 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 5156 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
4920 EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS); 5157 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
5158 GFP_NOFS);
4921 5159
4922 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 5160 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
4923 if (ret) { 5161 if (ret) {
@@ -4944,7 +5182,9 @@ again:
4944 set_page_dirty(page); 5182 set_page_dirty(page);
4945 SetPageUptodate(page); 5183 SetPageUptodate(page);
4946 5184
4947 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 5185 BTRFS_I(inode)->last_trans = root->fs_info->generation;
5186 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
5187
4948 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5188 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4949 5189
4950out_unlock: 5190out_unlock:
@@ -4964,15 +5204,20 @@ static void btrfs_truncate(struct inode *inode)
4964 unsigned long nr; 5204 unsigned long nr;
4965 u64 mask = root->sectorsize - 1; 5205 u64 mask = root->sectorsize - 1;
4966 5206
4967 if (!S_ISREG(inode->i_mode)) 5207 if (!S_ISREG(inode->i_mode)) {
5208 WARN_ON(1);
4968 return; 5209 return;
4969 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 5210 }
5211
5212 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
5213 if (ret)
4970 return; 5214 return;
4971 5215
4972 btrfs_truncate_page(inode->i_mapping, inode->i_size);
4973 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 5216 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
5217 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
4974 5218
4975 trans = btrfs_start_transaction(root, 1); 5219 trans = btrfs_start_transaction(root, 1);
5220 btrfs_set_trans_block_group(trans, inode);
4976 5221
4977 /* 5222 /*
4978 * setattr is responsible for setting the ordered_data_close flag, 5223 * setattr is responsible for setting the ordered_data_close flag,
@@ -4994,21 +5239,32 @@ static void btrfs_truncate(struct inode *inode)
4994 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) 5239 if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
4995 btrfs_add_ordered_operation(trans, root, inode); 5240 btrfs_add_ordered_operation(trans, root, inode);
4996 5241
4997 btrfs_set_trans_block_group(trans, inode); 5242 while (1) {
4998 btrfs_i_size_write(inode, inode->i_size); 5243 ret = btrfs_truncate_inode_items(trans, root, inode,
5244 inode->i_size,
5245 BTRFS_EXTENT_DATA_KEY);
5246 if (ret != -EAGAIN)
5247 break;
4999 5248
5000 ret = btrfs_orphan_add(trans, inode); 5249 ret = btrfs_update_inode(trans, root, inode);
5001 if (ret) 5250 BUG_ON(ret);
5002 goto out; 5251
5003 /* FIXME, add redo link to tree so we don't leak on crash */ 5252 nr = trans->blocks_used;
5004 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 5253 btrfs_end_transaction(trans, root);
5005 BTRFS_EXTENT_DATA_KEY); 5254 btrfs_btree_balance_dirty(root, nr);
5006 btrfs_update_inode(trans, root, inode); 5255
5256 trans = btrfs_start_transaction(root, 1);
5257 btrfs_set_trans_block_group(trans, inode);
5258 }
5007 5259
5008 ret = btrfs_orphan_del(trans, inode); 5260 if (ret == 0 && inode->i_nlink > 0) {
5261 ret = btrfs_orphan_del(trans, inode);
5262 BUG_ON(ret);
5263 }
5264
5265 ret = btrfs_update_inode(trans, root, inode);
5009 BUG_ON(ret); 5266 BUG_ON(ret);
5010 5267
5011out:
5012 nr = trans->blocks_used; 5268 nr = trans->blocks_used;
5013 ret = btrfs_end_transaction_throttle(trans, root); 5269 ret = btrfs_end_transaction_throttle(trans, root);
5014 BUG_ON(ret); 5270 BUG_ON(ret);
@@ -5064,9 +5320,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
5064 if (!ei) 5320 if (!ei)
5065 return NULL; 5321 return NULL;
5066 ei->last_trans = 0; 5322 ei->last_trans = 0;
5323 ei->last_sub_trans = 0;
5067 ei->logged_trans = 0; 5324 ei->logged_trans = 0;
5068 ei->delalloc_extents = 0; 5325 ei->outstanding_extents = 0;
5069 ei->delalloc_reserved_extents = 0; 5326 ei->reserved_extents = 0;
5327 ei->root = NULL;
5328 spin_lock_init(&ei->accounting_lock);
5070 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 5329 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
5071 INIT_LIST_HEAD(&ei->i_orphan); 5330 INIT_LIST_HEAD(&ei->i_orphan);
5072 INIT_LIST_HEAD(&ei->ordered_operations); 5331 INIT_LIST_HEAD(&ei->ordered_operations);
@@ -5082,6 +5341,14 @@ void btrfs_destroy_inode(struct inode *inode)
5082 WARN_ON(inode->i_data.nrpages); 5341 WARN_ON(inode->i_data.nrpages);
5083 5342
5084 /* 5343 /*
5344 * This can happen where we create an inode, but somebody else also
5345 * created the same inode and we need to destroy the one we already
5346 * created.
5347 */
5348 if (!root)
5349 goto free;
5350
5351 /*
5085 * Make sure we're properly removed from the ordered operation 5352 * Make sure we're properly removed from the ordered operation
5086 * lists. 5353 * lists.
5087 */ 5354 */
@@ -5094,9 +5361,9 @@ void btrfs_destroy_inode(struct inode *inode)
5094 5361
5095 spin_lock(&root->list_lock); 5362 spin_lock(&root->list_lock);
5096 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 5363 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
5097 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" 5364 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
5098 " list\n", inode->i_ino); 5365 inode->i_ino);
5099 dump_stack(); 5366 list_del_init(&BTRFS_I(inode)->i_orphan);
5100 } 5367 }
5101 spin_unlock(&root->list_lock); 5368 spin_unlock(&root->list_lock);
5102 5369
@@ -5116,6 +5383,7 @@ void btrfs_destroy_inode(struct inode *inode)
5116 } 5383 }
5117 inode_tree_del(inode); 5384 inode_tree_del(inode);
5118 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 5385 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
5386free:
5119 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 5387 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
5120} 5388}
5121 5389
@@ -5221,11 +5489,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5221 return -ENOTEMPTY; 5489 return -ENOTEMPTY;
5222 5490
5223 /* 5491 /*
5224 * 2 items for dir items 5492 * We want to reserve the absolute worst case amount of items. So if
5225 * 1 item for orphan entry 5493 * both inodes are subvols and we need to unlink them then that would
5226 * 1 item for ref 5494 * require 4 item modifications, but if they are both normal inodes it
5495 * would require 5 item modifications, so we'll assume their normal
5496 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
5497 * should cover the worst case number of items we'll modify.
5227 */ 5498 */
5228 ret = btrfs_reserve_metadata_space(root, 4); 5499 ret = btrfs_reserve_metadata_space(root, 11);
5229 if (ret) 5500 if (ret)
5230 return ret; 5501 return ret;
5231 5502
@@ -5341,7 +5612,7 @@ out_fail:
5341 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5612 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5342 up_read(&root->fs_info->subvol_sem); 5613 up_read(&root->fs_info->subvol_sem);
5343 5614
5344 btrfs_unreserve_metadata_space(root, 4); 5615 btrfs_unreserve_metadata_space(root, 11);
5345 return ret; 5616 return ret;
5346} 5617}
5347 5618
@@ -5349,7 +5620,7 @@ out_fail:
5349 * some fairly slow code that needs optimization. This walks the list 5620 * some fairly slow code that needs optimization. This walks the list
5350 * of all the inodes with pending delalloc and forces them to disk. 5621 * of all the inodes with pending delalloc and forces them to disk.
5351 */ 5622 */
5352int btrfs_start_delalloc_inodes(struct btrfs_root *root) 5623int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
5353{ 5624{
5354 struct list_head *head = &root->fs_info->delalloc_inodes; 5625 struct list_head *head = &root->fs_info->delalloc_inodes;
5355 struct btrfs_inode *binode; 5626 struct btrfs_inode *binode;
@@ -5368,7 +5639,10 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
5368 spin_unlock(&root->fs_info->delalloc_lock); 5639 spin_unlock(&root->fs_info->delalloc_lock);
5369 if (inode) { 5640 if (inode) {
5370 filemap_flush(inode->i_mapping); 5641 filemap_flush(inode->i_mapping);
5371 iput(inode); 5642 if (delay_iput)
5643 btrfs_add_delayed_iput(inode);
5644 else
5645 iput(inode);
5372 } 5646 }
5373 cond_resched(); 5647 cond_resched();
5374 spin_lock(&root->fs_info->delalloc_lock); 5648 spin_lock(&root->fs_info->delalloc_lock);
@@ -5442,7 +5716,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5442 if (IS_ERR(inode)) 5716 if (IS_ERR(inode))
5443 goto out_unlock; 5717 goto out_unlock;
5444 5718
5445 err = btrfs_init_inode_security(inode, dir); 5719 err = btrfs_init_inode_security(trans, inode, dir);
5446 if (err) { 5720 if (err) {
5447 drop_inode = 1; 5721 drop_inode = 1;
5448 goto out_unlock; 5722 goto out_unlock;
@@ -5514,10 +5788,10 @@ out_fail:
5514 return err; 5788 return err;
5515} 5789}
5516 5790
5517static int prealloc_file_range(struct btrfs_trans_handle *trans, 5791static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5518 struct inode *inode, u64 start, u64 end, 5792 u64 alloc_hint, int mode)
5519 u64 locked_end, u64 alloc_hint, int mode)
5520{ 5793{
5794 struct btrfs_trans_handle *trans;
5521 struct btrfs_root *root = BTRFS_I(inode)->root; 5795 struct btrfs_root *root = BTRFS_I(inode)->root;
5522 struct btrfs_key ins; 5796 struct btrfs_key ins;
5523 u64 alloc_size; 5797 u64 alloc_size;
@@ -5528,43 +5802,56 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5528 while (num_bytes > 0) { 5802 while (num_bytes > 0) {
5529 alloc_size = min(num_bytes, root->fs_info->max_extent); 5803 alloc_size = min(num_bytes, root->fs_info->max_extent);
5530 5804
5531 ret = btrfs_reserve_metadata_space(root, 1); 5805 trans = btrfs_start_transaction(root, 1);
5532 if (ret)
5533 goto out;
5534 5806
5535 ret = btrfs_reserve_extent(trans, root, alloc_size, 5807 ret = btrfs_reserve_extent(trans, root, alloc_size,
5536 root->sectorsize, 0, alloc_hint, 5808 root->sectorsize, 0, alloc_hint,
5537 (u64)-1, &ins, 1); 5809 (u64)-1, &ins, 1);
5538 if (ret) { 5810 if (ret) {
5539 WARN_ON(1); 5811 WARN_ON(1);
5540 goto out; 5812 goto stop_trans;
5541 } 5813 }
5814
5815 ret = btrfs_reserve_metadata_space(root, 3);
5816 if (ret) {
5817 btrfs_free_reserved_extent(root, ins.objectid,
5818 ins.offset);
5819 goto stop_trans;
5820 }
5821
5542 ret = insert_reserved_file_extent(trans, inode, 5822 ret = insert_reserved_file_extent(trans, inode,
5543 cur_offset, ins.objectid, 5823 cur_offset, ins.objectid,
5544 ins.offset, ins.offset, 5824 ins.offset, ins.offset,
5545 ins.offset, locked_end, 5825 ins.offset, 0, 0, 0,
5546 0, 0, 0,
5547 BTRFS_FILE_EXTENT_PREALLOC); 5826 BTRFS_FILE_EXTENT_PREALLOC);
5548 BUG_ON(ret); 5827 BUG_ON(ret);
5549 btrfs_drop_extent_cache(inode, cur_offset, 5828 btrfs_drop_extent_cache(inode, cur_offset,
5550 cur_offset + ins.offset -1, 0); 5829 cur_offset + ins.offset -1, 0);
5830
5551 num_bytes -= ins.offset; 5831 num_bytes -= ins.offset;
5552 cur_offset += ins.offset; 5832 cur_offset += ins.offset;
5553 alloc_hint = ins.objectid + ins.offset; 5833 alloc_hint = ins.objectid + ins.offset;
5554 btrfs_unreserve_metadata_space(root, 1); 5834
5555 }
5556out:
5557 if (cur_offset > start) {
5558 inode->i_ctime = CURRENT_TIME; 5835 inode->i_ctime = CURRENT_TIME;
5559 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 5836 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5560 if (!(mode & FALLOC_FL_KEEP_SIZE) && 5837 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5561 cur_offset > i_size_read(inode)) 5838 cur_offset > inode->i_size) {
5562 btrfs_i_size_write(inode, cur_offset); 5839 i_size_write(inode, cur_offset);
5840 btrfs_ordered_update_i_size(inode, cur_offset, NULL);
5841 }
5842
5563 ret = btrfs_update_inode(trans, root, inode); 5843 ret = btrfs_update_inode(trans, root, inode);
5564 BUG_ON(ret); 5844 BUG_ON(ret);
5845
5846 btrfs_end_transaction(trans, root);
5847 btrfs_unreserve_metadata_space(root, 3);
5565 } 5848 }
5849 return ret;
5566 5850
5851stop_trans:
5852 btrfs_end_transaction(trans, root);
5567 return ret; 5853 return ret;
5854
5568} 5855}
5569 5856
5570static long btrfs_fallocate(struct inode *inode, int mode, 5857static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5578,8 +5865,6 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5578 u64 locked_end; 5865 u64 locked_end;
5579 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 5866 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
5580 struct extent_map *em; 5867 struct extent_map *em;
5581 struct btrfs_trans_handle *trans;
5582 struct btrfs_root *root;
5583 int ret; 5868 int ret;
5584 5869
5585 alloc_start = offset & ~mask; 5870 alloc_start = offset & ~mask;
@@ -5598,9 +5883,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5598 goto out; 5883 goto out;
5599 } 5884 }
5600 5885
5601 root = BTRFS_I(inode)->root; 5886 ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
5602
5603 ret = btrfs_check_data_free_space(root, inode,
5604 alloc_end - alloc_start); 5887 alloc_end - alloc_start);
5605 if (ret) 5888 if (ret)
5606 goto out; 5889 goto out;
@@ -5609,12 +5892,6 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5609 while (1) { 5892 while (1) {
5610 struct btrfs_ordered_extent *ordered; 5893 struct btrfs_ordered_extent *ordered;
5611 5894
5612 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
5613 if (!trans) {
5614 ret = -EIO;
5615 goto out_free;
5616 }
5617
5618 /* the extent lock is ordered inside the running 5895 /* the extent lock is ordered inside the running
5619 * transaction 5896 * transaction
5620 */ 5897 */
@@ -5628,8 +5905,6 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5628 btrfs_put_ordered_extent(ordered); 5905 btrfs_put_ordered_extent(ordered);
5629 unlock_extent(&BTRFS_I(inode)->io_tree, 5906 unlock_extent(&BTRFS_I(inode)->io_tree,
5630 alloc_start, locked_end, GFP_NOFS); 5907 alloc_start, locked_end, GFP_NOFS);
5631 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
5632
5633 /* 5908 /*
5634 * we can't wait on the range with the transaction 5909 * we can't wait on the range with the transaction
5635 * running or with the extent lock held 5910 * running or with the extent lock held
@@ -5650,10 +5925,12 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5650 BUG_ON(IS_ERR(em) || !em); 5925 BUG_ON(IS_ERR(em) || !em);
5651 last_byte = min(extent_map_end(em), alloc_end); 5926 last_byte = min(extent_map_end(em), alloc_end);
5652 last_byte = (last_byte + mask) & ~mask; 5927 last_byte = (last_byte + mask) & ~mask;
5653 if (em->block_start == EXTENT_MAP_HOLE) { 5928 if (em->block_start == EXTENT_MAP_HOLE ||
5654 ret = prealloc_file_range(trans, inode, cur_offset, 5929 (cur_offset >= inode->i_size &&
5655 last_byte, locked_end + 1, 5930 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5656 alloc_hint, mode); 5931 ret = prealloc_file_range(inode,
5932 cur_offset, last_byte,
5933 alloc_hint, mode);
5657 if (ret < 0) { 5934 if (ret < 0) {
5658 free_extent_map(em); 5935 free_extent_map(em);
5659 break; 5936 break;
@@ -5672,9 +5949,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5672 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5949 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5673 GFP_NOFS); 5950 GFP_NOFS);
5674 5951
5675 btrfs_end_transaction(trans, BTRFS_I(inode)->root); 5952 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
5676out_free: 5953 alloc_end - alloc_start);
5677 btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start);
5678out: 5954out:
5679 mutex_unlock(&inode->i_mutex); 5955 mutex_unlock(&inode->i_mutex);
5680 return ret; 5956 return ret;
@@ -5805,6 +6081,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
5805 .removexattr = btrfs_removexattr, 6081 .removexattr = btrfs_removexattr,
5806}; 6082};
5807 6083
5808struct dentry_operations btrfs_dentry_operations = { 6084const struct dentry_operations btrfs_dentry_operations = {
5809 .d_delete = btrfs_dentry_delete, 6085 .d_delete = btrfs_dentry_delete,
5810}; 6086};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9a780c8d0ac8..645a17927a8f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -237,7 +237,6 @@ static noinline int create_subvol(struct btrfs_root *root,
237 u64 objectid; 237 u64 objectid;
238 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 238 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
239 u64 index = 0; 239 u64 index = 0;
240 unsigned long nr = 1;
241 240
242 /* 241 /*
243 * 1 - inode item 242 * 1 - inode item
@@ -290,7 +289,7 @@ static noinline int create_subvol(struct btrfs_root *root,
290 btrfs_set_root_generation(&root_item, trans->transid); 289 btrfs_set_root_generation(&root_item, trans->transid);
291 btrfs_set_root_level(&root_item, 0); 290 btrfs_set_root_level(&root_item, 0);
292 btrfs_set_root_refs(&root_item, 1); 291 btrfs_set_root_refs(&root_item, 1);
293 btrfs_set_root_used(&root_item, 0); 292 btrfs_set_root_used(&root_item, leaf->len);
294 btrfs_set_root_last_snapshot(&root_item, 0); 293 btrfs_set_root_last_snapshot(&root_item, 0);
295 294
296 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); 295 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
@@ -342,24 +341,21 @@ static noinline int create_subvol(struct btrfs_root *root,
342 341
343 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 342 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
344fail: 343fail:
345 nr = trans->blocks_used;
346 err = btrfs_commit_transaction(trans, root); 344 err = btrfs_commit_transaction(trans, root);
347 if (err && !ret) 345 if (err && !ret)
348 ret = err; 346 ret = err;
349 347
350 btrfs_unreserve_metadata_space(root, 6); 348 btrfs_unreserve_metadata_space(root, 6);
351 btrfs_btree_balance_dirty(root, nr);
352 return ret; 349 return ret;
353} 350}
354 351
355static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 352static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
356 char *name, int namelen) 353 char *name, int namelen)
357{ 354{
355 struct inode *inode;
358 struct btrfs_pending_snapshot *pending_snapshot; 356 struct btrfs_pending_snapshot *pending_snapshot;
359 struct btrfs_trans_handle *trans; 357 struct btrfs_trans_handle *trans;
360 int ret = 0; 358 int ret;
361 int err;
362 unsigned long nr = 0;
363 359
364 if (!root->ref_cows) 360 if (!root->ref_cows)
365 return -EINVAL; 361 return -EINVAL;
@@ -372,20 +368,20 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
372 */ 368 */
373 ret = btrfs_reserve_metadata_space(root, 6); 369 ret = btrfs_reserve_metadata_space(root, 6);
374 if (ret) 370 if (ret)
375 goto fail_unlock; 371 goto fail;
376 372
377 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 373 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
378 if (!pending_snapshot) { 374 if (!pending_snapshot) {
379 ret = -ENOMEM; 375 ret = -ENOMEM;
380 btrfs_unreserve_metadata_space(root, 6); 376 btrfs_unreserve_metadata_space(root, 6);
381 goto fail_unlock; 377 goto fail;
382 } 378 }
383 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); 379 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
384 if (!pending_snapshot->name) { 380 if (!pending_snapshot->name) {
385 ret = -ENOMEM; 381 ret = -ENOMEM;
386 kfree(pending_snapshot); 382 kfree(pending_snapshot);
387 btrfs_unreserve_metadata_space(root, 6); 383 btrfs_unreserve_metadata_space(root, 6);
388 goto fail_unlock; 384 goto fail;
389 } 385 }
390 memcpy(pending_snapshot->name, name, namelen); 386 memcpy(pending_snapshot->name, name, namelen);
391 pending_snapshot->name[namelen] = '\0'; 387 pending_snapshot->name[namelen] = '\0';
@@ -395,10 +391,19 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
395 pending_snapshot->root = root; 391 pending_snapshot->root = root;
396 list_add(&pending_snapshot->list, 392 list_add(&pending_snapshot->list,
397 &trans->transaction->pending_snapshots); 393 &trans->transaction->pending_snapshots);
398 err = btrfs_commit_transaction(trans, root); 394 ret = btrfs_commit_transaction(trans, root);
395 BUG_ON(ret);
396 btrfs_unreserve_metadata_space(root, 6);
399 397
400fail_unlock: 398 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
401 btrfs_btree_balance_dirty(root, nr); 399 if (IS_ERR(inode)) {
400 ret = PTR_ERR(inode);
401 goto fail;
402 }
403 BUG_ON(!inode);
404 d_instantiate(dentry, inode);
405 ret = 0;
406fail:
402 return ret; 407 return ret;
403} 408}
404 409
@@ -830,6 +835,7 @@ out_up_write:
830out_unlock: 835out_unlock:
831 mutex_unlock(&inode->i_mutex); 836 mutex_unlock(&inode->i_mutex);
832 if (!err) { 837 if (!err) {
838 shrink_dcache_sb(root->fs_info->sb);
833 btrfs_invalidate_inodes(dest); 839 btrfs_invalidate_inodes(dest);
834 d_delete(dentry); 840 d_delete(dentry);
835 } 841 }
@@ -1026,8 +1032,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1026 BUG_ON(!trans); 1032 BUG_ON(!trans);
1027 1033
1028 /* punch hole in destination first */ 1034 /* punch hole in destination first */
1029 btrfs_drop_extents(trans, root, inode, off, off + len, 1035 btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
1030 off + len, 0, &hint_byte, 1);
1031 1036
1032 /* clone data */ 1037 /* clone data */
1033 key.objectid = src->i_ino; 1038 key.objectid = src->i_ino;
@@ -1122,8 +1127,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1122 datao += off - key.offset; 1127 datao += off - key.offset;
1123 datal -= off - key.offset; 1128 datal -= off - key.offset;
1124 } 1129 }
1125 if (key.offset + datao + datal > off + len) 1130
1126 datal = off + len - key.offset - datao; 1131 if (key.offset + datal > off + len)
1132 datal = off + len - key.offset;
1133
1127 /* disko == 0 means it's a hole */ 1134 /* disko == 0 means it's a hole */
1128 if (!disko) 1135 if (!disko)
1129 datao = 0; 1136 datao = 0;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 897fba835f89..b10a49d4bc6a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -291,21 +291,27 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
291 291
292/* 292/*
293 * remove an ordered extent from the tree. No references are dropped 293 * remove an ordered extent from the tree. No references are dropped
294 * but, anyone waiting on this extent is woken up. 294 * and you must wake_up entry->wait. You must hold the tree mutex
295 * while you call this function.
295 */ 296 */
296int btrfs_remove_ordered_extent(struct inode *inode, 297static int __btrfs_remove_ordered_extent(struct inode *inode,
297 struct btrfs_ordered_extent *entry) 298 struct btrfs_ordered_extent *entry)
298{ 299{
299 struct btrfs_ordered_inode_tree *tree; 300 struct btrfs_ordered_inode_tree *tree;
300 struct rb_node *node; 301 struct rb_node *node;
301 302
302 tree = &BTRFS_I(inode)->ordered_tree; 303 tree = &BTRFS_I(inode)->ordered_tree;
303 mutex_lock(&tree->mutex);
304 node = &entry->rb_node; 304 node = &entry->rb_node;
305 rb_erase(node, &tree->tree); 305 rb_erase(node, &tree->tree);
306 tree->last = NULL; 306 tree->last = NULL;
307 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 307 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
308 308
309 spin_lock(&BTRFS_I(inode)->accounting_lock);
310 BTRFS_I(inode)->outstanding_extents--;
311 spin_unlock(&BTRFS_I(inode)->accounting_lock);
312 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
313 inode, 1);
314
309 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 315 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
310 list_del_init(&entry->root_extent_list); 316 list_del_init(&entry->root_extent_list);
311 317
@@ -320,16 +326,34 @@ int btrfs_remove_ordered_extent(struct inode *inode,
320 } 326 }
321 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 327 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
322 328
329 return 0;
330}
331
332/*
333 * remove an ordered extent from the tree. No references are dropped
334 * but any waiters are woken.
335 */
336int btrfs_remove_ordered_extent(struct inode *inode,
337 struct btrfs_ordered_extent *entry)
338{
339 struct btrfs_ordered_inode_tree *tree;
340 int ret;
341
342 tree = &BTRFS_I(inode)->ordered_tree;
343 mutex_lock(&tree->mutex);
344 ret = __btrfs_remove_ordered_extent(inode, entry);
323 mutex_unlock(&tree->mutex); 345 mutex_unlock(&tree->mutex);
324 wake_up(&entry->wait); 346 wake_up(&entry->wait);
325 return 0; 347
348 return ret;
326} 349}
327 350
328/* 351/*
329 * wait for all the ordered extents in a root. This is done when balancing 352 * wait for all the ordered extents in a root. This is done when balancing
330 * space between drives. 353 * space between drives.
331 */ 354 */
332int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) 355int btrfs_wait_ordered_extents(struct btrfs_root *root,
356 int nocow_only, int delay_iput)
333{ 357{
334 struct list_head splice; 358 struct list_head splice;
335 struct list_head *cur; 359 struct list_head *cur;
@@ -366,7 +390,10 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
366 if (inode) { 390 if (inode) {
367 btrfs_start_ordered_extent(inode, ordered, 1); 391 btrfs_start_ordered_extent(inode, ordered, 1);
368 btrfs_put_ordered_extent(ordered); 392 btrfs_put_ordered_extent(ordered);
369 iput(inode); 393 if (delay_iput)
394 btrfs_add_delayed_iput(inode);
395 else
396 iput(inode);
370 } else { 397 } else {
371 btrfs_put_ordered_extent(ordered); 398 btrfs_put_ordered_extent(ordered);
372 } 399 }
@@ -424,7 +451,7 @@ again:
424 btrfs_wait_ordered_range(inode, 0, (u64)-1); 451 btrfs_wait_ordered_range(inode, 0, (u64)-1);
425 else 452 else
426 filemap_flush(inode->i_mapping); 453 filemap_flush(inode->i_mapping);
427 iput(inode); 454 btrfs_add_delayed_iput(inode);
428 } 455 }
429 456
430 cond_resched(); 457 cond_resched();
@@ -583,7 +610,7 @@ out:
583 * After an extent is done, call this to conditionally update the on disk 610 * After an extent is done, call this to conditionally update the on disk
584 * i_size. i_size is updated to cover any fully written part of the file. 611 * i_size. i_size is updated to cover any fully written part of the file.
585 */ 612 */
586int btrfs_ordered_update_i_size(struct inode *inode, 613int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
587 struct btrfs_ordered_extent *ordered) 614 struct btrfs_ordered_extent *ordered)
588{ 615{
589 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 616 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
@@ -591,18 +618,30 @@ int btrfs_ordered_update_i_size(struct inode *inode,
591 u64 disk_i_size; 618 u64 disk_i_size;
592 u64 new_i_size; 619 u64 new_i_size;
593 u64 i_size_test; 620 u64 i_size_test;
621 u64 i_size = i_size_read(inode);
594 struct rb_node *node; 622 struct rb_node *node;
623 struct rb_node *prev = NULL;
595 struct btrfs_ordered_extent *test; 624 struct btrfs_ordered_extent *test;
625 int ret = 1;
626
627 if (ordered)
628 offset = entry_end(ordered);
596 629
597 mutex_lock(&tree->mutex); 630 mutex_lock(&tree->mutex);
598 disk_i_size = BTRFS_I(inode)->disk_i_size; 631 disk_i_size = BTRFS_I(inode)->disk_i_size;
599 632
633 /* truncate file */
634 if (disk_i_size > i_size) {
635 BTRFS_I(inode)->disk_i_size = i_size;
636 ret = 0;
637 goto out;
638 }
639
600 /* 640 /*
601 * if the disk i_size is already at the inode->i_size, or 641 * if the disk i_size is already at the inode->i_size, or
602 * this ordered extent is inside the disk i_size, we're done 642 * this ordered extent is inside the disk i_size, we're done
603 */ 643 */
604 if (disk_i_size >= inode->i_size || 644 if (disk_i_size == i_size || offset <= disk_i_size) {
605 ordered->file_offset + ordered->len <= disk_i_size) {
606 goto out; 645 goto out;
607 } 646 }
608 647
@@ -610,8 +649,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
610 * we can't update the disk_isize if there are delalloc bytes 649 * we can't update the disk_isize if there are delalloc bytes
611 * between disk_i_size and this ordered extent 650 * between disk_i_size and this ordered extent
612 */ 651 */
613 if (test_range_bit(io_tree, disk_i_size, 652 if (test_range_bit(io_tree, disk_i_size, offset - 1,
614 ordered->file_offset + ordered->len - 1,
615 EXTENT_DELALLOC, 0, NULL)) { 653 EXTENT_DELALLOC, 0, NULL)) {
616 goto out; 654 goto out;
617 } 655 }
@@ -620,20 +658,32 @@ int btrfs_ordered_update_i_size(struct inode *inode,
620 * if we find an ordered extent then we can't update disk i_size 658 * if we find an ordered extent then we can't update disk i_size
621 * yet 659 * yet
622 */ 660 */
623 node = &ordered->rb_node; 661 if (ordered) {
624 while (1) { 662 node = rb_prev(&ordered->rb_node);
625 node = rb_prev(node); 663 } else {
626 if (!node) 664 prev = tree_search(tree, offset);
627 break; 665 /*
666 * we insert file extents without involving ordered struct,
667 * so there should be no ordered struct cover this offset
668 */
669 if (prev) {
670 test = rb_entry(prev, struct btrfs_ordered_extent,
671 rb_node);
672 BUG_ON(offset_in_entry(test, offset));
673 }
674 node = prev;
675 }
676 while (node) {
628 test = rb_entry(node, struct btrfs_ordered_extent, rb_node); 677 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
629 if (test->file_offset + test->len <= disk_i_size) 678 if (test->file_offset + test->len <= disk_i_size)
630 break; 679 break;
631 if (test->file_offset >= inode->i_size) 680 if (test->file_offset >= i_size)
632 break; 681 break;
633 if (test->file_offset >= disk_i_size) 682 if (test->file_offset >= disk_i_size)
634 goto out; 683 goto out;
684 node = rb_prev(node);
635 } 685 }
636 new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode)); 686 new_i_size = min_t(u64, offset, i_size);
637 687
638 /* 688 /*
639 * at this point, we know we can safely update i_size to at least 689 * at this point, we know we can safely update i_size to at least
@@ -641,7 +691,14 @@ int btrfs_ordered_update_i_size(struct inode *inode,
641 * walk forward and see if ios from higher up in the file have 691 * walk forward and see if ios from higher up in the file have
642 * finished. 692 * finished.
643 */ 693 */
644 node = rb_next(&ordered->rb_node); 694 if (ordered) {
695 node = rb_next(&ordered->rb_node);
696 } else {
697 if (prev)
698 node = rb_next(prev);
699 else
700 node = rb_first(&tree->tree);
701 }
645 i_size_test = 0; 702 i_size_test = 0;
646 if (node) { 703 if (node) {
647 /* 704 /*
@@ -649,10 +706,10 @@ int btrfs_ordered_update_i_size(struct inode *inode,
649 * between our ordered extent and the next one. 706 * between our ordered extent and the next one.
650 */ 707 */
651 test = rb_entry(node, struct btrfs_ordered_extent, rb_node); 708 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
652 if (test->file_offset > entry_end(ordered)) 709 if (test->file_offset > offset)
653 i_size_test = test->file_offset; 710 i_size_test = test->file_offset;
654 } else { 711 } else {
655 i_size_test = i_size_read(inode); 712 i_size_test = i_size;
656 } 713 }
657 714
658 /* 715 /*
@@ -661,15 +718,25 @@ int btrfs_ordered_update_i_size(struct inode *inode,
661 * are no delalloc bytes in this area, it is safe to update 718 * are no delalloc bytes in this area, it is safe to update
662 * disk_i_size to the end of the region. 719 * disk_i_size to the end of the region.
663 */ 720 */
664 if (i_size_test > entry_end(ordered) && 721 if (i_size_test > offset &&
665 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, 722 !test_range_bit(io_tree, offset, i_size_test - 1,
666 EXTENT_DELALLOC, 0, NULL)) { 723 EXTENT_DELALLOC, 0, NULL)) {
667 new_i_size = min_t(u64, i_size_test, i_size_read(inode)); 724 new_i_size = min_t(u64, i_size_test, i_size);
668 } 725 }
669 BTRFS_I(inode)->disk_i_size = new_i_size; 726 BTRFS_I(inode)->disk_i_size = new_i_size;
727 ret = 0;
670out: 728out:
729 /*
730 * we need to remove the ordered extent with the tree lock held
731 * so that other people calling this function don't find our fully
732 * processed ordered entry and skip updating the i_size
733 */
734 if (ordered)
735 __btrfs_remove_ordered_extent(inode, ordered);
671 mutex_unlock(&tree->mutex); 736 mutex_unlock(&tree->mutex);
672 return 0; 737 if (ordered)
738 wake_up(&ordered->wait);
739 return ret;
673} 740}
674 741
675/* 742/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f82e87488ca8..1fe1282ef47c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -150,12 +150,13 @@ void btrfs_start_ordered_extent(struct inode *inode,
150int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 150int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
151struct btrfs_ordered_extent * 151struct btrfs_ordered_extent *
152btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 152btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
153int btrfs_ordered_update_i_size(struct inode *inode, 153int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
154 struct btrfs_ordered_extent *ordered); 154 struct btrfs_ordered_extent *ordered);
155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
156int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
157int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 156int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
158int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 157int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
159 struct btrfs_root *root, 158 struct btrfs_root *root,
160 struct inode *inode); 159 struct inode *inode);
160int btrfs_wait_ordered_extents(struct btrfs_root *root,
161 int nocow_only, int delay_iput);
161#endif 162#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 361ad323faac..a9728680eca8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1561,6 +1561,20 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1561 return 0; 1561 return 0;
1562} 1562}
1563 1563
1564static void put_inodes(struct list_head *list)
1565{
1566 struct inodevec *ivec;
1567 while (!list_empty(list)) {
1568 ivec = list_entry(list->next, struct inodevec, list);
1569 list_del(&ivec->list);
1570 while (ivec->nr > 0) {
1571 ivec->nr--;
1572 iput(ivec->inode[ivec->nr]);
1573 }
1574 kfree(ivec);
1575 }
1576}
1577
1564static int find_next_key(struct btrfs_path *path, int level, 1578static int find_next_key(struct btrfs_path *path, int level,
1565 struct btrfs_key *key) 1579 struct btrfs_key *key)
1566 1580
@@ -1723,6 +1737,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1723 1737
1724 btrfs_btree_balance_dirty(root, nr); 1738 btrfs_btree_balance_dirty(root, nr);
1725 1739
1740 /*
1741 * put inodes outside transaction, otherwise we may deadlock.
1742 */
1743 put_inodes(&inode_list);
1744
1726 if (replaced && rc->stage == UPDATE_DATA_PTRS) 1745 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1727 invalidate_extent_cache(root, &key, &next_key); 1746 invalidate_extent_cache(root, &key, &next_key);
1728 } 1747 }
@@ -1752,19 +1771,7 @@ out:
1752 1771
1753 btrfs_btree_balance_dirty(root, nr); 1772 btrfs_btree_balance_dirty(root, nr);
1754 1773
1755 /* 1774 put_inodes(&inode_list);
1756 * put inodes while we aren't holding the tree locks
1757 */
1758 while (!list_empty(&inode_list)) {
1759 struct inodevec *ivec;
1760 ivec = list_entry(inode_list.next, struct inodevec, list);
1761 list_del(&ivec->list);
1762 while (ivec->nr > 0) {
1763 ivec->nr--;
1764 iput(ivec->inode[ivec->nr]);
1765 }
1766 kfree(ivec);
1767 }
1768 1775
1769 if (replaced && rc->stage == UPDATE_DATA_PTRS) 1776 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1770 invalidate_extent_cache(root, &key, &next_key); 1777 invalidate_extent_cache(root, &key, &next_key);
@@ -3518,7 +3525,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3518 BUG_ON(!rc->block_group); 3525 BUG_ON(!rc->block_group);
3519 3526
3520 btrfs_init_workers(&rc->workers, "relocate", 3527 btrfs_init_workers(&rc->workers, "relocate",
3521 fs_info->thread_pool_size); 3528 fs_info->thread_pool_size, NULL);
3522 3529
3523 rc->extent_root = extent_root; 3530 rc->extent_root = extent_root;
3524 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3531 btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
@@ -3534,8 +3541,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3534 (unsigned long long)rc->block_group->key.objectid, 3541 (unsigned long long)rc->block_group->key.objectid,
3535 (unsigned long long)rc->block_group->flags); 3542 (unsigned long long)rc->block_group->flags);
3536 3543
3537 btrfs_start_delalloc_inodes(fs_info->tree_root); 3544 btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
3538 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 3545 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3539 3546
3540 while (1) { 3547 while (1) {
3541 rc->extents_found = 0; 3548 rc->extents_found = 0;
@@ -3701,7 +3708,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3701 mapping_tree_init(&rc->reloc_root_tree); 3708 mapping_tree_init(&rc->reloc_root_tree);
3702 INIT_LIST_HEAD(&rc->reloc_roots); 3709 INIT_LIST_HEAD(&rc->reloc_roots);
3703 btrfs_init_workers(&rc->workers, "relocate", 3710 btrfs_init_workers(&rc->workers, "relocate",
3704 root->fs_info->thread_pool_size); 3711 root->fs_info->thread_pool_size, NULL);
3705 rc->extent_root = root->fs_info->extent_root; 3712 rc->extent_root = root->fs_info->extent_root;
3706 3713
3707 set_reloc_control(rc); 3714 set_reloc_control(rc);
@@ -3755,6 +3762,7 @@ out:
3755 BTRFS_DATA_RELOC_TREE_OBJECTID); 3762 BTRFS_DATA_RELOC_TREE_OBJECTID);
3756 if (IS_ERR(fs_root)) 3763 if (IS_ERR(fs_root))
3757 err = PTR_ERR(fs_root); 3764 err = PTR_ERR(fs_root);
3765 btrfs_orphan_cleanup(fs_root);
3758 } 3766 }
3759 return err; 3767 return err;
3760} 3768}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 9351428f30e2..67fa2d29d663 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -159,7 +159,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
159 write_extent_buffer(l, item, ptr, sizeof(*item)); 159 write_extent_buffer(l, item, ptr, sizeof(*item));
160 btrfs_mark_buffer_dirty(path->nodes[0]); 160 btrfs_mark_buffer_dirty(path->nodes[0]);
161out: 161out:
162 btrfs_release_path(root, path);
163 btrfs_free_path(path); 162 btrfs_free_path(path);
164 return ret; 163 return ret;
165} 164}
@@ -332,7 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
332 BUG_ON(refs != 0); 331 BUG_ON(refs != 0);
333 ret = btrfs_del_item(trans, root, path); 332 ret = btrfs_del_item(trans, root, path);
334out: 333out:
335 btrfs_release_path(root, path);
336 btrfs_free_path(path); 334 btrfs_free_path(path);
337 return ret; 335 return ret;
338} 336}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9de9b2236419..3f9b45704fcd 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,7 +66,8 @@ enum {
66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, 68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
69 Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, 69 Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
70 Opt_discard, Opt_err,
70}; 71};
71 72
72static match_table_t tokens = { 73static match_table_t tokens = {
@@ -88,6 +89,7 @@ static match_table_t tokens = {
88 {Opt_notreelog, "notreelog"}, 89 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"}, 90 {Opt_flushoncommit, "flushoncommit"},
90 {Opt_ratio, "metadata_ratio=%d"}, 91 {Opt_ratio, "metadata_ratio=%d"},
92 {Opt_discard, "discard"},
91 {Opt_err, NULL}, 93 {Opt_err, NULL},
92}; 94};
93 95
@@ -126,6 +128,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
126 substring_t args[MAX_OPT_ARGS]; 128 substring_t args[MAX_OPT_ARGS];
127 char *p, *num; 129 char *p, *num;
128 int intarg; 130 int intarg;
131 int ret = 0;
129 132
130 if (!options) 133 if (!options)
131 return 0; 134 return 0;
@@ -257,12 +260,21 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
257 info->metadata_ratio); 260 info->metadata_ratio);
258 } 261 }
259 break; 262 break;
263 case Opt_discard:
264 btrfs_set_opt(info->mount_opt, DISCARD);
265 break;
266 case Opt_err:
267 printk(KERN_INFO "btrfs: unrecognized mount option "
268 "'%s'\n", p);
269 ret = -EINVAL;
270 goto out;
260 default: 271 default:
261 break; 272 break;
262 } 273 }
263 } 274 }
275out:
264 kfree(options); 276 kfree(options);
265 return 0; 277 return ret;
266} 278}
267 279
268/* 280/*
@@ -344,7 +356,7 @@ static int btrfs_fill_super(struct super_block *sb,
344 sb->s_export_op = &btrfs_export_ops; 356 sb->s_export_op = &btrfs_export_ops;
345 sb->s_xattr = btrfs_xattr_handlers; 357 sb->s_xattr = btrfs_xattr_handlers;
346 sb->s_time_gran = 1; 358 sb->s_time_gran = 1;
347#ifdef CONFIG_BTRFS_POSIX_ACL 359#ifdef CONFIG_BTRFS_FS_POSIX_ACL
348 sb->s_flags |= MS_POSIXACL; 360 sb->s_flags |= MS_POSIXACL;
349#endif 361#endif
350 362
@@ -400,8 +412,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
400 return 0; 412 return 0;
401 } 413 }
402 414
403 btrfs_start_delalloc_inodes(root); 415 btrfs_start_delalloc_inodes(root, 0);
404 btrfs_wait_ordered_extents(root, 0); 416 btrfs_wait_ordered_extents(root, 0, 0);
405 417
406 trans = btrfs_start_transaction(root, 1); 418 trans = btrfs_start_transaction(root, 1);
407 ret = btrfs_commit_transaction(trans, root); 419 ret = btrfs_commit_transaction(trans, root);
@@ -445,6 +457,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
445 seq_puts(seq, ",notreelog"); 457 seq_puts(seq, ",notreelog");
446 if (btrfs_test_opt(root, FLUSHONCOMMIT)) 458 if (btrfs_test_opt(root, FLUSHONCOMMIT))
447 seq_puts(seq, ",flushoncommit"); 459 seq_puts(seq, ",flushoncommit");
460 if (btrfs_test_opt(root, DISCARD))
461 seq_puts(seq, ",discard");
448 if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) 462 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
449 seq_puts(seq, ",noacl"); 463 seq_puts(seq, ",noacl");
450 return 0; 464 return 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0b8f36d4400a..b2acc79f1b34 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -163,8 +163,14 @@ static void wait_current_trans(struct btrfs_root *root)
163 } 163 }
164} 164}
165 165
166enum btrfs_trans_type {
167 TRANS_START,
168 TRANS_JOIN,
169 TRANS_USERSPACE,
170};
171
166static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 172static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
167 int num_blocks, int wait) 173 int num_blocks, int type)
168{ 174{
169 struct btrfs_trans_handle *h = 175 struct btrfs_trans_handle *h =
170 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 176 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
@@ -172,7 +178,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
172 178
173 mutex_lock(&root->fs_info->trans_mutex); 179 mutex_lock(&root->fs_info->trans_mutex);
174 if (!root->fs_info->log_root_recovering && 180 if (!root->fs_info->log_root_recovering &&
175 ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)) 181 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
182 type == TRANS_USERSPACE))
176 wait_current_trans(root); 183 wait_current_trans(root);
177 ret = join_transaction(root); 184 ret = join_transaction(root);
178 BUG_ON(ret); 185 BUG_ON(ret);
@@ -186,7 +193,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
186 h->alloc_exclude_start = 0; 193 h->alloc_exclude_start = 0;
187 h->delayed_ref_updates = 0; 194 h->delayed_ref_updates = 0;
188 195
189 if (!current->journal_info) 196 if (!current->journal_info && type != TRANS_USERSPACE)
190 current->journal_info = h; 197 current->journal_info = h;
191 198
192 root->fs_info->running_transaction->use_count++; 199 root->fs_info->running_transaction->use_count++;
@@ -198,18 +205,18 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
198struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 205struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
199 int num_blocks) 206 int num_blocks)
200{ 207{
201 return start_transaction(root, num_blocks, 1); 208 return start_transaction(root, num_blocks, TRANS_START);
202} 209}
203struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 210struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
204 int num_blocks) 211 int num_blocks)
205{ 212{
206 return start_transaction(root, num_blocks, 0); 213 return start_transaction(root, num_blocks, TRANS_JOIN);
207} 214}
208 215
209struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 216struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
210 int num_blocks) 217 int num_blocks)
211{ 218{
212 return start_transaction(r, num_blocks, 2); 219 return start_transaction(r, num_blocks, TRANS_USERSPACE);
213} 220}
214 221
215/* wait for a transaction commit to be fully complete */ 222/* wait for a transaction commit to be fully complete */
@@ -326,6 +333,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
326 memset(trans, 0, sizeof(*trans)); 333 memset(trans, 0, sizeof(*trans));
327 kmem_cache_free(btrfs_trans_handle_cachep, trans); 334 kmem_cache_free(btrfs_trans_handle_cachep, trans);
328 335
336 if (throttle)
337 btrfs_run_delayed_iputs(root);
338
329 return 0; 339 return 0;
330} 340}
331 341
@@ -344,10 +354,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
344/* 354/*
345 * when btree blocks are allocated, they have some corresponding bits set for 355 * when btree blocks are allocated, they have some corresponding bits set for
346 * them in one of two extent_io trees. This is used to make sure all of 356 * them in one of two extent_io trees. This is used to make sure all of
347 * those extents are on disk for transaction or log commit 357 * those extents are sent to disk but does not wait on them
348 */ 358 */
349int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, 359int btrfs_write_marked_extents(struct btrfs_root *root,
350 struct extent_io_tree *dirty_pages) 360 struct extent_io_tree *dirty_pages, int mark)
351{ 361{
352 int ret; 362 int ret;
353 int err = 0; 363 int err = 0;
@@ -360,7 +370,7 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
360 370
361 while (1) { 371 while (1) {
362 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 372 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
363 EXTENT_DIRTY); 373 mark);
364 if (ret) 374 if (ret)
365 break; 375 break;
366 while (start <= end) { 376 while (start <= end) {
@@ -394,13 +404,36 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
394 page_cache_release(page); 404 page_cache_release(page);
395 } 405 }
396 } 406 }
407 if (err)
408 werr = err;
409 return werr;
410}
411
412/*
413 * when btree blocks are allocated, they have some corresponding bits set for
414 * them in one of two extent_io trees. This is used to make sure all of
415 * those extents are on disk for transaction or log commit. We wait
416 * on all the pages and clear them from the dirty pages state tree
417 */
418int btrfs_wait_marked_extents(struct btrfs_root *root,
419 struct extent_io_tree *dirty_pages, int mark)
420{
421 int ret;
422 int err = 0;
423 int werr = 0;
424 struct page *page;
425 struct inode *btree_inode = root->fs_info->btree_inode;
426 u64 start = 0;
427 u64 end;
428 unsigned long index;
429
397 while (1) { 430 while (1) {
398 ret = find_first_extent_bit(dirty_pages, 0, &start, &end, 431 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
399 EXTENT_DIRTY); 432 mark);
400 if (ret) 433 if (ret)
401 break; 434 break;
402 435
403 clear_extent_dirty(dirty_pages, start, end, GFP_NOFS); 436 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
404 while (start <= end) { 437 while (start <= end) {
405 index = start >> PAGE_CACHE_SHIFT; 438 index = start >> PAGE_CACHE_SHIFT;
406 start = (u64)(index + 1) << PAGE_CACHE_SHIFT; 439 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
@@ -424,6 +457,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
424 return werr; 457 return werr;
425} 458}
426 459
460/*
461 * when btree blocks are allocated, they have some corresponding bits set for
462 * them in one of two extent_io trees. This is used to make sure all of
463 * those extents are on disk for transaction or log commit
464 */
465int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
466 struct extent_io_tree *dirty_pages, int mark)
467{
468 int ret;
469 int ret2;
470
471 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
472 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
473 return ret || ret2;
474}
475
427int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 476int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
428 struct btrfs_root *root) 477 struct btrfs_root *root)
429{ 478{
@@ -433,7 +482,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
433 return filemap_write_and_wait(btree_inode->i_mapping); 482 return filemap_write_and_wait(btree_inode->i_mapping);
434 } 483 }
435 return btrfs_write_and_wait_marked_extents(root, 484 return btrfs_write_and_wait_marked_extents(root,
436 &trans->transaction->dirty_pages); 485 &trans->transaction->dirty_pages,
486 EXTENT_DIRTY);
437} 487}
438 488
439/* 489/*
@@ -451,13 +501,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
451{ 501{
452 int ret; 502 int ret;
453 u64 old_root_bytenr; 503 u64 old_root_bytenr;
504 u64 old_root_used;
454 struct btrfs_root *tree_root = root->fs_info->tree_root; 505 struct btrfs_root *tree_root = root->fs_info->tree_root;
455 506
507 old_root_used = btrfs_root_used(&root->root_item);
456 btrfs_write_dirty_block_groups(trans, root); 508 btrfs_write_dirty_block_groups(trans, root);
457 509
458 while (1) { 510 while (1) {
459 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 511 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
460 if (old_root_bytenr == root->node->start) 512 if (old_root_bytenr == root->node->start &&
513 old_root_used == btrfs_root_used(&root->root_item))
461 break; 514 break;
462 515
463 btrfs_set_root_node(&root->root_item, root->node); 516 btrfs_set_root_node(&root->root_item, root->node);
@@ -466,6 +519,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
466 &root->root_item); 519 &root->root_item);
467 BUG_ON(ret); 520 BUG_ON(ret);
468 521
522 old_root_used = btrfs_root_used(&root->root_item);
469 ret = btrfs_write_dirty_block_groups(trans, root); 523 ret = btrfs_write_dirty_block_groups(trans, root);
470 BUG_ON(ret); 524 BUG_ON(ret);
471 } 525 }
@@ -749,7 +803,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
749 memcpy(&pending->root_key, &key, sizeof(key)); 803 memcpy(&pending->root_key, &key, sizeof(key));
750fail: 804fail:
751 kfree(new_root_item); 805 kfree(new_root_item);
752 btrfs_unreserve_metadata_space(root, 6);
753 return ret; 806 return ret;
754} 807}
755 808
@@ -761,7 +814,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
761 u64 index = 0; 814 u64 index = 0;
762 struct btrfs_trans_handle *trans; 815 struct btrfs_trans_handle *trans;
763 struct inode *parent_inode; 816 struct inode *parent_inode;
764 struct inode *inode;
765 struct btrfs_root *parent_root; 817 struct btrfs_root *parent_root;
766 818
767 parent_inode = pending->dentry->d_parent->d_inode; 819 parent_inode = pending->dentry->d_parent->d_inode;
@@ -793,8 +845,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
793 845
794 BUG_ON(ret); 846 BUG_ON(ret);
795 847
796 inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
797 d_instantiate(pending->dentry, inode);
798fail: 848fail:
799 btrfs_end_transaction(trans, fs_info->fs_root); 849 btrfs_end_transaction(trans, fs_info->fs_root);
800 return ret; 850 return ret;
@@ -948,11 +998,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
948 mutex_unlock(&root->fs_info->trans_mutex); 998 mutex_unlock(&root->fs_info->trans_mutex);
949 999
950 if (flush_on_commit) { 1000 if (flush_on_commit) {
951 btrfs_start_delalloc_inodes(root); 1001 btrfs_start_delalloc_inodes(root, 1);
952 ret = btrfs_wait_ordered_extents(root, 0); 1002 ret = btrfs_wait_ordered_extents(root, 0, 1);
953 BUG_ON(ret); 1003 BUG_ON(ret);
954 } else if (snap_pending) { 1004 } else if (snap_pending) {
955 ret = btrfs_wait_ordered_extents(root, 1); 1005 ret = btrfs_wait_ordered_extents(root, 0, 1);
956 BUG_ON(ret); 1006 BUG_ON(ret);
957 } 1007 }
958 1008
@@ -1070,6 +1120,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1070 current->journal_info = NULL; 1120 current->journal_info = NULL;
1071 1121
1072 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1122 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1123
1124 if (current != root->fs_info->transaction_kthread)
1125 btrfs_run_delayed_iputs(root);
1126
1073 return ret; 1127 return ret;
1074} 1128}
1075 1129
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 663c67404918..93c7ccb33118 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
79 struct inode *inode) 79 struct inode *inode)
80{ 80{
81 BTRFS_I(inode)->last_trans = trans->transaction->transid; 81 BTRFS_I(inode)->last_trans = trans->transaction->transid;
82 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
82} 83}
83 84
84int btrfs_end_transaction(struct btrfs_trans_handle *trans, 85int btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -106,6 +107,10 @@ void btrfs_throttle(struct btrfs_root *root);
106int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 107int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root); 108 struct btrfs_root *root);
108int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, 109int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
109 struct extent_io_tree *dirty_pages); 110 struct extent_io_tree *dirty_pages, int mark);
111int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages, int mark);
113int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages, int mark);
110int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 115int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
111#endif 116#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7827841b55cb..4a9434b622ec 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
137 137
138 mutex_lock(&root->log_mutex); 138 mutex_lock(&root->log_mutex);
139 if (root->log_root) { 139 if (root->log_root) {
140 if (!root->log_start_pid) {
141 root->log_start_pid = current->pid;
142 root->log_multiple_pids = false;
143 } else if (root->log_start_pid != current->pid) {
144 root->log_multiple_pids = true;
145 }
146
140 root->log_batch++; 147 root->log_batch++;
141 atomic_inc(&root->log_writers); 148 atomic_inc(&root->log_writers);
142 mutex_unlock(&root->log_mutex); 149 mutex_unlock(&root->log_mutex);
143 return 0; 150 return 0;
144 } 151 }
152 root->log_multiple_pids = false;
153 root->log_start_pid = current->pid;
145 mutex_lock(&root->fs_info->tree_log_mutex); 154 mutex_lock(&root->fs_info->tree_log_mutex);
146 if (!root->fs_info->log_root_tree) { 155 if (!root->fs_info->log_root_tree) {
147 ret = btrfs_init_log_root_tree(trans, root->fs_info); 156 ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -533,8 +542,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
533 542
534 saved_nbytes = inode_get_bytes(inode); 543 saved_nbytes = inode_get_bytes(inode);
535 /* drop any overlapping extents */ 544 /* drop any overlapping extents */
536 ret = btrfs_drop_extents(trans, root, inode, 545 ret = btrfs_drop_extents(trans, inode, start, extent_end,
537 start, extent_end, extent_end, start, &alloc_hint, 1); 546 &alloc_hint, 1);
538 BUG_ON(ret); 547 BUG_ON(ret);
539 548
540 if (found_type == BTRFS_FILE_EXTENT_REG || 549 if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -921,6 +930,17 @@ out_nowrite:
921 return 0; 930 return 0;
922} 931}
923 932
933static int insert_orphan_item(struct btrfs_trans_handle *trans,
934 struct btrfs_root *root, u64 offset)
935{
936 int ret;
937 ret = btrfs_find_orphan_item(root, offset);
938 if (ret > 0)
939 ret = btrfs_insert_orphan_item(trans, root, offset);
940 return ret;
941}
942
943
924/* 944/*
925 * There are a few corners where the link count of the file can't 945 * There are a few corners where the link count of the file can't
926 * be properly maintained during replay. So, instead of adding 946 * be properly maintained during replay. So, instead of adding
@@ -988,9 +1008,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
988 } 1008 }
989 BTRFS_I(inode)->index_cnt = (u64)-1; 1009 BTRFS_I(inode)->index_cnt = (u64)-1;
990 1010
991 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { 1011 if (inode->i_nlink == 0) {
992 ret = replay_dir_deletes(trans, root, NULL, path, 1012 if (S_ISDIR(inode->i_mode)) {
993 inode->i_ino, 1); 1013 ret = replay_dir_deletes(trans, root, NULL, path,
1014 inode->i_ino, 1);
1015 BUG_ON(ret);
1016 }
1017 ret = insert_orphan_item(trans, root, inode->i_ino);
994 BUG_ON(ret); 1018 BUG_ON(ret);
995 } 1019 }
996 btrfs_free_path(path); 1020 btrfs_free_path(path);
@@ -1578,7 +1602,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1578 /* inode keys are done during the first stage */ 1602 /* inode keys are done during the first stage */
1579 if (key.type == BTRFS_INODE_ITEM_KEY && 1603 if (key.type == BTRFS_INODE_ITEM_KEY &&
1580 wc->stage == LOG_WALK_REPLAY_INODES) { 1604 wc->stage == LOG_WALK_REPLAY_INODES) {
1581 struct inode *inode;
1582 struct btrfs_inode_item *inode_item; 1605 struct btrfs_inode_item *inode_item;
1583 u32 mode; 1606 u32 mode;
1584 1607
@@ -1594,31 +1617,16 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1594 eb, i, &key); 1617 eb, i, &key);
1595 BUG_ON(ret); 1618 BUG_ON(ret);
1596 1619
1597 /* for regular files, truncate away 1620 /* for regular files, make sure corresponding
1598 * extents past the new EOF 1621 * orphan item exists. extents past the new EOF
1622 * will be truncated later by orphan cleanup.
1599 */ 1623 */
1600 if (S_ISREG(mode)) { 1624 if (S_ISREG(mode)) {
1601 inode = read_one_inode(root, 1625 ret = insert_orphan_item(wc->trans, root,
1602 key.objectid); 1626 key.objectid);
1603 BUG_ON(!inode);
1604
1605 ret = btrfs_truncate_inode_items(wc->trans,
1606 root, inode, inode->i_size,
1607 BTRFS_EXTENT_DATA_KEY);
1608 BUG_ON(ret); 1627 BUG_ON(ret);
1609
1610 /* if the nlink count is zero here, the iput
1611 * will free the inode. We bump it to make
1612 * sure it doesn't get freed until the link
1613 * count fixup is done
1614 */
1615 if (inode->i_nlink == 0) {
1616 btrfs_inc_nlink(inode);
1617 btrfs_update_inode(wc->trans,
1618 root, inode);
1619 }
1620 iput(inode);
1621 } 1628 }
1629
1622 ret = link_to_fixup_dir(wc->trans, root, 1630 ret = link_to_fixup_dir(wc->trans, root,
1623 path, key.objectid); 1631 path, key.objectid);
1624 BUG_ON(ret); 1632 BUG_ON(ret);
@@ -1968,9 +1976,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1968{ 1976{
1969 int index1; 1977 int index1;
1970 int index2; 1978 int index2;
1979 int mark;
1971 int ret; 1980 int ret;
1972 struct btrfs_root *log = root->log_root; 1981 struct btrfs_root *log = root->log_root;
1973 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 1982 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
1983 unsigned long log_transid = 0;
1974 1984
1975 mutex_lock(&root->log_mutex); 1985 mutex_lock(&root->log_mutex);
1976 index1 = root->log_transid % 2; 1986 index1 = root->log_transid % 2;
@@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1987 1997
1988 while (1) { 1998 while (1) {
1989 unsigned long batch = root->log_batch; 1999 unsigned long batch = root->log_batch;
1990 mutex_unlock(&root->log_mutex); 2000 if (root->log_multiple_pids) {
1991 schedule_timeout_uninterruptible(1); 2001 mutex_unlock(&root->log_mutex);
1992 mutex_lock(&root->log_mutex); 2002 schedule_timeout_uninterruptible(1);
1993 2003 mutex_lock(&root->log_mutex);
2004 }
1994 wait_for_writer(trans, root); 2005 wait_for_writer(trans, root);
1995 if (batch == root->log_batch) 2006 if (batch == root->log_batch)
1996 break; 2007 break;
@@ -2003,7 +2014,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2003 goto out; 2014 goto out;
2004 } 2015 }
2005 2016
2006 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2017 log_transid = root->log_transid;
2018 if (log_transid % 2 == 0)
2019 mark = EXTENT_DIRTY;
2020 else
2021 mark = EXTENT_NEW;
2022
2023 /* we start IO on all the marked extents here, but we don't actually
2024 * wait for them until later.
2025 */
2026 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2007 BUG_ON(ret); 2027 BUG_ON(ret);
2008 2028
2009 btrfs_set_root_node(&log->root_item, log->node); 2029 btrfs_set_root_node(&log->root_item, log->node);
@@ -2011,11 +2031,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2011 root->log_batch = 0; 2031 root->log_batch = 0;
2012 root->log_transid++; 2032 root->log_transid++;
2013 log->log_transid = root->log_transid; 2033 log->log_transid = root->log_transid;
2034 root->log_start_pid = 0;
2014 smp_mb(); 2035 smp_mb();
2015 /* 2036 /*
2016 * log tree has been flushed to disk, new modifications of 2037 * IO has been started, blocks of the log tree have WRITTEN flag set
2017 * the log will be written to new positions. so it's safe to 2038 * in their headers. new modifications of the log will be written to
2018 * allow log writers to go in. 2039 * new positions. so it's safe to allow log writers to go in.
2019 */ 2040 */
2020 mutex_unlock(&root->log_mutex); 2041 mutex_unlock(&root->log_mutex);
2021 2042
@@ -2036,6 +2057,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2036 2057
2037 index2 = log_root_tree->log_transid % 2; 2058 index2 = log_root_tree->log_transid % 2;
2038 if (atomic_read(&log_root_tree->log_commit[index2])) { 2059 if (atomic_read(&log_root_tree->log_commit[index2])) {
2060 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2039 wait_log_commit(trans, log_root_tree, 2061 wait_log_commit(trans, log_root_tree,
2040 log_root_tree->log_transid); 2062 log_root_tree->log_transid);
2041 mutex_unlock(&log_root_tree->log_mutex); 2063 mutex_unlock(&log_root_tree->log_mutex);
@@ -2055,14 +2077,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2055 * check the full commit flag again 2077 * check the full commit flag again
2056 */ 2078 */
2057 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2079 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2080 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2058 mutex_unlock(&log_root_tree->log_mutex); 2081 mutex_unlock(&log_root_tree->log_mutex);
2059 ret = -EAGAIN; 2082 ret = -EAGAIN;
2060 goto out_wake_log_root; 2083 goto out_wake_log_root;
2061 } 2084 }
2062 2085
2063 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2086 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
2064 &log_root_tree->dirty_log_pages); 2087 &log_root_tree->dirty_log_pages,
2088 EXTENT_DIRTY | EXTENT_NEW);
2065 BUG_ON(ret); 2089 BUG_ON(ret);
2090 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2066 2091
2067 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2092 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
2068 log_root_tree->node->start); 2093 log_root_tree->node->start);
@@ -2082,9 +2107,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2082 * the running transaction open, so a full commit can't hop 2107 * the running transaction open, so a full commit can't hop
2083 * in and cause problems either. 2108 * in and cause problems either.
2084 */ 2109 */
2085 write_ctree_super(trans, root->fs_info->tree_root, 2); 2110 write_ctree_super(trans, root->fs_info->tree_root, 1);
2086 ret = 0; 2111 ret = 0;
2087 2112
2113 mutex_lock(&root->log_mutex);
2114 if (root->last_log_commit < log_transid)
2115 root->last_log_commit = log_transid;
2116 mutex_unlock(&root->log_mutex);
2117
2088out_wake_log_root: 2118out_wake_log_root:
2089 atomic_set(&log_root_tree->log_commit[index2], 0); 2119 atomic_set(&log_root_tree->log_commit[index2], 0);
2090 smp_mb(); 2120 smp_mb();
@@ -2123,12 +2153,12 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2123 2153
2124 while (1) { 2154 while (1) {
2125 ret = find_first_extent_bit(&log->dirty_log_pages, 2155 ret = find_first_extent_bit(&log->dirty_log_pages,
2126 0, &start, &end, EXTENT_DIRTY); 2156 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
2127 if (ret) 2157 if (ret)
2128 break; 2158 break;
2129 2159
2130 clear_extent_dirty(&log->dirty_log_pages, 2160 clear_extent_bits(&log->dirty_log_pages, start, end,
2131 start, end, GFP_NOFS); 2161 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2132 } 2162 }
2133 2163
2134 if (log->log_transid > 0) { 2164 if (log->log_transid > 0) {
@@ -2852,6 +2882,21 @@ out:
2852 return ret; 2882 return ret;
2853} 2883}
2854 2884
2885static int inode_in_log(struct btrfs_trans_handle *trans,
2886 struct inode *inode)
2887{
2888 struct btrfs_root *root = BTRFS_I(inode)->root;
2889 int ret = 0;
2890
2891 mutex_lock(&root->log_mutex);
2892 if (BTRFS_I(inode)->logged_trans == trans->transid &&
2893 BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
2894 ret = 1;
2895 mutex_unlock(&root->log_mutex);
2896 return ret;
2897}
2898
2899
2855/* 2900/*
2856 * helper function around btrfs_log_inode to make sure newly created 2901 * helper function around btrfs_log_inode to make sure newly created
2857 * parent directories also end up in the log. A minimal inode and backref 2902 * parent directories also end up in the log. A minimal inode and backref
@@ -2891,6 +2936,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2891 if (ret) 2936 if (ret)
2892 goto end_no_trans; 2937 goto end_no_trans;
2893 2938
2939 if (inode_in_log(trans, inode)) {
2940 ret = BTRFS_NO_LOG_SYNC;
2941 goto end_no_trans;
2942 }
2943
2894 start_log_trans(trans, root); 2944 start_log_trans(trans, root);
2895 2945
2896 ret = btrfs_log_inode(trans, root, inode, inode_only); 2946 ret = btrfs_log_inode(trans, root, inode, inode_only);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index d09c7609e16b..0776eacb5083 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,6 +19,9 @@
19#ifndef __TREE_LOG_ 19#ifndef __TREE_LOG_
20#define __TREE_LOG_ 20#define __TREE_LOG_
21 21
22/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
23#define BTRFS_NO_LOG_SYNC 256
24
22int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root); 26 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7eda483d7b5a..198cff28766d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2209,7 +2209,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2209 max_chunk_size = 10 * calc_size; 2209 max_chunk_size = 10 * calc_size;
2210 min_stripe_size = 64 * 1024 * 1024; 2210 min_stripe_size = 64 * 1024 * 1024;
2211 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 2211 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2212 max_chunk_size = 4 * calc_size; 2212 max_chunk_size = 256 * 1024 * 1024;
2213 min_stripe_size = 32 * 1024 * 1024; 2213 min_stripe_size = 32 * 1024 * 1024;
2214 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 2214 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2215 calc_size = 8 * 1024 * 1024; 2215 calc_size = 8 * 1024 * 1024;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index b0fc93f95fd0..193b58f7d3f3 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -85,22 +85,23 @@ out:
85 return ret; 85 return ret;
86} 86}
87 87
88int __btrfs_setxattr(struct inode *inode, const char *name, 88static int do_setxattr(struct btrfs_trans_handle *trans,
89 const void *value, size_t size, int flags) 89 struct inode *inode, const char *name,
90 const void *value, size_t size, int flags)
90{ 91{
91 struct btrfs_dir_item *di; 92 struct btrfs_dir_item *di;
92 struct btrfs_root *root = BTRFS_I(inode)->root; 93 struct btrfs_root *root = BTRFS_I(inode)->root;
93 struct btrfs_trans_handle *trans;
94 struct btrfs_path *path; 94 struct btrfs_path *path;
95 int ret = 0, mod = 0; 95 size_t name_len = strlen(name);
96 int ret = 0;
97
98 if (name_len + size > BTRFS_MAX_XATTR_SIZE(root))
99 return -ENOSPC;
96 100
97 path = btrfs_alloc_path(); 101 path = btrfs_alloc_path();
98 if (!path) 102 if (!path)
99 return -ENOMEM; 103 return -ENOMEM;
100 104
101 trans = btrfs_join_transaction(root, 1);
102 btrfs_set_trans_block_group(trans, inode);
103
104 /* first lets see if we already have this xattr */ 105 /* first lets see if we already have this xattr */
105 di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, 106 di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
106 strlen(name), -1); 107 strlen(name), -1);
@@ -118,15 +119,12 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
118 } 119 }
119 120
120 ret = btrfs_delete_one_dir_name(trans, root, path, di); 121 ret = btrfs_delete_one_dir_name(trans, root, path, di);
121 if (ret) 122 BUG_ON(ret);
122 goto out;
123 btrfs_release_path(root, path); 123 btrfs_release_path(root, path);
124 124
125 /* if we don't have a value then we are removing the xattr */ 125 /* if we don't have a value then we are removing the xattr */
126 if (!value) { 126 if (!value)
127 mod = 1;
128 goto out; 127 goto out;
129 }
130 } else { 128 } else {
131 btrfs_release_path(root, path); 129 btrfs_release_path(root, path);
132 130
@@ -138,20 +136,45 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
138 } 136 }
139 137
140 /* ok we have to create a completely new xattr */ 138 /* ok we have to create a completely new xattr */
141 ret = btrfs_insert_xattr_item(trans, root, name, strlen(name), 139 ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino,
142 value, size, inode->i_ino); 140 name, name_len, value, size);
141 BUG_ON(ret);
142out:
143 btrfs_free_path(path);
144 return ret;
145}
146
147int __btrfs_setxattr(struct btrfs_trans_handle *trans,
148 struct inode *inode, const char *name,
149 const void *value, size_t size, int flags)
150{
151 struct btrfs_root *root = BTRFS_I(inode)->root;
152 int ret;
153
154 if (trans)
155 return do_setxattr(trans, inode, name, value, size, flags);
156
157 ret = btrfs_reserve_metadata_space(root, 2);
143 if (ret) 158 if (ret)
144 goto out; 159 return ret;
145 mod = 1;
146 160
147out: 161 trans = btrfs_start_transaction(root, 1);
148 if (mod) { 162 if (!trans) {
149 inode->i_ctime = CURRENT_TIME; 163 ret = -ENOMEM;
150 ret = btrfs_update_inode(trans, root, inode); 164 goto out;
151 } 165 }
166 btrfs_set_trans_block_group(trans, inode);
152 167
153 btrfs_end_transaction(trans, root); 168 ret = do_setxattr(trans, inode, name, value, size, flags);
154 btrfs_free_path(path); 169 if (ret)
170 goto out;
171
172 inode->i_ctime = CURRENT_TIME;
173 ret = btrfs_update_inode(trans, root, inode);
174 BUG_ON(ret);
175out:
176 btrfs_end_transaction_throttle(trans, root);
177 btrfs_unreserve_metadata_space(root, 2);
155 return ret; 178 return ret;
156} 179}
157 180
@@ -260,7 +283,7 @@ err:
260 * attributes are handled directly. 283 * attributes are handled directly.
261 */ 284 */
262struct xattr_handler *btrfs_xattr_handlers[] = { 285struct xattr_handler *btrfs_xattr_handlers[] = {
263#ifdef CONFIG_BTRFS_POSIX_ACL 286#ifdef CONFIG_BTRFS_FS_POSIX_ACL
264 &btrfs_xattr_acl_access_handler, 287 &btrfs_xattr_acl_access_handler,
265 &btrfs_xattr_acl_default_handler, 288 &btrfs_xattr_acl_default_handler,
266#endif 289#endif
@@ -314,7 +337,9 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
314 337
315 if (size == 0) 338 if (size == 0)
316 value = ""; /* empty EA, do not remove */ 339 value = ""; /* empty EA, do not remove */
317 return __btrfs_setxattr(dentry->d_inode, name, value, size, flags); 340
341 return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size,
342 flags);
318} 343}
319 344
320int btrfs_removexattr(struct dentry *dentry, const char *name) 345int btrfs_removexattr(struct dentry *dentry, const char *name)
@@ -329,10 +354,13 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
329 354
330 if (!btrfs_is_valid_xattr(name)) 355 if (!btrfs_is_valid_xattr(name))
331 return -EOPNOTSUPP; 356 return -EOPNOTSUPP;
332 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); 357
358 return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0,
359 XATTR_REPLACE);
333} 360}
334 361
335int btrfs_xattr_security_init(struct inode *inode, struct inode *dir) 362int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
363 struct inode *inode, struct inode *dir)
336{ 364{
337 int err; 365 int err;
338 size_t len; 366 size_t len;
@@ -354,7 +382,7 @@ int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
354 } else { 382 } else {
355 strcpy(name, XATTR_SECURITY_PREFIX); 383 strcpy(name, XATTR_SECURITY_PREFIX);
356 strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); 384 strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
357 err = __btrfs_setxattr(inode, name, value, len, 0); 385 err = __btrfs_setxattr(trans, inode, name, value, len, 0);
358 kfree(name); 386 kfree(name);
359 } 387 }
360 388
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index c71e9c3cf3f7..721efa0346e0 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -27,15 +27,16 @@ extern struct xattr_handler *btrfs_xattr_handlers[];
27 27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size); 29 void *buffer, size_t size);
30extern int __btrfs_setxattr(struct inode *inode, const char *name, 30extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
31 const void *value, size_t size, int flags); 31 struct inode *inode, const char *name,
32 32 const void *value, size_t size, int flags);
33extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, 33extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
34 void *buffer, size_t size); 34 void *buffer, size_t size);
35extern int btrfs_setxattr(struct dentry *dentry, const char *name, 35extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags); 36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir); 39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
40 struct inode *inode, struct inode *dir);
40 41
41#endif /* __XATTR__ */ 42#endif /* __XATTR__ */