aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-10-13 03:31:28 -0400
committerIngo Molnar <mingo@elte.hu>2009-10-13 03:31:34 -0400
commit9dbdd6c41c12fb42ee7188eafa7e1917b192af3a (patch)
tree06a9eb894bc976c5c20e84ccd74fd82b9b71aed4 /fs/btrfs
parent7a693d3f0d10f978ebdf3082c41404ab97106567 (diff)
parent161291396e76e0832c08f617eb9bd364d1648148 (diff)
Merge commit 'v2.6.32-rc4' into perf/core
Merge reason: we were on an -rc1 base, merge up to -rc4. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/acl.c6
-rw-r--r--fs/btrfs/async-thread.c81
-rw-r--r--fs/btrfs/async-thread.h10
-rw-r--r--fs/btrfs/btrfs_inode.h10
-rw-r--r--fs/btrfs/ctree.h36
-rw-r--r--fs/btrfs/disk-io.c58
-rw-r--r--fs/btrfs/extent-tree.c598
-rw-r--r--fs/btrfs/extent_io.c134
-rw-r--r--fs/btrfs/extent_io.h31
-rw-r--r--fs/btrfs/file.c38
-rw-r--r--fs/btrfs/inode.c328
-rw-r--r--fs/btrfs/ioctl.c69
-rw-r--r--fs/btrfs/ordered-data.c99
-rw-r--r--fs/btrfs/ordered-data.h4
-rw-r--r--fs/btrfs/relocation.c4
-rw-r--r--fs/btrfs/super.c2
-rw-r--r--fs/btrfs/transaction.c10
-rw-r--r--fs/btrfs/tree-log.c12
-rw-r--r--fs/btrfs/volumes.c4
-rw-r--r--fs/btrfs/xattr.c2
20 files changed, 1163 insertions, 373 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f128427b995b..69b355ae7f49 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
27#include "btrfs_inode.h" 27#include "btrfs_inode.h"
28#include "xattr.h" 28#include "xattr.h"
29 29
30#ifdef CONFIG_FS_POSIX_ACL 30#ifdef CONFIG_BTRFS_POSIX_ACL
31 31
32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) 32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
33{ 33{
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
313 .set = btrfs_xattr_acl_access_set, 313 .set = btrfs_xattr_acl_access_set,
314}; 314};
315 315
316#else /* CONFIG_FS_POSIX_ACL */ 316#else /* CONFIG_BTRFS_POSIX_ACL */
317 317
318int btrfs_acl_chmod(struct inode *inode) 318int btrfs_acl_chmod(struct inode *inode)
319{ 319{
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
325 return 0; 325 return 0;
326} 326}
327 327
328#endif /* CONFIG_FS_POSIX_ACL */ 328#endif /* CONFIG_BTRFS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 282ca085c2fb..c0861e781cdb 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,51 @@ struct btrfs_worker_thread {
64}; 64};
65 65
66/* 66/*
67 * btrfs_start_workers uses kthread_run, which can block waiting for memory
68 * for a very long time. It will actually throttle on page writeback,
69 * and so it may not make progress until after our btrfs worker threads
70 * process all of the pending work structs in their queue
71 *
72 * This means we can't use btrfs_start_workers from inside a btrfs worker
73 * thread that is used as part of cleaning dirty memory, which pretty much
74 * involves all of the worker threads.
75 *
76 * Instead we have a helper queue that never has more than one thread
77 * where we schedule thread start operations. This worker_start struct
78 * is used to contain the work and hold a pointer to the queue that needs
79 * another worker.
80 */
81struct worker_start {
82 struct btrfs_work work;
83 struct btrfs_workers *queue;
84};
85
86static void start_new_worker_func(struct btrfs_work *work)
87{
88 struct worker_start *start;
89 start = container_of(work, struct worker_start, work);
90 btrfs_start_workers(start->queue, 1);
91 kfree(start);
92}
93
94static int start_new_worker(struct btrfs_workers *queue)
95{
96 struct worker_start *start;
97 int ret;
98
99 start = kzalloc(sizeof(*start), GFP_NOFS);
100 if (!start)
101 return -ENOMEM;
102
103 start->work.func = start_new_worker_func;
104 start->queue = queue;
105 ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
106 if (ret)
107 kfree(start);
108 return ret;
109}
110
111/*
67 * helper function to move a thread onto the idle list after it 112 * helper function to move a thread onto the idle list after it
68 * has finished some requests. 113 * has finished some requests.
69 */ 114 */
@@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
118 goto out; 163 goto out;
119 164
120 workers->atomic_start_pending = 0; 165 workers->atomic_start_pending = 0;
121 if (workers->num_workers >= workers->max_workers) 166 if (workers->num_workers + workers->num_workers_starting >=
167 workers->max_workers)
122 goto out; 168 goto out;
123 169
170 workers->num_workers_starting += 1;
124 spin_unlock_irqrestore(&workers->lock, flags); 171 spin_unlock_irqrestore(&workers->lock, flags);
125 btrfs_start_workers(workers, 1); 172 start_new_worker(workers);
126 return; 173 return;
127 174
128out: 175out:
@@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
390/* 437/*
391 * simple init on struct btrfs_workers 438 * simple init on struct btrfs_workers
392 */ 439 */
393void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) 440void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
441 struct btrfs_workers *async_helper)
394{ 442{
395 workers->num_workers = 0; 443 workers->num_workers = 0;
444 workers->num_workers_starting = 0;
396 INIT_LIST_HEAD(&workers->worker_list); 445 INIT_LIST_HEAD(&workers->worker_list);
397 INIT_LIST_HEAD(&workers->idle_list); 446 INIT_LIST_HEAD(&workers->idle_list);
398 INIT_LIST_HEAD(&workers->order_list); 447 INIT_LIST_HEAD(&workers->order_list);
@@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
404 workers->name = name; 453 workers->name = name;
405 workers->ordered = 0; 454 workers->ordered = 0;
406 workers->atomic_start_pending = 0; 455 workers->atomic_start_pending = 0;
407 workers->atomic_worker_start = 0; 456 workers->atomic_worker_start = async_helper;
408} 457}
409 458
410/* 459/*
411 * starts new worker threads. This does not enforce the max worker 460 * starts new worker threads. This does not enforce the max worker
412 * count in case you need to temporarily go past it. 461 * count in case you need to temporarily go past it.
413 */ 462 */
414int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) 463static int __btrfs_start_workers(struct btrfs_workers *workers,
464 int num_workers)
415{ 465{
416 struct btrfs_worker_thread *worker; 466 struct btrfs_worker_thread *worker;
417 int ret = 0; 467 int ret = 0;
@@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
444 list_add_tail(&worker->worker_list, &workers->idle_list); 494 list_add_tail(&worker->worker_list, &workers->idle_list);
445 worker->idle = 1; 495 worker->idle = 1;
446 workers->num_workers++; 496 workers->num_workers++;
497 workers->num_workers_starting--;
498 WARN_ON(workers->num_workers_starting < 0);
447 spin_unlock_irq(&workers->lock); 499 spin_unlock_irq(&workers->lock);
448 } 500 }
449 return 0; 501 return 0;
@@ -452,6 +504,14 @@ fail:
452 return ret; 504 return ret;
453} 505}
454 506
507int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
508{
509 spin_lock_irq(&workers->lock);
510 workers->num_workers_starting += num_workers;
511 spin_unlock_irq(&workers->lock);
512 return __btrfs_start_workers(workers, num_workers);
513}
514
455/* 515/*
456 * run through the list and find a worker thread that doesn't have a lot 516 * run through the list and find a worker thread that doesn't have a lot
457 * to do right now. This can return null if we aren't yet at the thread 517 * to do right now. This can return null if we aren't yet at the thread
@@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
461{ 521{
462 struct btrfs_worker_thread *worker; 522 struct btrfs_worker_thread *worker;
463 struct list_head *next; 523 struct list_head *next;
464 int enforce_min = workers->num_workers < workers->max_workers; 524 int enforce_min;
525
526 enforce_min = (workers->num_workers + workers->num_workers_starting) <
527 workers->max_workers;
465 528
466 /* 529 /*
467 * if we find an idle thread, don't move it to the end of the 530 * if we find an idle thread, don't move it to the end of the
@@ -509,15 +572,17 @@ again:
509 worker = next_worker(workers); 572 worker = next_worker(workers);
510 573
511 if (!worker) { 574 if (!worker) {
512 if (workers->num_workers >= workers->max_workers) { 575 if (workers->num_workers + workers->num_workers_starting >=
576 workers->max_workers) {
513 goto fallback; 577 goto fallback;
514 } else if (workers->atomic_worker_start) { 578 } else if (workers->atomic_worker_start) {
515 workers->atomic_start_pending = 1; 579 workers->atomic_start_pending = 1;
516 goto fallback; 580 goto fallback;
517 } else { 581 } else {
582 workers->num_workers_starting++;
518 spin_unlock_irqrestore(&workers->lock, flags); 583 spin_unlock_irqrestore(&workers->lock, flags);
519 /* we're below the limit, start another worker */ 584 /* we're below the limit, start another worker */
520 btrfs_start_workers(workers, 1); 585 __btrfs_start_workers(workers, 1);
521 goto again; 586 goto again;
522 } 587 }
523 } 588 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index fc089b95ec14..5077746cf85e 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ struct btrfs_workers {
64 /* current number of running workers */ 64 /* current number of running workers */
65 int num_workers; 65 int num_workers;
66 66
67 int num_workers_starting;
68
67 /* max number of workers allowed. changed by btrfs_start_workers */ 69 /* max number of workers allowed. changed by btrfs_start_workers */
68 int max_workers; 70 int max_workers;
69 71
@@ -78,9 +80,10 @@ struct btrfs_workers {
78 80
79 /* 81 /*
80 * are we allowed to sleep while starting workers or are we required 82 * are we allowed to sleep while starting workers or are we required
81 * to start them at a later time? 83 * to start them at a later time? If we can't sleep, this indicates
84 * which queue we need to use to schedule thread creation.
82 */ 85 */
83 int atomic_worker_start; 86 struct btrfs_workers *atomic_worker_start;
84 87
85 /* list with all the work threads. The workers on the idle thread 88 /* list with all the work threads. The workers on the idle thread
86 * may be actively servicing jobs, but they haven't yet hit the 89 * may be actively servicing jobs, but they haven't yet hit the
@@ -109,7 +112,8 @@ struct btrfs_workers {
109int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); 112int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
110int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); 113int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
111int btrfs_stop_workers(struct btrfs_workers *workers); 114int btrfs_stop_workers(struct btrfs_workers *workers);
112void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); 115void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
116 struct btrfs_workers *async_starter);
113int btrfs_requeue_work(struct btrfs_work *work); 117int btrfs_requeue_work(struct btrfs_work *work);
114void btrfs_set_work_high_prio(struct btrfs_work *work); 118void btrfs_set_work_high_prio(struct btrfs_work *work);
115#endif 119#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 82ee56bba299..c71abec0ab90 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -128,6 +128,16 @@ struct btrfs_inode {
128 u64 last_unlink_trans; 128 u64 last_unlink_trans;
129 129
130 /* 130 /*
131 * Counters to keep track of the number of extent items we may use due
132 * to delalloc and such. outstanding_extents is the number of extent
133 * items we think we'll end up using, and reserved_extents is the number
134 * of extent items we've reserved metadata for.
135 */
136 spinlock_t accounting_lock;
137 int reserved_extents;
138 int outstanding_extents;
139
140 /*
131 * ordered_data_close is set by truncate when a file that used 141 * ordered_data_close is set by truncate when a file that used
132 * to have good data has been truncated to zero. When it is set 142 * to have good data has been truncated to zero. When it is set
133 * the btrfs file release call will add this inode to the 143 * the btrfs file release call will add this inode to the
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80599b4e42bd..1bb897ecdeeb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -675,21 +675,28 @@ struct btrfs_space_info {
675 current allocations */ 675 current allocations */
676 u64 bytes_readonly; /* total bytes that are read only */ 676 u64 bytes_readonly; /* total bytes that are read only */
677 u64 bytes_super; /* total bytes reserved for the super blocks */ 677 u64 bytes_super; /* total bytes reserved for the super blocks */
678 678 u64 bytes_root; /* the number of bytes needed to commit a
679 /* delalloc accounting */ 679 transaction */
680 u64 bytes_delalloc; /* number of bytes reserved for allocation,
681 this space is not necessarily reserved yet
682 by the allocator */
683 u64 bytes_may_use; /* number of bytes that may be used for 680 u64 bytes_may_use; /* number of bytes that may be used for
684 delalloc */ 681 delalloc/allocations */
682 u64 bytes_delalloc; /* number of bytes currently reserved for
683 delayed allocation */
685 684
686 int full; /* indicates that we cannot allocate any more 685 int full; /* indicates that we cannot allocate any more
687 chunks for this space */ 686 chunks for this space */
688 int force_alloc; /* set if we need to force a chunk alloc for 687 int force_alloc; /* set if we need to force a chunk alloc for
689 this space */ 688 this space */
689 int force_delalloc; /* make people start doing filemap_flush until
690 we're under a threshold */
690 691
691 struct list_head list; 692 struct list_head list;
692 693
694 /* for controlling how we free up space for allocations */
695 wait_queue_head_t allocate_wait;
696 wait_queue_head_t flush_wait;
697 int allocating_chunk;
698 int flushing;
699
693 /* for block groups in our same type */ 700 /* for block groups in our same type */
694 struct list_head block_groups; 701 struct list_head block_groups;
695 spinlock_t lock; 702 spinlock_t lock;
@@ -903,6 +910,7 @@ struct btrfs_fs_info {
903 * A third pool does submit_bio to avoid deadlocking with the other 910 * A third pool does submit_bio to avoid deadlocking with the other
904 * two 911 * two
905 */ 912 */
913 struct btrfs_workers generic_worker;
906 struct btrfs_workers workers; 914 struct btrfs_workers workers;
907 struct btrfs_workers delalloc_workers; 915 struct btrfs_workers delalloc_workers;
908 struct btrfs_workers endio_workers; 916 struct btrfs_workers endio_workers;
@@ -910,6 +918,7 @@ struct btrfs_fs_info {
910 struct btrfs_workers endio_meta_write_workers; 918 struct btrfs_workers endio_meta_write_workers;
911 struct btrfs_workers endio_write_workers; 919 struct btrfs_workers endio_write_workers;
912 struct btrfs_workers submit_workers; 920 struct btrfs_workers submit_workers;
921 struct btrfs_workers enospc_workers;
913 /* 922 /*
914 * fixup workers take dirty pages that didn't properly go through 923 * fixup workers take dirty pages that didn't properly go through
915 * the cow mechanism and make them safe to write. It happens 924 * the cow mechanism and make them safe to write. It happens
@@ -1001,6 +1010,8 @@ struct btrfs_root {
1001 atomic_t log_commit[2]; 1010 atomic_t log_commit[2];
1002 unsigned long log_transid; 1011 unsigned long log_transid;
1003 unsigned long log_batch; 1012 unsigned long log_batch;
1013 pid_t log_start_pid;
1014 bool log_multiple_pids;
1004 1015
1005 u64 objectid; 1016 u64 objectid;
1006 u64 last_trans; 1017 u64 last_trans;
@@ -2022,7 +2033,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2022void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 2033void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2023void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2034void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2024 2035
2025int btrfs_check_metadata_free_space(struct btrfs_root *root); 2036int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
2037int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
2038int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2039 struct inode *inode, int num_items);
2040int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2041 struct inode *inode, int num_items);
2026int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2042int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2027 u64 bytes); 2043 u64 bytes);
2028void btrfs_free_reserved_data_space(struct btrfs_root *root, 2044void btrfs_free_reserved_data_space(struct btrfs_root *root,
@@ -2314,7 +2330,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2314void btrfs_orphan_cleanup(struct btrfs_root *root); 2330void btrfs_orphan_cleanup(struct btrfs_root *root);
2315int btrfs_cont_expand(struct inode *inode, loff_t size); 2331int btrfs_cont_expand(struct inode *inode, loff_t size);
2316int btrfs_invalidate_inodes(struct btrfs_root *root); 2332int btrfs_invalidate_inodes(struct btrfs_root *root);
2317extern struct dentry_operations btrfs_dentry_operations; 2333extern const struct dentry_operations btrfs_dentry_operations;
2318 2334
2319/* ioctl.c */ 2335/* ioctl.c */
2320long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 2336long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -2326,7 +2342,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
2326int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 2342int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2327 int skip_pinned); 2343 int skip_pinned);
2328int btrfs_check_file(struct btrfs_root *root, struct inode *inode); 2344int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2329extern struct file_operations btrfs_file_operations; 2345extern const struct file_operations btrfs_file_operations;
2330int btrfs_drop_extents(struct btrfs_trans_handle *trans, 2346int btrfs_drop_extents(struct btrfs_trans_handle *trans,
2331 struct btrfs_root *root, struct inode *inode, 2347 struct btrfs_root *root, struct inode *inode,
2332 u64 start, u64 end, u64 locked_end, 2348 u64 start, u64 end, u64 locked_end,
@@ -2357,7 +2373,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
2357int btrfs_sync_fs(struct super_block *sb, int wait); 2373int btrfs_sync_fs(struct super_block *sb, int wait);
2358 2374
2359/* acl.c */ 2375/* acl.c */
2360#ifdef CONFIG_FS_POSIX_ACL 2376#ifdef CONFIG_BTRFS_POSIX_ACL
2361int btrfs_check_acl(struct inode *inode, int mask); 2377int btrfs_check_acl(struct inode *inode, int mask);
2362#else 2378#else
2363#define btrfs_check_acl NULL 2379#define btrfs_check_acl NULL
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 644e796fd643..100551a66c46 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -822,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
822 822
823int btrfs_write_tree_block(struct extent_buffer *buf) 823int btrfs_write_tree_block(struct extent_buffer *buf)
824{ 824{
825 return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, 825 return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
826 buf->start + buf->len - 1, WB_SYNC_ALL); 826 buf->start + buf->len - 1);
827} 827}
828 828
829int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) 829int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
830{ 830{
831 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, 831 return filemap_fdatawait_range(buf->first_page->mapping,
832 buf->start, buf->start + buf->len - 1); 832 buf->start, buf->start + buf->len - 1);
833} 833}
834 834
835struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, 835struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -1630,7 +1630,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1630 fs_info->sb = sb; 1630 fs_info->sb = sb;
1631 fs_info->max_extent = (u64)-1; 1631 fs_info->max_extent = (u64)-1;
1632 fs_info->max_inline = 8192 * 1024; 1632 fs_info->max_inline = 8192 * 1024;
1633 fs_info->metadata_ratio = 8; 1633 fs_info->metadata_ratio = 0;
1634 1634
1635 fs_info->thread_pool_size = min_t(unsigned long, 1635 fs_info->thread_pool_size = min_t(unsigned long,
1636 num_online_cpus() + 2, 8); 1636 num_online_cpus() + 2, 8);
@@ -1746,21 +1746,25 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1746 err = -EINVAL; 1746 err = -EINVAL;
1747 goto fail_iput; 1747 goto fail_iput;
1748 } 1748 }
1749printk("thread pool is %d\n", fs_info->thread_pool_size); 1749
1750 /* 1750 btrfs_init_workers(&fs_info->generic_worker,
1751 * we need to start all the end_io workers up front because the 1751 "genwork", 1, NULL);
1752 * queue work function gets called at interrupt time, and so it 1752
1753 * cannot dynamically grow.
1754 */
1755 btrfs_init_workers(&fs_info->workers, "worker", 1753 btrfs_init_workers(&fs_info->workers, "worker",
1756 fs_info->thread_pool_size); 1754 fs_info->thread_pool_size,
1755 &fs_info->generic_worker);
1757 1756
1758 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", 1757 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1759 fs_info->thread_pool_size); 1758 fs_info->thread_pool_size,
1759 &fs_info->generic_worker);
1760 1760
1761 btrfs_init_workers(&fs_info->submit_workers, "submit", 1761 btrfs_init_workers(&fs_info->submit_workers, "submit",
1762 min_t(u64, fs_devices->num_devices, 1762 min_t(u64, fs_devices->num_devices,
1763 fs_info->thread_pool_size)); 1763 fs_info->thread_pool_size),
1764 &fs_info->generic_worker);
1765 btrfs_init_workers(&fs_info->enospc_workers, "enospc",
1766 fs_info->thread_pool_size,
1767 &fs_info->generic_worker);
1764 1768
1765 /* a higher idle thresh on the submit workers makes it much more 1769 /* a higher idle thresh on the submit workers makes it much more
1766 * likely that bios will be sent down in a sane order to the 1770 * likely that bios will be sent down in a sane order to the
@@ -1774,15 +1778,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
1774 fs_info->delalloc_workers.idle_thresh = 2; 1778 fs_info->delalloc_workers.idle_thresh = 2;
1775 fs_info->delalloc_workers.ordered = 1; 1779 fs_info->delalloc_workers.ordered = 1;
1776 1780
1777 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); 1781 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
1782 &fs_info->generic_worker);
1778 btrfs_init_workers(&fs_info->endio_workers, "endio", 1783 btrfs_init_workers(&fs_info->endio_workers, "endio",
1779 fs_info->thread_pool_size); 1784 fs_info->thread_pool_size,
1785 &fs_info->generic_worker);
1780 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", 1786 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
1781 fs_info->thread_pool_size); 1787 fs_info->thread_pool_size,
1788 &fs_info->generic_worker);
1782 btrfs_init_workers(&fs_info->endio_meta_write_workers, 1789 btrfs_init_workers(&fs_info->endio_meta_write_workers,
1783 "endio-meta-write", fs_info->thread_pool_size); 1790 "endio-meta-write", fs_info->thread_pool_size,
1791 &fs_info->generic_worker);
1784 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 1792 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1785 fs_info->thread_pool_size); 1793 fs_info->thread_pool_size,
1794 &fs_info->generic_worker);
1786 1795
1787 /* 1796 /*
1788 * endios are largely parallel and should have a very 1797 * endios are largely parallel and should have a very
@@ -1794,12 +1803,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
1794 fs_info->endio_write_workers.idle_thresh = 2; 1803 fs_info->endio_write_workers.idle_thresh = 2;
1795 fs_info->endio_meta_write_workers.idle_thresh = 2; 1804 fs_info->endio_meta_write_workers.idle_thresh = 2;
1796 1805
1797 fs_info->endio_workers.atomic_worker_start = 1;
1798 fs_info->endio_meta_workers.atomic_worker_start = 1;
1799 fs_info->endio_write_workers.atomic_worker_start = 1;
1800 fs_info->endio_meta_write_workers.atomic_worker_start = 1;
1801
1802 btrfs_start_workers(&fs_info->workers, 1); 1806 btrfs_start_workers(&fs_info->workers, 1);
1807 btrfs_start_workers(&fs_info->generic_worker, 1);
1803 btrfs_start_workers(&fs_info->submit_workers, 1); 1808 btrfs_start_workers(&fs_info->submit_workers, 1);
1804 btrfs_start_workers(&fs_info->delalloc_workers, 1); 1809 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1805 btrfs_start_workers(&fs_info->fixup_workers, 1); 1810 btrfs_start_workers(&fs_info->fixup_workers, 1);
@@ -1807,6 +1812,7 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
1807 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 1812 btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1808 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 1813 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1809 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1814 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1815 btrfs_start_workers(&fs_info->enospc_workers, 1);
1810 1816
1811 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1817 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1812 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1818 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2012,6 +2018,7 @@ fail_chunk_root:
2012 free_extent_buffer(chunk_root->node); 2018 free_extent_buffer(chunk_root->node);
2013 free_extent_buffer(chunk_root->commit_root); 2019 free_extent_buffer(chunk_root->commit_root);
2014fail_sb_buffer: 2020fail_sb_buffer:
2021 btrfs_stop_workers(&fs_info->generic_worker);
2015 btrfs_stop_workers(&fs_info->fixup_workers); 2022 btrfs_stop_workers(&fs_info->fixup_workers);
2016 btrfs_stop_workers(&fs_info->delalloc_workers); 2023 btrfs_stop_workers(&fs_info->delalloc_workers);
2017 btrfs_stop_workers(&fs_info->workers); 2024 btrfs_stop_workers(&fs_info->workers);
@@ -2020,6 +2027,7 @@ fail_sb_buffer:
2020 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2027 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2021 btrfs_stop_workers(&fs_info->endio_write_workers); 2028 btrfs_stop_workers(&fs_info->endio_write_workers);
2022 btrfs_stop_workers(&fs_info->submit_workers); 2029 btrfs_stop_workers(&fs_info->submit_workers);
2030 btrfs_stop_workers(&fs_info->enospc_workers);
2023fail_iput: 2031fail_iput:
2024 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2032 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2025 iput(fs_info->btree_inode); 2033 iput(fs_info->btree_inode);
@@ -2437,6 +2445,7 @@ int close_ctree(struct btrfs_root *root)
2437 2445
2438 iput(fs_info->btree_inode); 2446 iput(fs_info->btree_inode);
2439 2447
2448 btrfs_stop_workers(&fs_info->generic_worker);
2440 btrfs_stop_workers(&fs_info->fixup_workers); 2449 btrfs_stop_workers(&fs_info->fixup_workers);
2441 btrfs_stop_workers(&fs_info->delalloc_workers); 2450 btrfs_stop_workers(&fs_info->delalloc_workers);
2442 btrfs_stop_workers(&fs_info->workers); 2451 btrfs_stop_workers(&fs_info->workers);
@@ -2445,6 +2454,7 @@ int close_ctree(struct btrfs_root *root)
2445 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2454 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2446 btrfs_stop_workers(&fs_info->endio_write_workers); 2455 btrfs_stop_workers(&fs_info->endio_write_workers);
2447 btrfs_stop_workers(&fs_info->submit_workers); 2456 btrfs_stop_workers(&fs_info->submit_workers);
2457 btrfs_stop_workers(&fs_info->enospc_workers);
2448 2458
2449 btrfs_close_devices(fs_info->fs_devices); 2459 btrfs_close_devices(fs_info->fs_devices);
2450 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2460 btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 993f93ff7ba6..d0c4d584efad 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -68,6 +68,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
68 struct extent_buffer **must_clean); 68 struct extent_buffer **must_clean);
69static int find_next_key(struct btrfs_path *path, int level, 69static int find_next_key(struct btrfs_path *path, int level,
70 struct btrfs_key *key); 70 struct btrfs_key *key);
71static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
72 int dump_block_groups);
71 73
72static noinline int 74static noinline int
73block_group_cache_done(struct btrfs_block_group_cache *cache) 75block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -2765,67 +2767,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2765 alloc_target); 2767 alloc_target);
2766} 2768}
2767 2769
2770static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2771{
2772 u64 num_bytes;
2773 int level;
2774
2775 level = BTRFS_MAX_LEVEL - 2;
2776 /*
2777 * NOTE: these calculations are absolutely the worst possible case.
2778 * This assumes that _every_ item we insert will require a new leaf, and
2779 * that the tree has grown to its maximum level size.
2780 */
2781
2782 /*
2783 * for every item we insert we could insert both an extent item and an
2784 * extent ref item. Then for every item we insert, we will need to cow
2785 * both the original leaf, plus the leaf to the left and right of it.
2786 *
2787 * Unless we are talking about the extent root, then we just want the
2788 * number of items * 2, since we just need the extent item plus its ref.
2789 */
2790 if (root == root->fs_info->extent_root)
2791 num_bytes = num_items * 2;
2792 else
2793 num_bytes = (num_items + (2 * num_items)) * 3;
2794
2795 /*
2796 * num_bytes is total number of leaves we could need times the leaf
2797 * size, and then for every leaf we could end up cow'ing 2 nodes per
2798 * level, down to the leaf level.
2799 */
2800 num_bytes = (num_bytes * root->leafsize) +
2801 (num_bytes * (level * 2)) * root->nodesize;
2802
2803 return num_bytes;
2804}
2805
2768/* 2806/*
2769 * for now this just makes sure we have at least 5% of our metadata space free 2807 * Unreserve metadata space for delalloc. If we have less reserved credits than
2770 * for use. 2808 * we have extents, this function does nothing.
2771 */ 2809 */
2772int btrfs_check_metadata_free_space(struct btrfs_root *root) 2810int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2811 struct inode *inode, int num_items)
2773{ 2812{
2774 struct btrfs_fs_info *info = root->fs_info; 2813 struct btrfs_fs_info *info = root->fs_info;
2775 struct btrfs_space_info *meta_sinfo; 2814 struct btrfs_space_info *meta_sinfo;
2776 u64 alloc_target, thresh; 2815 u64 num_bytes;
2777 int committed = 0, ret; 2816 u64 alloc_target;
2817 bool bug = false;
2778 2818
2779 /* get the space info for where the metadata will live */ 2819 /* get the space info for where the metadata will live */
2780 alloc_target = btrfs_get_alloc_profile(root, 0); 2820 alloc_target = btrfs_get_alloc_profile(root, 0);
2781 meta_sinfo = __find_space_info(info, alloc_target); 2821 meta_sinfo = __find_space_info(info, alloc_target);
2782 if (!meta_sinfo)
2783 goto alloc;
2784 2822
2785again: 2823 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2824 num_items);
2825
2786 spin_lock(&meta_sinfo->lock); 2826 spin_lock(&meta_sinfo->lock);
2787 if (!meta_sinfo->full) 2827 spin_lock(&BTRFS_I(inode)->accounting_lock);
2788 thresh = meta_sinfo->total_bytes * 80; 2828 if (BTRFS_I(inode)->reserved_extents <=
2789 else 2829 BTRFS_I(inode)->outstanding_extents) {
2790 thresh = meta_sinfo->total_bytes * 95; 2830 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2831 spin_unlock(&meta_sinfo->lock);
2832 return 0;
2833 }
2834 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2835
2836 BTRFS_I(inode)->reserved_extents--;
2837 BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2838
2839 if (meta_sinfo->bytes_delalloc < num_bytes) {
2840 bug = true;
2841 meta_sinfo->bytes_delalloc = 0;
2842 } else {
2843 meta_sinfo->bytes_delalloc -= num_bytes;
2844 }
2845 spin_unlock(&meta_sinfo->lock);
2846
2847 BUG_ON(bug);
2791 2848
2849 return 0;
2850}
2851
2852static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2853{
2854 u64 thresh;
2855
2856 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2857 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2858 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2859 meta_sinfo->bytes_may_use;
2860
2861 thresh = meta_sinfo->total_bytes - thresh;
2862 thresh *= 80;
2792 do_div(thresh, 100); 2863 do_div(thresh, 100);
2864 if (thresh <= meta_sinfo->bytes_delalloc)
2865 meta_sinfo->force_delalloc = 1;
2866 else
2867 meta_sinfo->force_delalloc = 0;
2868}
2793 2869
2794 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2870struct async_flush {
2795 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2871 struct btrfs_root *root;
2796 meta_sinfo->bytes_super > thresh) { 2872 struct btrfs_space_info *info;
2797 struct btrfs_trans_handle *trans; 2873 struct btrfs_work work;
2798 if (!meta_sinfo->full) { 2874};
2799 meta_sinfo->force_alloc = 1; 2875
2876static noinline void flush_delalloc_async(struct btrfs_work *work)
2877{
2878 struct async_flush *async;
2879 struct btrfs_root *root;
2880 struct btrfs_space_info *info;
2881
2882 async = container_of(work, struct async_flush, work);
2883 root = async->root;
2884 info = async->info;
2885
2886 btrfs_start_delalloc_inodes(root);
2887 wake_up(&info->flush_wait);
2888 btrfs_wait_ordered_extents(root, 0);
2889
2890 spin_lock(&info->lock);
2891 info->flushing = 0;
2892 spin_unlock(&info->lock);
2893 wake_up(&info->flush_wait);
2894
2895 kfree(async);
2896}
2897
2898static void wait_on_flush(struct btrfs_space_info *info)
2899{
2900 DEFINE_WAIT(wait);
2901 u64 used;
2902
2903 while (1) {
2904 prepare_to_wait(&info->flush_wait, &wait,
2905 TASK_UNINTERRUPTIBLE);
2906 spin_lock(&info->lock);
2907 if (!info->flushing) {
2908 spin_unlock(&info->lock);
2909 break;
2910 }
2911
2912 used = info->bytes_used + info->bytes_reserved +
2913 info->bytes_pinned + info->bytes_readonly +
2914 info->bytes_super + info->bytes_root +
2915 info->bytes_may_use + info->bytes_delalloc;
2916 if (used < info->total_bytes) {
2917 spin_unlock(&info->lock);
2918 break;
2919 }
2920 spin_unlock(&info->lock);
2921 schedule();
2922 }
2923 finish_wait(&info->flush_wait, &wait);
2924}
2925
2926static void flush_delalloc(struct btrfs_root *root,
2927 struct btrfs_space_info *info)
2928{
2929 struct async_flush *async;
2930 bool wait = false;
2931
2932 spin_lock(&info->lock);
2933
2934 if (!info->flushing) {
2935 info->flushing = 1;
2936 init_waitqueue_head(&info->flush_wait);
2937 } else {
2938 wait = true;
2939 }
2940
2941 spin_unlock(&info->lock);
2942
2943 if (wait) {
2944 wait_on_flush(info);
2945 return;
2946 }
2947
2948 async = kzalloc(sizeof(*async), GFP_NOFS);
2949 if (!async)
2950 goto flush;
2951
2952 async->root = root;
2953 async->info = info;
2954 async->work.func = flush_delalloc_async;
2955
2956 btrfs_queue_worker(&root->fs_info->enospc_workers,
2957 &async->work);
2958 wait_on_flush(info);
2959 return;
2960
2961flush:
2962 btrfs_start_delalloc_inodes(root);
2963 btrfs_wait_ordered_extents(root, 0);
2964
2965 spin_lock(&info->lock);
2966 info->flushing = 0;
2967 spin_unlock(&info->lock);
2968 wake_up(&info->flush_wait);
2969}
2970
2971static int maybe_allocate_chunk(struct btrfs_root *root,
2972 struct btrfs_space_info *info)
2973{
2974 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2975 struct btrfs_trans_handle *trans;
2976 bool wait = false;
2977 int ret = 0;
2978 u64 min_metadata;
2979 u64 free_space;
2980
2981 free_space = btrfs_super_total_bytes(disk_super);
2982 /*
2983 * we allow the metadata to grow to a max of either 5gb or 5% of the
2984 * space in the volume.
2985 */
2986 min_metadata = min((u64)5 * 1024 * 1024 * 1024,
2987 div64_u64(free_space * 5, 100));
2988 if (info->total_bytes >= min_metadata) {
2989 spin_unlock(&info->lock);
2990 return 0;
2991 }
2992
2993 if (info->full) {
2994 spin_unlock(&info->lock);
2995 return 0;
2996 }
2997
2998 if (!info->allocating_chunk) {
2999 info->force_alloc = 1;
3000 info->allocating_chunk = 1;
3001 init_waitqueue_head(&info->allocate_wait);
3002 } else {
3003 wait = true;
3004 }
3005
3006 spin_unlock(&info->lock);
3007
3008 if (wait) {
3009 wait_event(info->allocate_wait,
3010 !info->allocating_chunk);
3011 return 1;
3012 }
3013
3014 trans = btrfs_start_transaction(root, 1);
3015 if (!trans) {
3016 ret = -ENOMEM;
3017 goto out;
3018 }
3019
3020 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3021 4096 + 2 * 1024 * 1024,
3022 info->flags, 0);
3023 btrfs_end_transaction(trans, root);
3024 if (ret)
3025 goto out;
3026out:
3027 spin_lock(&info->lock);
3028 info->allocating_chunk = 0;
3029 spin_unlock(&info->lock);
3030 wake_up(&info->allocate_wait);
3031
3032 if (ret)
3033 return 0;
3034 return 1;
3035}
3036
3037/*
3038 * Reserve metadata space for delalloc.
3039 */
3040int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3041 struct inode *inode, int num_items)
3042{
3043 struct btrfs_fs_info *info = root->fs_info;
3044 struct btrfs_space_info *meta_sinfo;
3045 u64 num_bytes;
3046 u64 used;
3047 u64 alloc_target;
3048 int flushed = 0;
3049 int force_delalloc;
3050
3051 /* get the space info for where the metadata will live */
3052 alloc_target = btrfs_get_alloc_profile(root, 0);
3053 meta_sinfo = __find_space_info(info, alloc_target);
3054
3055 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3056 num_items);
3057again:
3058 spin_lock(&meta_sinfo->lock);
3059
3060 force_delalloc = meta_sinfo->force_delalloc;
3061
3062 if (unlikely(!meta_sinfo->bytes_root))
3063 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3064
3065 if (!flushed)
3066 meta_sinfo->bytes_delalloc += num_bytes;
3067
3068 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3069 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3070 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3071 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3072
3073 if (used > meta_sinfo->total_bytes) {
3074 flushed++;
3075
3076 if (flushed == 1) {
3077 if (maybe_allocate_chunk(root, meta_sinfo))
3078 goto again;
3079 flushed++;
3080 } else {
2800 spin_unlock(&meta_sinfo->lock); 3081 spin_unlock(&meta_sinfo->lock);
2801alloc: 3082 }
2802 trans = btrfs_start_transaction(root, 1);
2803 if (!trans)
2804 return -ENOMEM;
2805 3083
2806 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3084 if (flushed == 2) {
2807 2 * 1024 * 1024, alloc_target, 0); 3085 filemap_flush(inode->i_mapping);
2808 btrfs_end_transaction(trans, root); 3086 goto again;
2809 if (!meta_sinfo) { 3087 } else if (flushed == 3) {
2810 meta_sinfo = __find_space_info(info, 3088 flush_delalloc(root, meta_sinfo);
2811 alloc_target);
2812 }
2813 goto again; 3089 goto again;
2814 } 3090 }
3091 spin_lock(&meta_sinfo->lock);
3092 meta_sinfo->bytes_delalloc -= num_bytes;
2815 spin_unlock(&meta_sinfo->lock); 3093 spin_unlock(&meta_sinfo->lock);
3094 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3095 BTRFS_I(inode)->outstanding_extents,
3096 BTRFS_I(inode)->reserved_extents);
3097 dump_space_info(meta_sinfo, 0, 0);
3098 return -ENOSPC;
3099 }
2816 3100
2817 if (!committed) { 3101 BTRFS_I(inode)->reserved_extents++;
2818 committed = 1; 3102 check_force_delalloc(meta_sinfo);
2819 trans = btrfs_join_transaction(root, 1); 3103 spin_unlock(&meta_sinfo->lock);
2820 if (!trans) 3104
2821 return -ENOMEM; 3105 if (!flushed && force_delalloc)
2822 ret = btrfs_commit_transaction(trans, root); 3106 filemap_flush(inode->i_mapping);
2823 if (ret) 3107
2824 return ret; 3108 return 0;
3109}
3110
3111/*
3112 * unreserve num_items number of items worth of metadata space. This needs to
3113 * be paired with btrfs_reserve_metadata_space.
3114 *
3115 * NOTE: if you have the option, run this _AFTER_ you do a
3116 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3117 * operations which will result in more used metadata, so we want to make sure we
3118 * can do that without issue.
3119 */
3120int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3121{
3122 struct btrfs_fs_info *info = root->fs_info;
3123 struct btrfs_space_info *meta_sinfo;
3124 u64 num_bytes;
3125 u64 alloc_target;
3126 bool bug = false;
3127
3128 /* get the space info for where the metadata will live */
3129 alloc_target = btrfs_get_alloc_profile(root, 0);
3130 meta_sinfo = __find_space_info(info, alloc_target);
3131
3132 num_bytes = calculate_bytes_needed(root, num_items);
3133
3134 spin_lock(&meta_sinfo->lock);
3135 if (meta_sinfo->bytes_may_use < num_bytes) {
3136 bug = true;
3137 meta_sinfo->bytes_may_use = 0;
3138 } else {
3139 meta_sinfo->bytes_may_use -= num_bytes;
3140 }
3141 spin_unlock(&meta_sinfo->lock);
3142
3143 BUG_ON(bug);
3144
3145 return 0;
3146}
3147
3148/*
3149 * Reserve some metadata space for use. We'll calculate the worst case number
3150 * of bytes that would be needed to modify num_items number of items. If we
3151 * have space, fantastic, if not, you get -ENOSPC. Please call
3152 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3153 * items you reserved, since whatever metadata you needed should have already
3154 * been allocated.
3155 *
3156 * This will commit the transaction to make more space if we don't have enough
3157 * metadata space. The only time we don't do this is if we're reserving space
3158 * inside of a transaction, then we will just return -ENOSPC and it is the
3159 * caller's responsibility to handle it properly.
3160 */
3161int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3162{
3163 struct btrfs_fs_info *info = root->fs_info;
3164 struct btrfs_space_info *meta_sinfo;
3165 u64 num_bytes;
3166 u64 used;
3167 u64 alloc_target;
3168 int retries = 0;
3169
3170 /* get the space info for where the metadata will live */
3171 alloc_target = btrfs_get_alloc_profile(root, 0);
3172 meta_sinfo = __find_space_info(info, alloc_target);
3173
3174 num_bytes = calculate_bytes_needed(root, num_items);
3175again:
3176 spin_lock(&meta_sinfo->lock);
3177
3178 if (unlikely(!meta_sinfo->bytes_root))
3179 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3180
3181 if (!retries)
3182 meta_sinfo->bytes_may_use += num_bytes;
3183
3184 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3185 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3186 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3187 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3188
3189 if (used > meta_sinfo->total_bytes) {
3190 retries++;
3191 if (retries == 1) {
3192 if (maybe_allocate_chunk(root, meta_sinfo))
3193 goto again;
3194 retries++;
3195 } else {
3196 spin_unlock(&meta_sinfo->lock);
3197 }
3198
3199 if (retries == 2) {
3200 flush_delalloc(root, meta_sinfo);
2825 goto again; 3201 goto again;
2826 } 3202 }
3203 spin_lock(&meta_sinfo->lock);
3204 meta_sinfo->bytes_may_use -= num_bytes;
3205 spin_unlock(&meta_sinfo->lock);
3206
3207 dump_space_info(meta_sinfo, 0, 0);
2827 return -ENOSPC; 3208 return -ENOSPC;
2828 } 3209 }
3210
3211 check_force_delalloc(meta_sinfo);
2829 spin_unlock(&meta_sinfo->lock); 3212 spin_unlock(&meta_sinfo->lock);
2830 3213
2831 return 0; 3214 return 0;
@@ -2888,7 +3271,7 @@ alloc:
2888 spin_unlock(&data_sinfo->lock); 3271 spin_unlock(&data_sinfo->lock);
2889 3272
2890 /* commit the current transaction and try again */ 3273 /* commit the current transaction and try again */
2891 if (!committed) { 3274 if (!committed && !root->fs_info->open_ioctl_trans) {
2892 committed = 1; 3275 committed = 1;
2893 trans = btrfs_join_transaction(root, 1); 3276 trans = btrfs_join_transaction(root, 1);
2894 if (!trans) 3277 if (!trans)
@@ -2916,7 +3299,7 @@ alloc:
2916 BTRFS_I(inode)->reserved_bytes += bytes; 3299 BTRFS_I(inode)->reserved_bytes += bytes;
2917 spin_unlock(&data_sinfo->lock); 3300 spin_unlock(&data_sinfo->lock);
2918 3301
2919 return btrfs_check_metadata_free_space(root); 3302 return 0;
2920} 3303}
2921 3304
2922/* 3305/*
@@ -3015,17 +3398,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3015 BUG_ON(!space_info); 3398 BUG_ON(!space_info);
3016 3399
3017 spin_lock(&space_info->lock); 3400 spin_lock(&space_info->lock);
3018 if (space_info->force_alloc) { 3401 if (space_info->force_alloc)
3019 force = 1; 3402 force = 1;
3020 space_info->force_alloc = 0;
3021 }
3022 if (space_info->full) { 3403 if (space_info->full) {
3023 spin_unlock(&space_info->lock); 3404 spin_unlock(&space_info->lock);
3024 goto out; 3405 goto out;
3025 } 3406 }
3026 3407
3027 thresh = space_info->total_bytes - space_info->bytes_readonly; 3408 thresh = space_info->total_bytes - space_info->bytes_readonly;
3028 thresh = div_factor(thresh, 6); 3409 thresh = div_factor(thresh, 8);
3029 if (!force && 3410 if (!force &&
3030 (space_info->bytes_used + space_info->bytes_pinned + 3411 (space_info->bytes_used + space_info->bytes_pinned +
3031 space_info->bytes_reserved + alloc_bytes) < thresh) { 3412 space_info->bytes_reserved + alloc_bytes) < thresh) {
@@ -3039,7 +3420,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3039 * we keep a reasonable number of metadata chunks allocated in the 3420 * we keep a reasonable number of metadata chunks allocated in the
3040 * FS as well. 3421 * FS as well.
3041 */ 3422 */
3042 if (flags & BTRFS_BLOCK_GROUP_DATA) { 3423 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3043 fs_info->data_chunk_allocations++; 3424 fs_info->data_chunk_allocations++;
3044 if (!(fs_info->data_chunk_allocations % 3425 if (!(fs_info->data_chunk_allocations %
3045 fs_info->metadata_ratio)) 3426 fs_info->metadata_ratio))
@@ -3047,8 +3428,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3047 } 3428 }
3048 3429
3049 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3430 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3431 spin_lock(&space_info->lock);
3050 if (ret) 3432 if (ret)
3051 space_info->full = 1; 3433 space_info->full = 1;
3434 space_info->force_alloc = 0;
3435 spin_unlock(&space_info->lock);
3052out: 3436out:
3053 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3437 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3054 return ret; 3438 return ret;
@@ -3747,6 +4131,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3747 int loop = 0; 4131 int loop = 0;
3748 bool found_uncached_bg = false; 4132 bool found_uncached_bg = false;
3749 bool failed_cluster_refill = false; 4133 bool failed_cluster_refill = false;
4134 bool failed_alloc = false;
3750 4135
3751 WARN_ON(num_bytes < root->sectorsize); 4136 WARN_ON(num_bytes < root->sectorsize);
3752 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 4137 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3951,14 +4336,23 @@ refill_cluster:
3951 4336
3952 offset = btrfs_find_space_for_alloc(block_group, search_start, 4337 offset = btrfs_find_space_for_alloc(block_group, search_start,
3953 num_bytes, empty_size); 4338 num_bytes, empty_size);
3954 if (!offset && (cached || (!cached && 4339 /*
3955 loop == LOOP_CACHING_NOWAIT))) { 4340 * If we didn't find a chunk, and we haven't failed on this
3956 goto loop; 4341 * block group before, and this block group is in the middle of
3957 } else if (!offset && (!cached && 4342 * caching and we are ok with waiting, then go ahead and wait
3958 loop > LOOP_CACHING_NOWAIT)) { 4343 * for progress to be made, and set failed_alloc to true.
4344 *
4345 * If failed_alloc is true then we've already waited on this
4346 * block group once and should move on to the next block group.
4347 */
4348 if (!offset && !failed_alloc && !cached &&
4349 loop > LOOP_CACHING_NOWAIT) {
3959 wait_block_group_cache_progress(block_group, 4350 wait_block_group_cache_progress(block_group,
3960 num_bytes + empty_size); 4351 num_bytes + empty_size);
4352 failed_alloc = true;
3961 goto have_block_group; 4353 goto have_block_group;
4354 } else if (!offset) {
4355 goto loop;
3962 } 4356 }
3963checks: 4357checks:
3964 search_start = stripe_align(root, offset); 4358 search_start = stripe_align(root, offset);
@@ -4006,6 +4400,7 @@ checks:
4006 break; 4400 break;
4007loop: 4401loop:
4008 failed_cluster_refill = false; 4402 failed_cluster_refill = false;
4403 failed_alloc = false;
4009 btrfs_put_block_group(block_group); 4404 btrfs_put_block_group(block_group);
4010 } 4405 }
4011 up_read(&space_info->groups_sem); 4406 up_read(&space_info->groups_sem);
@@ -4063,21 +4458,32 @@ loop:
4063 return ret; 4458 return ret;
4064} 4459}
4065 4460
4066static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 4461static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4462 int dump_block_groups)
4067{ 4463{
4068 struct btrfs_block_group_cache *cache; 4464 struct btrfs_block_group_cache *cache;
4069 4465
4466 spin_lock(&info->lock);
4070 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4467 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4071 (unsigned long long)(info->total_bytes - info->bytes_used - 4468 (unsigned long long)(info->total_bytes - info->bytes_used -
4072 info->bytes_pinned - info->bytes_reserved), 4469 info->bytes_pinned - info->bytes_reserved -
4470 info->bytes_super),
4073 (info->full) ? "" : "not "); 4471 (info->full) ? "" : "not ");
4074 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4472 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
4075 " may_use=%llu, used=%llu\n", 4473 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
4474 "\n",
4076 (unsigned long long)info->total_bytes, 4475 (unsigned long long)info->total_bytes,
4077 (unsigned long long)info->bytes_pinned, 4476 (unsigned long long)info->bytes_pinned,
4078 (unsigned long long)info->bytes_delalloc, 4477 (unsigned long long)info->bytes_delalloc,
4079 (unsigned long long)info->bytes_may_use, 4478 (unsigned long long)info->bytes_may_use,
4080 (unsigned long long)info->bytes_used); 4479 (unsigned long long)info->bytes_used,
4480 (unsigned long long)info->bytes_root,
4481 (unsigned long long)info->bytes_super,
4482 (unsigned long long)info->bytes_reserved);
4483 spin_unlock(&info->lock);
4484
4485 if (!dump_block_groups)
4486 return;
4081 4487
4082 down_read(&info->groups_sem); 4488 down_read(&info->groups_sem);
4083 list_for_each_entry(cache, &info->block_groups, list) { 4489 list_for_each_entry(cache, &info->block_groups, list) {
@@ -4145,7 +4551,7 @@ again:
4145 printk(KERN_ERR "btrfs allocation failed flags %llu, " 4551 printk(KERN_ERR "btrfs allocation failed flags %llu, "
4146 "wanted %llu\n", (unsigned long long)data, 4552 "wanted %llu\n", (unsigned long long)data,
4147 (unsigned long long)num_bytes); 4553 (unsigned long long)num_bytes);
4148 dump_space_info(sinfo, num_bytes); 4554 dump_space_info(sinfo, num_bytes, 1);
4149 } 4555 }
4150 4556
4151 return ret; 4557 return ret;
@@ -4506,6 +4912,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4506 u64 bytenr; 4912 u64 bytenr;
4507 u64 generation; 4913 u64 generation;
4508 u64 refs; 4914 u64 refs;
4915 u64 flags;
4509 u64 last = 0; 4916 u64 last = 0;
4510 u32 nritems; 4917 u32 nritems;
4511 u32 blocksize; 4918 u32 blocksize;
@@ -4543,15 +4950,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4543 generation <= root->root_key.offset) 4950 generation <= root->root_key.offset)
4544 continue; 4951 continue;
4545 4952
4953 /* We don't lock the tree block, it's OK to be racy here */
4954 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
4955 &refs, &flags);
4956 BUG_ON(ret);
4957 BUG_ON(refs == 0);
4958
4546 if (wc->stage == DROP_REFERENCE) { 4959 if (wc->stage == DROP_REFERENCE) {
4547 ret = btrfs_lookup_extent_info(trans, root,
4548 bytenr, blocksize,
4549 &refs, NULL);
4550 BUG_ON(ret);
4551 BUG_ON(refs == 0);
4552 if (refs == 1) 4960 if (refs == 1)
4553 goto reada; 4961 goto reada;
4554 4962
4963 if (wc->level == 1 &&
4964 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4965 continue;
4555 if (!wc->update_ref || 4966 if (!wc->update_ref ||
4556 generation <= root->root_key.offset) 4967 generation <= root->root_key.offset)
4557 continue; 4968 continue;
@@ -4560,6 +4971,10 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4560 &wc->update_progress); 4971 &wc->update_progress);
4561 if (ret < 0) 4972 if (ret < 0)
4562 continue; 4973 continue;
4974 } else {
4975 if (wc->level == 1 &&
4976 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4977 continue;
4563 } 4978 }
4564reada: 4979reada:
4565 ret = readahead_tree_block(root, bytenr, blocksize, 4980 ret = readahead_tree_block(root, bytenr, blocksize,
@@ -4583,7 +4998,7 @@ reada:
4583static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 4998static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4584 struct btrfs_root *root, 4999 struct btrfs_root *root,
4585 struct btrfs_path *path, 5000 struct btrfs_path *path,
4586 struct walk_control *wc) 5001 struct walk_control *wc, int lookup_info)
4587{ 5002{
4588 int level = wc->level; 5003 int level = wc->level;
4589 struct extent_buffer *eb = path->nodes[level]; 5004 struct extent_buffer *eb = path->nodes[level];
@@ -4598,8 +5013,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4598 * when reference count of tree block is 1, it won't increase 5013 * when reference count of tree block is 1, it won't increase
4599 * again. once full backref flag is set, we never clear it. 5014 * again. once full backref flag is set, we never clear it.
4600 */ 5015 */
4601 if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 5016 if (lookup_info &&
4602 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { 5017 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5018 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
4603 BUG_ON(!path->locks[level]); 5019 BUG_ON(!path->locks[level]);
4604 ret = btrfs_lookup_extent_info(trans, root, 5020 ret = btrfs_lookup_extent_info(trans, root,
4605 eb->start, eb->len, 5021 eb->start, eb->len,
@@ -4660,7 +5076,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4660static noinline int do_walk_down(struct btrfs_trans_handle *trans, 5076static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4661 struct btrfs_root *root, 5077 struct btrfs_root *root,
4662 struct btrfs_path *path, 5078 struct btrfs_path *path,
4663 struct walk_control *wc) 5079 struct walk_control *wc, int *lookup_info)
4664{ 5080{
4665 u64 bytenr; 5081 u64 bytenr;
4666 u64 generation; 5082 u64 generation;
@@ -4680,8 +5096,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4680 * for the subtree 5096 * for the subtree
4681 */ 5097 */
4682 if (wc->stage == UPDATE_BACKREF && 5098 if (wc->stage == UPDATE_BACKREF &&
4683 generation <= root->root_key.offset) 5099 generation <= root->root_key.offset) {
5100 *lookup_info = 1;
4684 return 1; 5101 return 1;
5102 }
4685 5103
4686 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 5104 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
4687 blocksize = btrfs_level_size(root, level - 1); 5105 blocksize = btrfs_level_size(root, level - 1);
@@ -4694,14 +5112,19 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4694 btrfs_tree_lock(next); 5112 btrfs_tree_lock(next);
4695 btrfs_set_lock_blocking(next); 5113 btrfs_set_lock_blocking(next);
4696 5114
4697 if (wc->stage == DROP_REFERENCE) { 5115 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
4698 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, 5116 &wc->refs[level - 1],
4699 &wc->refs[level - 1], 5117 &wc->flags[level - 1]);
4700 &wc->flags[level - 1]); 5118 BUG_ON(ret);
4701 BUG_ON(ret); 5119 BUG_ON(wc->refs[level - 1] == 0);
4702 BUG_ON(wc->refs[level - 1] == 0); 5120 *lookup_info = 0;
4703 5121
5122 if (wc->stage == DROP_REFERENCE) {
4704 if (wc->refs[level - 1] > 1) { 5123 if (wc->refs[level - 1] > 1) {
5124 if (level == 1 &&
5125 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5126 goto skip;
5127
4705 if (!wc->update_ref || 5128 if (!wc->update_ref ||
4706 generation <= root->root_key.offset) 5129 generation <= root->root_key.offset)
4707 goto skip; 5130 goto skip;
@@ -4715,12 +5138,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4715 wc->stage = UPDATE_BACKREF; 5138 wc->stage = UPDATE_BACKREF;
4716 wc->shared_level = level - 1; 5139 wc->shared_level = level - 1;
4717 } 5140 }
5141 } else {
5142 if (level == 1 &&
5143 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5144 goto skip;
4718 } 5145 }
4719 5146
4720 if (!btrfs_buffer_uptodate(next, generation)) { 5147 if (!btrfs_buffer_uptodate(next, generation)) {
4721 btrfs_tree_unlock(next); 5148 btrfs_tree_unlock(next);
4722 free_extent_buffer(next); 5149 free_extent_buffer(next);
4723 next = NULL; 5150 next = NULL;
5151 *lookup_info = 1;
4724 } 5152 }
4725 5153
4726 if (!next) { 5154 if (!next) {
@@ -4743,21 +5171,22 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4743skip: 5171skip:
4744 wc->refs[level - 1] = 0; 5172 wc->refs[level - 1] = 0;
4745 wc->flags[level - 1] = 0; 5173 wc->flags[level - 1] = 0;
5174 if (wc->stage == DROP_REFERENCE) {
5175 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5176 parent = path->nodes[level]->start;
5177 } else {
5178 BUG_ON(root->root_key.objectid !=
5179 btrfs_header_owner(path->nodes[level]));
5180 parent = 0;
5181 }
4746 5182
4747 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 5183 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
4748 parent = path->nodes[level]->start; 5184 root->root_key.objectid, level - 1, 0);
4749 } else { 5185 BUG_ON(ret);
4750 BUG_ON(root->root_key.objectid !=
4751 btrfs_header_owner(path->nodes[level]));
4752 parent = 0;
4753 } 5186 }
4754
4755 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
4756 root->root_key.objectid, level - 1, 0);
4757 BUG_ON(ret);
4758
4759 btrfs_tree_unlock(next); 5187 btrfs_tree_unlock(next);
4760 free_extent_buffer(next); 5188 free_extent_buffer(next);
5189 *lookup_info = 1;
4761 return 1; 5190 return 1;
4762} 5191}
4763 5192
@@ -4871,6 +5300,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4871 struct walk_control *wc) 5300 struct walk_control *wc)
4872{ 5301{
4873 int level = wc->level; 5302 int level = wc->level;
5303 int lookup_info = 1;
4874 int ret; 5304 int ret;
4875 5305
4876 while (level >= 0) { 5306 while (level >= 0) {
@@ -4878,14 +5308,14 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4878 btrfs_header_nritems(path->nodes[level])) 5308 btrfs_header_nritems(path->nodes[level]))
4879 break; 5309 break;
4880 5310
4881 ret = walk_down_proc(trans, root, path, wc); 5311 ret = walk_down_proc(trans, root, path, wc, lookup_info);
4882 if (ret > 0) 5312 if (ret > 0)
4883 break; 5313 break;
4884 5314
4885 if (level == 0) 5315 if (level == 0)
4886 break; 5316 break;
4887 5317
4888 ret = do_walk_down(trans, root, path, wc); 5318 ret = do_walk_down(trans, root, path, wc, &lookup_info);
4889 if (ret > 0) { 5319 if (ret > 0) {
4890 path->slots[level]++; 5320 path->slots[level]++;
4891 continue; 5321 continue;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0cb88f8146ea..96577e8bf9fd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
280 return NULL; 280 return NULL;
281} 281}
282 282
283static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
284 struct extent_state *other)
285{
286 if (tree->ops && tree->ops->merge_extent_hook)
287 tree->ops->merge_extent_hook(tree->mapping->host, new,
288 other);
289}
290
283/* 291/*
284 * utility function to look for merge candidates inside a given range. 292 * utility function to look for merge candidates inside a given range.
285 * Any extents with matching state are merged together into a single 293 * Any extents with matching state are merged together into a single
@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
303 other = rb_entry(other_node, struct extent_state, rb_node); 311 other = rb_entry(other_node, struct extent_state, rb_node);
304 if (other->end == state->start - 1 && 312 if (other->end == state->start - 1 &&
305 other->state == state->state) { 313 other->state == state->state) {
314 merge_cb(tree, state, other);
306 state->start = other->start; 315 state->start = other->start;
307 other->tree = NULL; 316 other->tree = NULL;
308 rb_erase(&other->rb_node, &tree->state); 317 rb_erase(&other->rb_node, &tree->state);
@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
314 other = rb_entry(other_node, struct extent_state, rb_node); 323 other = rb_entry(other_node, struct extent_state, rb_node);
315 if (other->start == state->end + 1 && 324 if (other->start == state->end + 1 &&
316 other->state == state->state) { 325 other->state == state->state) {
326 merge_cb(tree, state, other);
317 other->start = state->start; 327 other->start = state->start;
318 state->tree = NULL; 328 state->tree = NULL;
319 rb_erase(&state->rb_node, &tree->state); 329 rb_erase(&state->rb_node, &tree->state);
320 free_extent_state(state); 330 free_extent_state(state);
331 state = NULL;
321 } 332 }
322 } 333 }
334
323 return 0; 335 return 0;
324} 336}
325 337
326static void set_state_cb(struct extent_io_tree *tree, 338static int set_state_cb(struct extent_io_tree *tree,
327 struct extent_state *state, 339 struct extent_state *state,
328 unsigned long bits) 340 unsigned long bits)
329{ 341{
330 if (tree->ops && tree->ops->set_bit_hook) { 342 if (tree->ops && tree->ops->set_bit_hook) {
331 tree->ops->set_bit_hook(tree->mapping->host, state->start, 343 return tree->ops->set_bit_hook(tree->mapping->host,
332 state->end, state->state, bits); 344 state->start, state->end,
345 state->state, bits);
333 } 346 }
347
348 return 0;
334} 349}
335 350
336static void clear_state_cb(struct extent_io_tree *tree, 351static void clear_state_cb(struct extent_io_tree *tree,
337 struct extent_state *state, 352 struct extent_state *state,
338 unsigned long bits) 353 unsigned long bits)
339{ 354{
340 if (tree->ops && tree->ops->clear_bit_hook) { 355 if (tree->ops && tree->ops->clear_bit_hook)
341 tree->ops->clear_bit_hook(tree->mapping->host, state->start, 356 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
342 state->end, state->state, bits);
343 }
344} 357}
345 358
346/* 359/*
@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
358 int bits) 371 int bits)
359{ 372{
360 struct rb_node *node; 373 struct rb_node *node;
374 int ret;
361 375
362 if (end < start) { 376 if (end < start) {
363 printk(KERN_ERR "btrfs end < start %llu %llu\n", 377 printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -365,11 +379,14 @@ static int insert_state(struct extent_io_tree *tree,
365 (unsigned long long)start); 379 (unsigned long long)start);
366 WARN_ON(1); 380 WARN_ON(1);
367 } 381 }
368 if (bits & EXTENT_DIRTY)
369 tree->dirty_bytes += end - start + 1;
370 state->start = start; 382 state->start = start;
371 state->end = end; 383 state->end = end;
372 set_state_cb(tree, state, bits); 384 ret = set_state_cb(tree, state, bits);
385 if (ret)
386 return ret;
387
388 if (bits & EXTENT_DIRTY)
389 tree->dirty_bytes += end - start + 1;
373 state->state |= bits; 390 state->state |= bits;
374 node = tree_insert(&tree->state, end, &state->rb_node); 391 node = tree_insert(&tree->state, end, &state->rb_node);
375 if (node) { 392 if (node) {
@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
387 return 0; 404 return 0;
388} 405}
389 406
407static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
408 u64 split)
409{
410 if (tree->ops && tree->ops->split_extent_hook)
411 return tree->ops->split_extent_hook(tree->mapping->host,
412 orig, split);
413 return 0;
414}
415
390/* 416/*
391 * split a given extent state struct in two, inserting the preallocated 417 * split a given extent state struct in two, inserting the preallocated
392 * struct 'prealloc' as the newly created second half. 'split' indicates an 418 * struct 'prealloc' as the newly created second half. 'split' indicates an
@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
405 struct extent_state *prealloc, u64 split) 431 struct extent_state *prealloc, u64 split)
406{ 432{
407 struct rb_node *node; 433 struct rb_node *node;
434
435 split_cb(tree, orig, split);
436
408 prealloc->start = orig->start; 437 prealloc->start = orig->start;
409 prealloc->end = split - 1; 438 prealloc->end = split - 1;
410 prealloc->state = orig->state; 439 prealloc->state = orig->state;
@@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
431 struct extent_state *state, int bits, int wake, 460 struct extent_state *state, int bits, int wake,
432 int delete) 461 int delete)
433{ 462{
434 int ret = state->state & bits; 463 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
464 int ret = state->state & bits_to_clear;
435 465
436 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 466 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
437 u64 range = state->end - state->start + 1; 467 u64 range = state->end - state->start + 1;
@@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
439 tree->dirty_bytes -= range; 469 tree->dirty_bytes -= range;
440 } 470 }
441 clear_state_cb(tree, state, bits); 471 clear_state_cb(tree, state, bits);
442 state->state &= ~bits; 472 state->state &= ~bits_to_clear;
443 if (wake) 473 if (wake)
444 wake_up(&state->wq); 474 wake_up(&state->wq);
445 if (delete || state->state == 0) { 475 if (delete || state->state == 0) {
@@ -542,8 +572,8 @@ hit_next:
542 if (err) 572 if (err)
543 goto out; 573 goto out;
544 if (state->end <= end) { 574 if (state->end <= end) {
545 set |= clear_state_bit(tree, state, bits, 575 set |= clear_state_bit(tree, state, bits, wake,
546 wake, delete); 576 delete);
547 if (last_end == (u64)-1) 577 if (last_end == (u64)-1)
548 goto out; 578 goto out;
549 start = last_end + 1; 579 start = last_end + 1;
@@ -561,12 +591,11 @@ hit_next:
561 prealloc = alloc_extent_state(GFP_ATOMIC); 591 prealloc = alloc_extent_state(GFP_ATOMIC);
562 err = split_state(tree, state, prealloc, end + 1); 592 err = split_state(tree, state, prealloc, end + 1);
563 BUG_ON(err == -EEXIST); 593 BUG_ON(err == -EEXIST);
564
565 if (wake) 594 if (wake)
566 wake_up(&state->wq); 595 wake_up(&state->wq);
567 596
568 set |= clear_state_bit(tree, prealloc, bits, 597 set |= clear_state_bit(tree, prealloc, bits, wake, delete);
569 wake, delete); 598
570 prealloc = NULL; 599 prealloc = NULL;
571 goto out; 600 goto out;
572 } 601 }
@@ -667,16 +696,23 @@ out:
667 return 0; 696 return 0;
668} 697}
669 698
670static void set_state_bits(struct extent_io_tree *tree, 699static int set_state_bits(struct extent_io_tree *tree,
671 struct extent_state *state, 700 struct extent_state *state,
672 int bits) 701 int bits)
673{ 702{
703 int ret;
704
705 ret = set_state_cb(tree, state, bits);
706 if (ret)
707 return ret;
708
674 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 709 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
675 u64 range = state->end - state->start + 1; 710 u64 range = state->end - state->start + 1;
676 tree->dirty_bytes += range; 711 tree->dirty_bytes += range;
677 } 712 }
678 set_state_cb(tree, state, bits);
679 state->state |= bits; 713 state->state |= bits;
714
715 return 0;
680} 716}
681 717
682static void cache_state(struct extent_state *state, 718static void cache_state(struct extent_state *state,
@@ -758,7 +794,10 @@ hit_next:
758 goto out; 794 goto out;
759 } 795 }
760 796
761 set_state_bits(tree, state, bits); 797 err = set_state_bits(tree, state, bits);
798 if (err)
799 goto out;
800
762 cache_state(state, cached_state); 801 cache_state(state, cached_state);
763 merge_state(tree, state); 802 merge_state(tree, state);
764 if (last_end == (u64)-1) 803 if (last_end == (u64)-1)
@@ -805,7 +844,9 @@ hit_next:
805 if (err) 844 if (err)
806 goto out; 845 goto out;
807 if (state->end <= end) { 846 if (state->end <= end) {
808 set_state_bits(tree, state, bits); 847 err = set_state_bits(tree, state, bits);
848 if (err)
849 goto out;
809 cache_state(state, cached_state); 850 cache_state(state, cached_state);
810 merge_state(tree, state); 851 merge_state(tree, state);
811 if (last_end == (u64)-1) 852 if (last_end == (u64)-1)
@@ -829,11 +870,13 @@ hit_next:
829 this_end = last_start - 1; 870 this_end = last_start - 1;
830 err = insert_state(tree, prealloc, start, this_end, 871 err = insert_state(tree, prealloc, start, this_end,
831 bits); 872 bits);
832 cache_state(prealloc, cached_state);
833 prealloc = NULL;
834 BUG_ON(err == -EEXIST); 873 BUG_ON(err == -EEXIST);
835 if (err) 874 if (err) {
875 prealloc = NULL;
836 goto out; 876 goto out;
877 }
878 cache_state(prealloc, cached_state);
879 prealloc = NULL;
837 start = this_end + 1; 880 start = this_end + 1;
838 goto search_again; 881 goto search_again;
839 } 882 }
@@ -852,7 +895,11 @@ hit_next:
852 err = split_state(tree, state, prealloc, end + 1); 895 err = split_state(tree, state, prealloc, end + 1);
853 BUG_ON(err == -EEXIST); 896 BUG_ON(err == -EEXIST);
854 897
855 set_state_bits(tree, prealloc, bits); 898 err = set_state_bits(tree, prealloc, bits);
899 if (err) {
900 prealloc = NULL;
901 goto out;
902 }
856 cache_state(prealloc, cached_state); 903 cache_state(prealloc, cached_state);
857 merge_state(tree, prealloc); 904 merge_state(tree, prealloc);
858 prealloc = NULL; 905 prealloc = NULL;
@@ -910,7 +957,8 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
910 gfp_t mask) 957 gfp_t mask)
911{ 958{
912 return clear_extent_bit(tree, start, end, 959 return clear_extent_bit(tree, start, end,
913 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, 960 EXTENT_DIRTY | EXTENT_DELALLOC |
961 EXTENT_DO_ACCOUNTING, 0, 0,
914 NULL, mask); 962 NULL, mask);
915} 963}
916 964
@@ -1355,12 +1403,7 @@ out_failed:
1355int extent_clear_unlock_delalloc(struct inode *inode, 1403int extent_clear_unlock_delalloc(struct inode *inode,
1356 struct extent_io_tree *tree, 1404 struct extent_io_tree *tree,
1357 u64 start, u64 end, struct page *locked_page, 1405 u64 start, u64 end, struct page *locked_page,
1358 int unlock_pages, 1406 unsigned long op)
1359 int clear_unlock,
1360 int clear_delalloc, int clear_dirty,
1361 int set_writeback,
1362 int end_writeback,
1363 int set_private2)
1364{ 1407{
1365 int ret; 1408 int ret;
1366 struct page *pages[16]; 1409 struct page *pages[16];
@@ -1370,17 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1370 int i; 1413 int i;
1371 int clear_bits = 0; 1414 int clear_bits = 0;
1372 1415
1373 if (clear_unlock) 1416 if (op & EXTENT_CLEAR_UNLOCK)
1374 clear_bits |= EXTENT_LOCKED; 1417 clear_bits |= EXTENT_LOCKED;
1375 if (clear_dirty) 1418 if (op & EXTENT_CLEAR_DIRTY)
1376 clear_bits |= EXTENT_DIRTY; 1419 clear_bits |= EXTENT_DIRTY;
1377 1420
1378 if (clear_delalloc) 1421 if (op & EXTENT_CLEAR_DELALLOC)
1379 clear_bits |= EXTENT_DELALLOC; 1422 clear_bits |= EXTENT_DELALLOC;
1380 1423
1424 if (op & EXTENT_CLEAR_ACCOUNTING)
1425 clear_bits |= EXTENT_DO_ACCOUNTING;
1426
1381 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1427 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1382 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback || 1428 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1383 set_private2)) 1429 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1430 EXTENT_SET_PRIVATE2)))
1384 return 0; 1431 return 0;
1385 1432
1386 while (nr_pages > 0) { 1433 while (nr_pages > 0) {
@@ -1389,20 +1436,20 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1389 nr_pages, ARRAY_SIZE(pages)), pages); 1436 nr_pages, ARRAY_SIZE(pages)), pages);
1390 for (i = 0; i < ret; i++) { 1437 for (i = 0; i < ret; i++) {
1391 1438
1392 if (set_private2) 1439 if (op & EXTENT_SET_PRIVATE2)
1393 SetPagePrivate2(pages[i]); 1440 SetPagePrivate2(pages[i]);
1394 1441
1395 if (pages[i] == locked_page) { 1442 if (pages[i] == locked_page) {
1396 page_cache_release(pages[i]); 1443 page_cache_release(pages[i]);
1397 continue; 1444 continue;
1398 } 1445 }
1399 if (clear_dirty) 1446 if (op & EXTENT_CLEAR_DIRTY)
1400 clear_page_dirty_for_io(pages[i]); 1447 clear_page_dirty_for_io(pages[i]);
1401 if (set_writeback) 1448 if (op & EXTENT_SET_WRITEBACK)
1402 set_page_writeback(pages[i]); 1449 set_page_writeback(pages[i]);
1403 if (end_writeback) 1450 if (op & EXTENT_END_WRITEBACK)
1404 end_page_writeback(pages[i]); 1451 end_page_writeback(pages[i]);
1405 if (unlock_pages) 1452 if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1406 unlock_page(pages[i]); 1453 unlock_page(pages[i]);
1407 page_cache_release(pages[i]); 1454 page_cache_release(pages[i]);
1408 } 1455 }
@@ -2668,7 +2715,8 @@ int extent_invalidatepage(struct extent_io_tree *tree,
2668 lock_extent(tree, start, end, GFP_NOFS); 2715 lock_extent(tree, start, end, GFP_NOFS);
2669 wait_on_page_writeback(page); 2716 wait_on_page_writeback(page);
2670 clear_extent_bit(tree, start, end, 2717 clear_extent_bit(tree, start, end,
2671 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 2718 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2719 EXTENT_DO_ACCOUNTING,
2672 1, 1, NULL, GFP_NOFS); 2720 1, 1, NULL, GFP_NOFS);
2673 return 0; 2721 return 0;
2674} 2722}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 14ed16fd862d..36de250a7b2b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -15,6 +15,7 @@
15#define EXTENT_BUFFER_FILLED (1 << 8) 15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11)
18#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
19 20
20/* flags for bio submission */ 21/* flags for bio submission */
@@ -25,6 +26,16 @@
25#define EXTENT_BUFFER_BLOCKING 1 26#define EXTENT_BUFFER_BLOCKING 1
26#define EXTENT_BUFFER_DIRTY 2 27#define EXTENT_BUFFER_DIRTY 2
27 28
29/* these are flags for extent_clear_unlock_delalloc */
30#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
31#define EXTENT_CLEAR_UNLOCK 0x2
32#define EXTENT_CLEAR_DELALLOC 0x4
33#define EXTENT_CLEAR_DIRTY 0x8
34#define EXTENT_SET_WRITEBACK 0x10
35#define EXTENT_END_WRITEBACK 0x20
36#define EXTENT_SET_PRIVATE2 0x40
37#define EXTENT_CLEAR_ACCOUNTING 0x80
38
28/* 39/*
29 * page->private values. Every page that is controlled by the extent 40 * page->private values. Every page that is controlled by the extent
30 * map has page->private set to one. 41 * map has page->private set to one.
@@ -60,8 +71,13 @@ struct extent_io_ops {
60 struct extent_state *state, int uptodate); 71 struct extent_state *state, int uptodate);
61 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 72 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
62 unsigned long old, unsigned long bits); 73 unsigned long old, unsigned long bits);
63 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, 74 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
64 unsigned long old, unsigned long bits); 75 unsigned long bits);
76 int (*merge_extent_hook)(struct inode *inode,
77 struct extent_state *new,
78 struct extent_state *other);
79 int (*split_extent_hook)(struct inode *inode,
80 struct extent_state *orig, u64 split);
65 int (*write_cache_pages_lock_hook)(struct page *page); 81 int (*write_cache_pages_lock_hook)(struct page *page);
66}; 82};
67 83
@@ -79,10 +95,14 @@ struct extent_state {
79 u64 start; 95 u64 start;
80 u64 end; /* inclusive */ 96 u64 end; /* inclusive */
81 struct rb_node rb_node; 97 struct rb_node rb_node;
98
99 /* ADD NEW ELEMENTS AFTER THIS */
82 struct extent_io_tree *tree; 100 struct extent_io_tree *tree;
83 wait_queue_head_t wq; 101 wait_queue_head_t wq;
84 atomic_t refs; 102 atomic_t refs;
85 unsigned long state; 103 unsigned long state;
104 u64 split_start;
105 u64 split_end;
86 106
87 /* for use by the FS */ 107 /* for use by the FS */
88 u64 private; 108 u64 private;
@@ -279,10 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree,
279int extent_clear_unlock_delalloc(struct inode *inode, 299int extent_clear_unlock_delalloc(struct inode *inode,
280 struct extent_io_tree *tree, 300 struct extent_io_tree *tree,
281 u64 start, u64 end, struct page *locked_page, 301 u64 start, u64 end, struct page *locked_page,
282 int unlock_page, 302 unsigned long op);
283 int clear_unlock,
284 int clear_delalloc, int clear_dirty,
285 int set_writeback,
286 int end_writeback,
287 int set_private2);
288#endif 303#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a3492a3ad96b..2d623aa0625f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -123,7 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
124 124
125 end_of_last_block = start_pos + num_bytes - 1; 125 end_of_last_block = start_pos + num_bytes - 1;
126 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); 126 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
127 if (err)
128 return err;
129
127 for (i = 0; i < num_pages; i++) { 130 for (i = 0; i < num_pages; i++) {
128 struct page *p = pages[i]; 131 struct page *p = pages[i];
129 SetPageUptodate(p); 132 SetPageUptodate(p);
@@ -875,7 +878,8 @@ again:
875 btrfs_put_ordered_extent(ordered); 878 btrfs_put_ordered_extent(ordered);
876 879
877 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, 880 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
878 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, 881 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
882 EXTENT_DO_ACCOUNTING,
879 GFP_NOFS); 883 GFP_NOFS);
880 unlock_extent(&BTRFS_I(inode)->io_tree, 884 unlock_extent(&BTRFS_I(inode)->io_tree,
881 start_pos, last_pos - 1, GFP_NOFS); 885 start_pos, last_pos - 1, GFP_NOFS);
@@ -917,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
917 start_pos = pos; 921 start_pos = pos;
918 922
919 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 923 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
924
925 /* do the reserve before the mutex lock in case we have to do some
926 * flushing. We wouldn't deadlock, but this is more polite.
927 */
928 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
929 if (err)
930 goto out_nolock;
931
932 mutex_lock(&inode->i_mutex);
933
920 current->backing_dev_info = inode->i_mapping->backing_dev_info; 934 current->backing_dev_info = inode->i_mapping->backing_dev_info;
921 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 935 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
922 if (err) 936 if (err)
923 goto out_nolock; 937 goto out;
938
924 if (count == 0) 939 if (count == 0)
925 goto out_nolock; 940 goto out;
926 941
927 err = file_remove_suid(file); 942 err = file_remove_suid(file);
928 if (err) 943 if (err)
929 goto out_nolock; 944 goto out;
945
930 file_update_time(file); 946 file_update_time(file);
931 947
932 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 948 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
933 949
934 mutex_lock(&inode->i_mutex); 950 /* generic_write_checks can change our pos */
951 start_pos = pos;
952
935 BTRFS_I(inode)->sequence++; 953 BTRFS_I(inode)->sequence++;
936 first_index = pos >> PAGE_CACHE_SHIFT; 954 first_index = pos >> PAGE_CACHE_SHIFT;
937 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 955 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@ -1005,9 +1023,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1005 } 1023 }
1006 1024
1007 if (will_write) { 1025 if (will_write) {
1008 btrfs_fdatawrite_range(inode->i_mapping, pos, 1026 filemap_fdatawrite_range(inode->i_mapping, pos,
1009 pos + write_bytes - 1, 1027 pos + write_bytes - 1);
1010 WB_SYNC_ALL);
1011 } else { 1028 } else {
1012 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1029 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1013 num_pages); 1030 num_pages);
@@ -1028,6 +1045,7 @@ out:
1028 mutex_unlock(&inode->i_mutex); 1045 mutex_unlock(&inode->i_mutex);
1029 if (ret) 1046 if (ret)
1030 err = ret; 1047 err = ret;
1048 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1031 1049
1032out_nolock: 1050out_nolock:
1033 kfree(pages); 1051 kfree(pages);
@@ -1196,7 +1214,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1196 return 0; 1214 return 0;
1197} 1215}
1198 1216
1199struct file_operations btrfs_file_operations = { 1217const struct file_operations btrfs_file_operations = {
1200 .llseek = generic_file_llseek, 1218 .llseek = generic_file_llseek,
1201 .read = do_sync_read, 1219 .read = do_sync_read,
1202 .aio_read = generic_file_aio_read, 1220 .aio_read = generic_file_aio_read,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e9b76bcd1c12..9e138b793dc7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -62,7 +62,7 @@ static const struct inode_operations btrfs_special_inode_operations;
62static const struct inode_operations btrfs_file_inode_operations; 62static const struct inode_operations btrfs_file_inode_operations;
63static const struct address_space_operations btrfs_aops; 63static const struct address_space_operations btrfs_aops;
64static const struct address_space_operations btrfs_symlink_aops; 64static const struct address_space_operations btrfs_symlink_aops;
65static struct file_operations btrfs_dir_file_operations; 65static const struct file_operations btrfs_dir_file_operations;
66static struct extent_io_ops btrfs_extent_io_ops; 66static struct extent_io_ops btrfs_extent_io_ops;
67 67
68static struct kmem_cache *btrfs_inode_cachep; 68static struct kmem_cache *btrfs_inode_cachep;
@@ -424,9 +424,12 @@ again:
424 * and free up our temp pages. 424 * and free up our temp pages.
425 */ 425 */
426 extent_clear_unlock_delalloc(inode, 426 extent_clear_unlock_delalloc(inode,
427 &BTRFS_I(inode)->io_tree, 427 &BTRFS_I(inode)->io_tree,
428 start, end, NULL, 1, 0, 428 start, end, NULL,
429 0, 1, 1, 1, 0); 429 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
430 EXTENT_CLEAR_DELALLOC |
431 EXTENT_CLEAR_ACCOUNTING |
432 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
430 ret = 0; 433 ret = 0;
431 goto free_pages_out; 434 goto free_pages_out;
432 } 435 }
@@ -637,11 +640,14 @@ static noinline int submit_compressed_extents(struct inode *inode,
637 * clear dirty, set writeback and unlock the pages. 640 * clear dirty, set writeback and unlock the pages.
638 */ 641 */
639 extent_clear_unlock_delalloc(inode, 642 extent_clear_unlock_delalloc(inode,
640 &BTRFS_I(inode)->io_tree, 643 &BTRFS_I(inode)->io_tree,
641 async_extent->start, 644 async_extent->start,
642 async_extent->start + 645 async_extent->start +
643 async_extent->ram_size - 1, 646 async_extent->ram_size - 1,
644 NULL, 1, 1, 0, 1, 1, 0, 0); 647 NULL, EXTENT_CLEAR_UNLOCK_PAGE |
648 EXTENT_CLEAR_UNLOCK |
649 EXTENT_CLEAR_DELALLOC |
650 EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
645 651
646 ret = btrfs_submit_compressed_write(inode, 652 ret = btrfs_submit_compressed_write(inode,
647 async_extent->start, 653 async_extent->start,
@@ -712,9 +718,15 @@ static noinline int cow_file_range(struct inode *inode,
712 start, end, 0, NULL); 718 start, end, 0, NULL);
713 if (ret == 0) { 719 if (ret == 0) {
714 extent_clear_unlock_delalloc(inode, 720 extent_clear_unlock_delalloc(inode,
715 &BTRFS_I(inode)->io_tree, 721 &BTRFS_I(inode)->io_tree,
716 start, end, NULL, 1, 1, 722 start, end, NULL,
717 1, 1, 1, 1, 0); 723 EXTENT_CLEAR_UNLOCK_PAGE |
724 EXTENT_CLEAR_UNLOCK |
725 EXTENT_CLEAR_DELALLOC |
726 EXTENT_CLEAR_ACCOUNTING |
727 EXTENT_CLEAR_DIRTY |
728 EXTENT_SET_WRITEBACK |
729 EXTENT_END_WRITEBACK);
718 *nr_written = *nr_written + 730 *nr_written = *nr_written +
719 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; 731 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
720 *page_started = 1; 732 *page_started = 1;
@@ -738,6 +750,8 @@ static noinline int cow_file_range(struct inode *inode,
738 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 750 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
739 751
740 while (disk_num_bytes > 0) { 752 while (disk_num_bytes > 0) {
753 unsigned long op;
754
741 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 755 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
742 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 756 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
743 root->sectorsize, 0, alloc_hint, 757 root->sectorsize, 0, alloc_hint,
@@ -789,10 +803,13 @@ static noinline int cow_file_range(struct inode *inode,
789 * Do set the Private2 bit so we know this page was properly 803 * Do set the Private2 bit so we know this page was properly
790 * setup for writepage 804 * setup for writepage
791 */ 805 */
806 op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
807 op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
808 EXTENT_SET_PRIVATE2;
809
792 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 810 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
793 start, start + ram_size - 1, 811 start, start + ram_size - 1,
794 locked_page, unlock, 1, 812 locked_page, op);
795 1, 0, 0, 0, 1);
796 disk_num_bytes -= cur_alloc_size; 813 disk_num_bytes -= cur_alloc_size;
797 num_bytes -= cur_alloc_size; 814 num_bytes -= cur_alloc_size;
798 alloc_hint = ins.objectid + ins.offset; 815 alloc_hint = ins.objectid + ins.offset;
@@ -864,8 +881,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
864 u64 cur_end; 881 u64 cur_end;
865 int limit = 10 * 1024 * 1042; 882 int limit = 10 * 1024 * 1042;
866 883
867 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | 884 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
868 EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS); 885 1, 0, NULL, GFP_NOFS);
869 while (start < end) { 886 while (start < end) {
870 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); 887 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
871 async_cow->inode = inode; 888 async_cow->inode = inode;
@@ -1006,6 +1023,7 @@ next_slot:
1006 1023
1007 if (found_key.offset > cur_offset) { 1024 if (found_key.offset > cur_offset) {
1008 extent_end = found_key.offset; 1025 extent_end = found_key.offset;
1026 extent_type = 0;
1009 goto out_check; 1027 goto out_check;
1010 } 1028 }
1011 1029
@@ -1112,8 +1130,10 @@ out_check:
1112 BUG_ON(ret); 1130 BUG_ON(ret);
1113 1131
1114 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1132 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1115 cur_offset, cur_offset + num_bytes - 1, 1133 cur_offset, cur_offset + num_bytes - 1,
1116 locked_page, 1, 1, 1, 0, 0, 0, 1); 1134 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
1135 EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
1136 EXTENT_SET_PRIVATE2);
1117 cur_offset = extent_end; 1137 cur_offset = extent_end;
1118 if (cur_offset > end) 1138 if (cur_offset > end)
1119 break; 1139 break;
@@ -1159,6 +1179,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1159 return ret; 1179 return ret;
1160} 1180}
1161 1181
1182static int btrfs_split_extent_hook(struct inode *inode,
1183 struct extent_state *orig, u64 split)
1184{
1185 struct btrfs_root *root = BTRFS_I(inode)->root;
1186 u64 size;
1187
1188 if (!(orig->state & EXTENT_DELALLOC))
1189 return 0;
1190
1191 size = orig->end - orig->start + 1;
1192 if (size > root->fs_info->max_extent) {
1193 u64 num_extents;
1194 u64 new_size;
1195
1196 new_size = orig->end - split + 1;
1197 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1198 root->fs_info->max_extent);
1199
1200 /*
1201 * if we break a large extent up then leave oustanding_extents
1202 * be, since we've already accounted for the large extent.
1203 */
1204 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1205 root->fs_info->max_extent) < num_extents)
1206 return 0;
1207 }
1208
1209 spin_lock(&BTRFS_I(inode)->accounting_lock);
1210 BTRFS_I(inode)->outstanding_extents++;
1211 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1212
1213 return 0;
1214}
1215
1216/*
1217 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1218 * extents so we can keep track of new extents that are just merged onto old
1219 * extents, such as when we are doing sequential writes, so we can properly
1220 * account for the metadata space we'll need.
1221 */
1222static int btrfs_merge_extent_hook(struct inode *inode,
1223 struct extent_state *new,
1224 struct extent_state *other)
1225{
1226 struct btrfs_root *root = BTRFS_I(inode)->root;
1227 u64 new_size, old_size;
1228 u64 num_extents;
1229
1230 /* not delalloc, ignore it */
1231 if (!(other->state & EXTENT_DELALLOC))
1232 return 0;
1233
1234 old_size = other->end - other->start + 1;
1235 if (new->start < other->start)
1236 new_size = other->end - new->start + 1;
1237 else
1238 new_size = new->end - other->start + 1;
1239
1240 /* we're not bigger than the max, unreserve the space and go */
1241 if (new_size <= root->fs_info->max_extent) {
1242 spin_lock(&BTRFS_I(inode)->accounting_lock);
1243 BTRFS_I(inode)->outstanding_extents--;
1244 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1245 return 0;
1246 }
1247
1248 /*
1249 * If we grew by another max_extent, just return, we want to keep that
1250 * reserved amount.
1251 */
1252 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1253 root->fs_info->max_extent);
1254 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1255 root->fs_info->max_extent) > num_extents)
1256 return 0;
1257
1258 spin_lock(&BTRFS_I(inode)->accounting_lock);
1259 BTRFS_I(inode)->outstanding_extents--;
1260 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1261
1262 return 0;
1263}
1264
1162/* 1265/*
1163 * extent_io.c set_bit_hook, used to track delayed allocation 1266 * extent_io.c set_bit_hook, used to track delayed allocation
1164 * bytes in this file, and to maintain the list of inodes that 1267 * bytes in this file, and to maintain the list of inodes that
@@ -1167,6 +1270,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1167static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1270static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1168 unsigned long old, unsigned long bits) 1271 unsigned long old, unsigned long bits)
1169{ 1272{
1273
1170 /* 1274 /*
1171 * set_bit and clear bit hooks normally require _irqsave/restore 1275 * set_bit and clear bit hooks normally require _irqsave/restore
1172 * but in this case, we are only testeing for the DELALLOC 1276 * but in this case, we are only testeing for the DELALLOC
@@ -1174,6 +1278,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1174 */ 1278 */
1175 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1279 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1176 struct btrfs_root *root = BTRFS_I(inode)->root; 1280 struct btrfs_root *root = BTRFS_I(inode)->root;
1281
1282 spin_lock(&BTRFS_I(inode)->accounting_lock);
1283 BTRFS_I(inode)->outstanding_extents++;
1284 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1177 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1285 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1178 spin_lock(&root->fs_info->delalloc_lock); 1286 spin_lock(&root->fs_info->delalloc_lock);
1179 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1287 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1190,22 +1298,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1190/* 1298/*
1191 * extent_io.c clear_bit_hook, see set_bit_hook for why 1299 * extent_io.c clear_bit_hook, see set_bit_hook for why
1192 */ 1300 */
1193static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, 1301static int btrfs_clear_bit_hook(struct inode *inode,
1194 unsigned long old, unsigned long bits) 1302 struct extent_state *state, unsigned long bits)
1195{ 1303{
1196 /* 1304 /*
1197 * set_bit and clear bit hooks normally require _irqsave/restore 1305 * set_bit and clear bit hooks normally require _irqsave/restore
1198 * but in this case, we are only testeing for the DELALLOC 1306 * but in this case, we are only testeing for the DELALLOC
1199 * bit, which is only set or cleared with irqs on 1307 * bit, which is only set or cleared with irqs on
1200 */ 1308 */
1201 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1309 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1202 struct btrfs_root *root = BTRFS_I(inode)->root; 1310 struct btrfs_root *root = BTRFS_I(inode)->root;
1203 1311
1312 if (bits & EXTENT_DO_ACCOUNTING) {
1313 spin_lock(&BTRFS_I(inode)->accounting_lock);
1314 BTRFS_I(inode)->outstanding_extents--;
1315 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1316 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1317 }
1318
1204 spin_lock(&root->fs_info->delalloc_lock); 1319 spin_lock(&root->fs_info->delalloc_lock);
1205 if (end - start + 1 > root->fs_info->delalloc_bytes) { 1320 if (state->end - state->start + 1 >
1321 root->fs_info->delalloc_bytes) {
1206 printk(KERN_INFO "btrfs warning: delalloc account " 1322 printk(KERN_INFO "btrfs warning: delalloc account "
1207 "%llu %llu\n", 1323 "%llu %llu\n",
1208 (unsigned long long)end - start + 1, 1324 (unsigned long long)
1325 state->end - state->start + 1,
1209 (unsigned long long) 1326 (unsigned long long)
1210 root->fs_info->delalloc_bytes); 1327 root->fs_info->delalloc_bytes);
1211 btrfs_delalloc_free_space(root, inode, (u64)-1); 1328 btrfs_delalloc_free_space(root, inode, (u64)-1);
@@ -1213,9 +1330,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1213 BTRFS_I(inode)->delalloc_bytes = 0; 1330 BTRFS_I(inode)->delalloc_bytes = 0;
1214 } else { 1331 } else {
1215 btrfs_delalloc_free_space(root, inode, 1332 btrfs_delalloc_free_space(root, inode,
1216 end - start + 1); 1333 state->end -
1217 root->fs_info->delalloc_bytes -= end - start + 1; 1334 state->start + 1);
1218 BTRFS_I(inode)->delalloc_bytes -= end - start + 1; 1335 root->fs_info->delalloc_bytes -= state->end -
1336 state->start + 1;
1337 BTRFS_I(inode)->delalloc_bytes -= state->end -
1338 state->start + 1;
1219 } 1339 }
1220 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1340 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1221 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
@@ -2950,7 +3070,12 @@ again:
2950 goto again; 3070 goto again;
2951 } 3071 }
2952 3072
2953 btrfs_set_extent_delalloc(inode, page_start, page_end); 3073 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
3074 if (ret) {
3075 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3076 goto out_unlock;
3077 }
3078
2954 ret = 0; 3079 ret = 0;
2955 if (offset != PAGE_CACHE_SIZE) { 3080 if (offset != PAGE_CACHE_SIZE) {
2956 kaddr = kmap(page); 3081 kaddr = kmap(page);
@@ -2981,15 +3106,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2981 u64 last_byte; 3106 u64 last_byte;
2982 u64 cur_offset; 3107 u64 cur_offset;
2983 u64 hole_size; 3108 u64 hole_size;
2984 int err; 3109 int err = 0;
2985 3110
2986 if (size <= hole_start) 3111 if (size <= hole_start)
2987 return 0; 3112 return 0;
2988 3113
2989 err = btrfs_check_metadata_free_space(root);
2990 if (err)
2991 return err;
2992
2993 btrfs_truncate_page(inode->i_mapping, inode->i_size); 3114 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2994 3115
2995 while (1) { 3116 while (1) {
@@ -3024,12 +3145,18 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3024 cur_offset, &hint_byte, 1); 3145 cur_offset, &hint_byte, 1);
3025 if (err) 3146 if (err)
3026 break; 3147 break;
3148
3149 err = btrfs_reserve_metadata_space(root, 1);
3150 if (err)
3151 break;
3152
3027 err = btrfs_insert_file_extent(trans, root, 3153 err = btrfs_insert_file_extent(trans, root,
3028 inode->i_ino, cur_offset, 0, 3154 inode->i_ino, cur_offset, 0,
3029 0, hole_size, 0, hole_size, 3155 0, hole_size, 0, hole_size,
3030 0, 0, 0); 3156 0, 0, 0);
3031 btrfs_drop_extent_cache(inode, hole_start, 3157 btrfs_drop_extent_cache(inode, hole_start,
3032 last_byte - 1, 0); 3158 last_byte - 1, 0);
3159 btrfs_unreserve_metadata_space(root, 1);
3033 } 3160 }
3034 free_extent_map(em); 3161 free_extent_map(em);
3035 cur_offset = last_byte; 3162 cur_offset = last_byte;
@@ -3503,12 +3630,14 @@ static int btrfs_dentry_delete(struct dentry *dentry)
3503{ 3630{
3504 struct btrfs_root *root; 3631 struct btrfs_root *root;
3505 3632
3506 if (!dentry->d_inode) 3633 if (!dentry->d_inode && !IS_ROOT(dentry))
3507 return 0; 3634 dentry = dentry->d_parent;
3508 3635
3509 root = BTRFS_I(dentry->d_inode)->root; 3636 if (dentry->d_inode) {
3510 if (btrfs_root_refs(&root->root_item) == 0) 3637 root = BTRFS_I(dentry->d_inode)->root;
3511 return 1; 3638 if (btrfs_root_refs(&root->root_item) == 0)
3639 return 1;
3640 }
3512 return 0; 3641 return 0;
3513} 3642}
3514 3643
@@ -3990,11 +4119,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3990 if (!new_valid_dev(rdev)) 4119 if (!new_valid_dev(rdev))
3991 return -EINVAL; 4120 return -EINVAL;
3992 4121
3993 err = btrfs_check_metadata_free_space(root); 4122 /*
4123 * 2 for inode item and ref
4124 * 2 for dir items
4125 * 1 for xattr if selinux is on
4126 */
4127 err = btrfs_reserve_metadata_space(root, 5);
3994 if (err) 4128 if (err)
3995 goto fail; 4129 return err;
3996 4130
3997 trans = btrfs_start_transaction(root, 1); 4131 trans = btrfs_start_transaction(root, 1);
4132 if (!trans)
4133 goto fail;
3998 btrfs_set_trans_block_group(trans, dir); 4134 btrfs_set_trans_block_group(trans, dir);
3999 4135
4000 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4136 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -4032,6 +4168,7 @@ out_unlock:
4032 nr = trans->blocks_used; 4168 nr = trans->blocks_used;
4033 btrfs_end_transaction_throttle(trans, root); 4169 btrfs_end_transaction_throttle(trans, root);
4034fail: 4170fail:
4171 btrfs_unreserve_metadata_space(root, 5);
4035 if (drop_inode) { 4172 if (drop_inode) {
4036 inode_dec_link_count(inode); 4173 inode_dec_link_count(inode);
4037 iput(inode); 4174 iput(inode);
@@ -4052,10 +4189,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4052 u64 objectid; 4189 u64 objectid;
4053 u64 index = 0; 4190 u64 index = 0;
4054 4191
4055 err = btrfs_check_metadata_free_space(root); 4192 /*
4193 * 2 for inode item and ref
4194 * 2 for dir items
4195 * 1 for xattr if selinux is on
4196 */
4197 err = btrfs_reserve_metadata_space(root, 5);
4056 if (err) 4198 if (err)
4057 goto fail; 4199 return err;
4200
4058 trans = btrfs_start_transaction(root, 1); 4201 trans = btrfs_start_transaction(root, 1);
4202 if (!trans)
4203 goto fail;
4059 btrfs_set_trans_block_group(trans, dir); 4204 btrfs_set_trans_block_group(trans, dir);
4060 4205
4061 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4206 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -4096,6 +4241,7 @@ out_unlock:
4096 nr = trans->blocks_used; 4241 nr = trans->blocks_used;
4097 btrfs_end_transaction_throttle(trans, root); 4242 btrfs_end_transaction_throttle(trans, root);
4098fail: 4243fail:
4244 btrfs_unreserve_metadata_space(root, 5);
4099 if (drop_inode) { 4245 if (drop_inode) {
4100 inode_dec_link_count(inode); 4246 inode_dec_link_count(inode);
4101 iput(inode); 4247 iput(inode);
@@ -4118,10 +4264,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4118 if (inode->i_nlink == 0) 4264 if (inode->i_nlink == 0)
4119 return -ENOENT; 4265 return -ENOENT;
4120 4266
4121 btrfs_inc_nlink(inode); 4267 /*
4122 err = btrfs_check_metadata_free_space(root); 4268 * 1 item for inode ref
4269 * 2 items for dir items
4270 */
4271 err = btrfs_reserve_metadata_space(root, 3);
4123 if (err) 4272 if (err)
4124 goto fail; 4273 return err;
4274
4275 btrfs_inc_nlink(inode);
4276
4125 err = btrfs_set_inode_index(dir, &index); 4277 err = btrfs_set_inode_index(dir, &index);
4126 if (err) 4278 if (err)
4127 goto fail; 4279 goto fail;
@@ -4145,6 +4297,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4145 nr = trans->blocks_used; 4297 nr = trans->blocks_used;
4146 btrfs_end_transaction_throttle(trans, root); 4298 btrfs_end_transaction_throttle(trans, root);
4147fail: 4299fail:
4300 btrfs_unreserve_metadata_space(root, 3);
4148 if (drop_inode) { 4301 if (drop_inode) {
4149 inode_dec_link_count(inode); 4302 inode_dec_link_count(inode);
4150 iput(inode); 4303 iput(inode);
@@ -4164,17 +4317,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4164 u64 index = 0; 4317 u64 index = 0;
4165 unsigned long nr = 1; 4318 unsigned long nr = 1;
4166 4319
4167 err = btrfs_check_metadata_free_space(root); 4320 /*
4321 * 2 items for inode and ref
4322 * 2 items for dir items
4323 * 1 for xattr if selinux is on
4324 */
4325 err = btrfs_reserve_metadata_space(root, 5);
4168 if (err) 4326 if (err)
4169 goto out_unlock; 4327 return err;
4170 4328
4171 trans = btrfs_start_transaction(root, 1); 4329 trans = btrfs_start_transaction(root, 1);
4172 btrfs_set_trans_block_group(trans, dir); 4330 if (!trans) {
4173 4331 err = -ENOMEM;
4174 if (IS_ERR(trans)) {
4175 err = PTR_ERR(trans);
4176 goto out_unlock; 4332 goto out_unlock;
4177 } 4333 }
4334 btrfs_set_trans_block_group(trans, dir);
4178 4335
4179 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4336 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4180 if (err) { 4337 if (err) {
@@ -4223,6 +4380,7 @@ out_fail:
4223 btrfs_end_transaction_throttle(trans, root); 4380 btrfs_end_transaction_throttle(trans, root);
4224 4381
4225out_unlock: 4382out_unlock:
4383 btrfs_unreserve_metadata_space(root, 5);
4226 if (drop_on_err) 4384 if (drop_on_err)
4227 iput(inode); 4385 iput(inode);
4228 btrfs_btree_balance_dirty(root, nr); 4386 btrfs_btree_balance_dirty(root, nr);
@@ -4684,7 +4842,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4684 */ 4842 */
4685 clear_extent_bit(tree, page_start, page_end, 4843 clear_extent_bit(tree, page_start, page_end,
4686 EXTENT_DIRTY | EXTENT_DELALLOC | 4844 EXTENT_DIRTY | EXTENT_DELALLOC |
4687 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 4845 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
4846 NULL, GFP_NOFS);
4688 /* 4847 /*
4689 * whoever cleared the private bit is responsible 4848 * whoever cleared the private bit is responsible
4690 * for the finish_ordered_io 4849 * for the finish_ordered_io
@@ -4697,8 +4856,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4697 lock_extent(tree, page_start, page_end, GFP_NOFS); 4856 lock_extent(tree, page_start, page_end, GFP_NOFS);
4698 } 4857 }
4699 clear_extent_bit(tree, page_start, page_end, 4858 clear_extent_bit(tree, page_start, page_end,
4700 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 4859 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4701 1, 1, NULL, GFP_NOFS); 4860 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
4702 __btrfs_releasepage(page, GFP_NOFS); 4861 __btrfs_releasepage(page, GFP_NOFS);
4703 4862
4704 ClearPageChecked(page); 4863 ClearPageChecked(page);
@@ -4747,6 +4906,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4747 goto out; 4906 goto out;
4748 } 4907 }
4749 4908
4909 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
4910 if (ret) {
4911 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4912 ret = VM_FAULT_SIGBUS;
4913 goto out;
4914 }
4915
4750 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 4916 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
4751again: 4917again:
4752 lock_page(page); 4918 lock_page(page);
@@ -4778,7 +4944,24 @@ again:
4778 goto again; 4944 goto again;
4779 } 4945 }
4780 4946
4781 btrfs_set_extent_delalloc(inode, page_start, page_end); 4947 /*
4948 * XXX - page_mkwrite gets called every time the page is dirtied, even
4949 * if it was already dirty, so for space accounting reasons we need to
4950 * clear any delalloc bits for the range we are fixing to save. There
4951 * is probably a better way to do this, but for now keep consistent with
4952 * prepare_pages in the normal write path.
4953 */
4954 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
4955 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
4956 GFP_NOFS);
4957
4958 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
4959 if (ret) {
4960 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4961 ret = VM_FAULT_SIGBUS;
4962 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4963 goto out_unlock;
4964 }
4782 ret = 0; 4965 ret = 0;
4783 4966
4784 /* page is wholly or partially inside EOF */ 4967 /* page is wholly or partially inside EOF */
@@ -4801,6 +4984,7 @@ again:
4801 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4984 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4802 4985
4803out_unlock: 4986out_unlock:
4987 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
4804 if (!ret) 4988 if (!ret)
4805 return VM_FAULT_LOCKED; 4989 return VM_FAULT_LOCKED;
4806 unlock_page(page); 4990 unlock_page(page);
@@ -4917,6 +5101,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4917 return NULL; 5101 return NULL;
4918 ei->last_trans = 0; 5102 ei->last_trans = 0;
4919 ei->logged_trans = 0; 5103 ei->logged_trans = 0;
5104 ei->outstanding_extents = 0;
5105 ei->reserved_extents = 0;
5106 spin_lock_init(&ei->accounting_lock);
4920 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 5107 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4921 INIT_LIST_HEAD(&ei->i_orphan); 5108 INIT_LIST_HEAD(&ei->i_orphan);
4922 INIT_LIST_HEAD(&ei->ordered_operations); 5109 INIT_LIST_HEAD(&ei->ordered_operations);
@@ -5070,7 +5257,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5070 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 5257 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5071 return -ENOTEMPTY; 5258 return -ENOTEMPTY;
5072 5259
5073 ret = btrfs_check_metadata_free_space(root); 5260 /*
5261 * 2 items for dir items
5262 * 1 item for orphan entry
5263 * 1 item for ref
5264 */
5265 ret = btrfs_reserve_metadata_space(root, 4);
5074 if (ret) 5266 if (ret)
5075 return ret; 5267 return ret;
5076 5268
@@ -5185,6 +5377,8 @@ out_fail:
5185 5377
5186 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5378 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5187 up_read(&root->fs_info->subvol_sem); 5379 up_read(&root->fs_info->subvol_sem);
5380
5381 btrfs_unreserve_metadata_space(root, 4);
5188 return ret; 5382 return ret;
5189} 5383}
5190 5384
@@ -5256,11 +5450,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5256 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 5450 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5257 return -ENAMETOOLONG; 5451 return -ENAMETOOLONG;
5258 5452
5259 err = btrfs_check_metadata_free_space(root); 5453 /*
5454 * 2 items for inode item and ref
5455 * 2 items for dir items
5456 * 1 item for xattr if selinux is on
5457 */
5458 err = btrfs_reserve_metadata_space(root, 5);
5260 if (err) 5459 if (err)
5261 goto out_fail; 5460 return err;
5262 5461
5263 trans = btrfs_start_transaction(root, 1); 5462 trans = btrfs_start_transaction(root, 1);
5463 if (!trans)
5464 goto out_fail;
5264 btrfs_set_trans_block_group(trans, dir); 5465 btrfs_set_trans_block_group(trans, dir);
5265 5466
5266 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 5467 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -5341,6 +5542,7 @@ out_unlock:
5341 nr = trans->blocks_used; 5542 nr = trans->blocks_used;
5342 btrfs_end_transaction_throttle(trans, root); 5543 btrfs_end_transaction_throttle(trans, root);
5343out_fail: 5544out_fail:
5545 btrfs_unreserve_metadata_space(root, 5);
5344 if (drop_inode) { 5546 if (drop_inode) {
5345 inode_dec_link_count(inode); 5547 inode_dec_link_count(inode);
5346 iput(inode); 5548 iput(inode);
@@ -5362,6 +5564,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5362 5564
5363 while (num_bytes > 0) { 5565 while (num_bytes > 0) {
5364 alloc_size = min(num_bytes, root->fs_info->max_extent); 5566 alloc_size = min(num_bytes, root->fs_info->max_extent);
5567
5568 ret = btrfs_reserve_metadata_space(root, 1);
5569 if (ret)
5570 goto out;
5571
5365 ret = btrfs_reserve_extent(trans, root, alloc_size, 5572 ret = btrfs_reserve_extent(trans, root, alloc_size,
5366 root->sectorsize, 0, alloc_hint, 5573 root->sectorsize, 0, alloc_hint,
5367 (u64)-1, &ins, 1); 5574 (u64)-1, &ins, 1);
@@ -5381,6 +5588,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5381 num_bytes -= ins.offset; 5588 num_bytes -= ins.offset;
5382 cur_offset += ins.offset; 5589 cur_offset += ins.offset;
5383 alloc_hint = ins.objectid + ins.offset; 5590 alloc_hint = ins.objectid + ins.offset;
5591 btrfs_unreserve_metadata_space(root, 1);
5384 } 5592 }
5385out: 5593out:
5386 if (cur_offset > start) { 5594 if (cur_offset > start) {
@@ -5544,7 +5752,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
5544 .permission = btrfs_permission, 5752 .permission = btrfs_permission,
5545}; 5753};
5546 5754
5547static struct file_operations btrfs_dir_file_operations = { 5755static const struct file_operations btrfs_dir_file_operations = {
5548 .llseek = generic_file_llseek, 5756 .llseek = generic_file_llseek,
5549 .read = generic_read_dir, 5757 .read = generic_read_dir,
5550 .readdir = btrfs_real_readdir, 5758 .readdir = btrfs_real_readdir,
@@ -5566,6 +5774,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
5566 .readpage_io_failed_hook = btrfs_io_failed_hook, 5774 .readpage_io_failed_hook = btrfs_io_failed_hook,
5567 .set_bit_hook = btrfs_set_bit_hook, 5775 .set_bit_hook = btrfs_set_bit_hook,
5568 .clear_bit_hook = btrfs_clear_bit_hook, 5776 .clear_bit_hook = btrfs_clear_bit_hook,
5777 .merge_extent_hook = btrfs_merge_extent_hook,
5778 .split_extent_hook = btrfs_split_extent_hook,
5569}; 5779};
5570 5780
5571/* 5781/*
@@ -5632,6 +5842,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
5632 .removexattr = btrfs_removexattr, 5842 .removexattr = btrfs_removexattr,
5633}; 5843};
5634 5844
5635struct dentry_operations btrfs_dentry_operations = { 5845const struct dentry_operations btrfs_dentry_operations = {
5636 .d_delete = btrfs_dentry_delete, 5846 .d_delete = btrfs_dentry_delete,
5637}; 5847};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a8577a7f26ab..cdbb054102b9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,7 +239,13 @@ static noinline int create_subvol(struct btrfs_root *root,
239 u64 index = 0; 239 u64 index = 0;
240 unsigned long nr = 1; 240 unsigned long nr = 1;
241 241
242 ret = btrfs_check_metadata_free_space(root); 242 /*
243 * 1 - inode item
244 * 2 - refs
245 * 1 - root item
246 * 2 - dir items
247 */
248 ret = btrfs_reserve_metadata_space(root, 6);
243 if (ret) 249 if (ret)
244 return ret; 250 return ret;
245 251
@@ -340,6 +346,9 @@ fail:
340 err = btrfs_commit_transaction(trans, root); 346 err = btrfs_commit_transaction(trans, root);
341 if (err && !ret) 347 if (err && !ret)
342 ret = err; 348 ret = err;
349
350 btrfs_unreserve_metadata_space(root, 6);
351 btrfs_btree_balance_dirty(root, nr);
343 return ret; 352 return ret;
344} 353}
345 354
@@ -355,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
355 if (!root->ref_cows) 364 if (!root->ref_cows)
356 return -EINVAL; 365 return -EINVAL;
357 366
358 ret = btrfs_check_metadata_free_space(root); 367 /*
368 * 1 - inode item
369 * 2 - refs
370 * 1 - root item
371 * 2 - dir items
372 */
373 ret = btrfs_reserve_metadata_space(root, 6);
359 if (ret) 374 if (ret)
360 goto fail_unlock; 375 goto fail_unlock;
361 376
362 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 377 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
363 if (!pending_snapshot) { 378 if (!pending_snapshot) {
364 ret = -ENOMEM; 379 ret = -ENOMEM;
380 btrfs_unreserve_metadata_space(root, 6);
365 goto fail_unlock; 381 goto fail_unlock;
366 } 382 }
367 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); 383 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
368 if (!pending_snapshot->name) { 384 if (!pending_snapshot->name) {
369 ret = -ENOMEM; 385 ret = -ENOMEM;
370 kfree(pending_snapshot); 386 kfree(pending_snapshot);
387 btrfs_unreserve_metadata_space(root, 6);
371 goto fail_unlock; 388 goto fail_unlock;
372 } 389 }
373 memcpy(pending_snapshot->name, name, namelen); 390 memcpy(pending_snapshot->name, name, namelen);
@@ -813,6 +830,7 @@ out_up_write:
813out_unlock: 830out_unlock:
814 mutex_unlock(&inode->i_mutex); 831 mutex_unlock(&inode->i_mutex);
815 if (!err) { 832 if (!err) {
833 shrink_dcache_sb(root->fs_info->sb);
816 btrfs_invalidate_inodes(dest); 834 btrfs_invalidate_inodes(dest);
817 d_delete(dentry); 835 d_delete(dentry);
818 } 836 }
@@ -1105,8 +1123,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1105 datao += off - key.offset; 1123 datao += off - key.offset;
1106 datal -= off - key.offset; 1124 datal -= off - key.offset;
1107 } 1125 }
1108 if (key.offset + datao + datal > off + len) 1126
1109 datal = off + len - key.offset - datao; 1127 if (key.offset + datal > off + len)
1128 datal = off + len - key.offset;
1129
1110 /* disko == 0 means it's a hole */ 1130 /* disko == 0 means it's a hole */
1111 if (!disko) 1131 if (!disko)
1112 datao = 0; 1132 datao = 0;
@@ -1215,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
1215 struct inode *inode = fdentry(file)->d_inode; 1235 struct inode *inode = fdentry(file)->d_inode;
1216 struct btrfs_root *root = BTRFS_I(inode)->root; 1236 struct btrfs_root *root = BTRFS_I(inode)->root;
1217 struct btrfs_trans_handle *trans; 1237 struct btrfs_trans_handle *trans;
1218 int ret = 0; 1238 int ret;
1219 1239
1240 ret = -EPERM;
1220 if (!capable(CAP_SYS_ADMIN)) 1241 if (!capable(CAP_SYS_ADMIN))
1221 return -EPERM; 1242 goto out;
1222 1243
1223 if (file->private_data) { 1244 ret = -EINPROGRESS;
1224 ret = -EINPROGRESS; 1245 if (file->private_data)
1225 goto out; 1246 goto out;
1226 }
1227 1247
1228 ret = mnt_want_write(file->f_path.mnt); 1248 ret = mnt_want_write(file->f_path.mnt);
1229 if (ret) 1249 if (ret)
@@ -1233,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
1233 root->fs_info->open_ioctl_trans++; 1253 root->fs_info->open_ioctl_trans++;
1234 mutex_unlock(&root->fs_info->trans_mutex); 1254 mutex_unlock(&root->fs_info->trans_mutex);
1235 1255
1256 ret = -ENOMEM;
1236 trans = btrfs_start_ioctl_transaction(root, 0); 1257 trans = btrfs_start_ioctl_transaction(root, 0);
1237 if (trans) 1258 if (!trans)
1238 file->private_data = trans; 1259 goto out_drop;
1239 else 1260
1240 ret = -ENOMEM; 1261 file->private_data = trans;
1241 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ 1262 return 0;
1263
1264out_drop:
1265 mutex_lock(&root->fs_info->trans_mutex);
1266 root->fs_info->open_ioctl_trans--;
1267 mutex_unlock(&root->fs_info->trans_mutex);
1268 mnt_drop_write(file->f_path.mnt);
1242out: 1269out:
1243 return ret; 1270 return ret;
1244} 1271}
@@ -1254,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file)
1254 struct inode *inode = fdentry(file)->d_inode; 1281 struct inode *inode = fdentry(file)->d_inode;
1255 struct btrfs_root *root = BTRFS_I(inode)->root; 1282 struct btrfs_root *root = BTRFS_I(inode)->root;
1256 struct btrfs_trans_handle *trans; 1283 struct btrfs_trans_handle *trans;
1257 int ret = 0;
1258 1284
1259 trans = file->private_data; 1285 trans = file->private_data;
1260 if (!trans) { 1286 if (!trans)
1261 ret = -EINVAL; 1287 return -EINVAL;
1262 goto out;
1263 }
1264 btrfs_end_transaction(trans, root);
1265 file->private_data = NULL; 1288 file->private_data = NULL;
1266 1289
1290 btrfs_end_transaction(trans, root);
1291
1267 mutex_lock(&root->fs_info->trans_mutex); 1292 mutex_lock(&root->fs_info->trans_mutex);
1268 root->fs_info->open_ioctl_trans--; 1293 root->fs_info->open_ioctl_trans--;
1269 mutex_unlock(&root->fs_info->trans_mutex); 1294 mutex_unlock(&root->fs_info->trans_mutex);
1270 1295
1271 mnt_drop_write(file->f_path.mnt); 1296 mnt_drop_write(file->f_path.mnt);
1272 1297 return 0;
1273out:
1274 return ret;
1275} 1298}
1276 1299
1277long btrfs_ioctl(struct file *file, unsigned int 1300long btrfs_ioctl(struct file *file, unsigned int
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b5d6d24726b0..5799bc46a309 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -306,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode,
306 tree->last = NULL; 306 tree->last = NULL;
307 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 307 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
308 308
309 spin_lock(&BTRFS_I(inode)->accounting_lock);
310 BTRFS_I(inode)->outstanding_extents--;
311 spin_unlock(&BTRFS_I(inode)->accounting_lock);
312 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
313 inode, 1);
314
309 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 315 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
310 list_del_init(&entry->root_extent_list); 316 list_del_init(&entry->root_extent_list);
311 317
@@ -458,7 +464,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
458 * start IO on any dirty ones so the wait doesn't stall waiting 464 * start IO on any dirty ones so the wait doesn't stall waiting
459 * for pdflush to find them 465 * for pdflush to find them
460 */ 466 */
461 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); 467 filemap_fdatawrite_range(inode->i_mapping, start, end);
462 if (wait) { 468 if (wait) {
463 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 469 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
464 &entry->flags)); 470 &entry->flags));
@@ -488,17 +494,15 @@ again:
488 /* start IO across the range first to instantiate any delalloc 494 /* start IO across the range first to instantiate any delalloc
489 * extents 495 * extents
490 */ 496 */
491 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 497 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
492 498
493 /* The compression code will leave pages locked but return from 499 /* The compression code will leave pages locked but return from
494 * writepage without setting the page writeback. Starting again 500 * writepage without setting the page writeback. Starting again
495 * with WB_SYNC_ALL will end up waiting for the IO to actually start. 501 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
496 */ 502 */
497 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 503 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
498 504
499 btrfs_wait_on_page_writeback_range(inode->i_mapping, 505 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
500 start >> PAGE_CACHE_SHIFT,
501 orig_end >> PAGE_CACHE_SHIFT);
502 506
503 end = orig_end; 507 end = orig_end;
504 found = 0; 508 found = 0;
@@ -716,89 +720,6 @@ out:
716} 720}
717 721
718 722
719/**
720 * taken from mm/filemap.c because it isn't exported
721 *
722 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
723 * @mapping: address space structure to write
724 * @start: offset in bytes where the range starts
725 * @end: offset in bytes where the range ends (inclusive)
726 * @sync_mode: enable synchronous operation
727 *
728 * Start writeback against all of a mapping's dirty pages that lie
729 * within the byte offsets <start, end> inclusive.
730 *
731 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
732 * opposed to a regular memory cleansing writeback. The difference between
733 * these two operations is that if a dirty page/buffer is encountered, it must
734 * be waited upon, and not just skipped over.
735 */
736int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
737 loff_t end, int sync_mode)
738{
739 struct writeback_control wbc = {
740 .sync_mode = sync_mode,
741 .nr_to_write = mapping->nrpages * 2,
742 .range_start = start,
743 .range_end = end,
744 };
745 return btrfs_writepages(mapping, &wbc);
746}
747
748/**
749 * taken from mm/filemap.c because it isn't exported
750 *
751 * wait_on_page_writeback_range - wait for writeback to complete
752 * @mapping: target address_space
753 * @start: beginning page index
754 * @end: ending page index
755 *
756 * Wait for writeback to complete against pages indexed by start->end
757 * inclusive
758 */
759int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
760 pgoff_t start, pgoff_t end)
761{
762 struct pagevec pvec;
763 int nr_pages;
764 int ret = 0;
765 pgoff_t index;
766
767 if (end < start)
768 return 0;
769
770 pagevec_init(&pvec, 0);
771 index = start;
772 while ((index <= end) &&
773 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
774 PAGECACHE_TAG_WRITEBACK,
775 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
776 unsigned i;
777
778 for (i = 0; i < nr_pages; i++) {
779 struct page *page = pvec.pages[i];
780
781 /* until radix tree lookup accepts end_index */
782 if (page->index > end)
783 continue;
784
785 wait_on_page_writeback(page);
786 if (PageError(page))
787 ret = -EIO;
788 }
789 pagevec_release(&pvec);
790 cond_resched();
791 }
792
793 /* Check for outstanding write errors */
794 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
795 ret = -ENOSPC;
796 if (test_and_clear_bit(AS_EIO, &mapping->flags))
797 ret = -EIO;
798
799 return ret;
800}
801
802/* 723/*
803 * add a given inode to the list of inodes that must be fully on 724 * add a given inode to the list of inodes that must be fully on
804 * disk before a transaction commit finishes. 725 * disk before a transaction commit finishes.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 993a7ea45c70..f82e87488ca8 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -153,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
153int btrfs_ordered_update_i_size(struct inode *inode, 153int btrfs_ordered_update_i_size(struct inode *inode,
154 struct btrfs_ordered_extent *ordered); 154 struct btrfs_ordered_extent *ordered);
155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
156int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
157 pgoff_t start, pgoff_t end);
158int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
159 loff_t end, int sync_mode);
160int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 156int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
161int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 157int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
162int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 158int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 361ad323faac..cfcc93c93a7b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3518,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3518 BUG_ON(!rc->block_group); 3518 BUG_ON(!rc->block_group);
3519 3519
3520 btrfs_init_workers(&rc->workers, "relocate", 3520 btrfs_init_workers(&rc->workers, "relocate",
3521 fs_info->thread_pool_size); 3521 fs_info->thread_pool_size, NULL);
3522 3522
3523 rc->extent_root = extent_root; 3523 rc->extent_root = extent_root;
3524 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3524 btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
@@ -3701,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3701 mapping_tree_init(&rc->reloc_root_tree); 3701 mapping_tree_init(&rc->reloc_root_tree);
3702 INIT_LIST_HEAD(&rc->reloc_roots); 3702 INIT_LIST_HEAD(&rc->reloc_roots);
3703 btrfs_init_workers(&rc->workers, "relocate", 3703 btrfs_init_workers(&rc->workers, "relocate",
3704 root->fs_info->thread_pool_size); 3704 root->fs_info->thread_pool_size, NULL);
3705 rc->extent_root = root->fs_info->extent_root; 3705 rc->extent_root = root->fs_info->extent_root;
3706 3706
3707 set_reloc_control(rc); 3707 set_reloc_control(rc);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67035385444c..9de9b2236419 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -344,7 +344,9 @@ static int btrfs_fill_super(struct super_block *sb,
344 sb->s_export_op = &btrfs_export_ops; 344 sb->s_export_op = &btrfs_export_ops;
345 sb->s_xattr = btrfs_xattr_handlers; 345 sb->s_xattr = btrfs_xattr_handlers;
346 sb->s_time_gran = 1; 346 sb->s_time_gran = 1;
347#ifdef CONFIG_BTRFS_POSIX_ACL
347 sb->s_flags |= MS_POSIXACL; 348 sb->s_flags |= MS_POSIXACL;
349#endif
348 350
349 tree_root = open_ctree(sb, fs_devices, (char *)data); 351 tree_root = open_ctree(sb, fs_devices, (char *)data);
350 352
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 88f866f85e7a..0b8f36d4400a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -186,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
186 h->alloc_exclude_start = 0; 186 h->alloc_exclude_start = 0;
187 h->delayed_ref_updates = 0; 187 h->delayed_ref_updates = 0;
188 188
189 if (!current->journal_info)
190 current->journal_info = h;
191
189 root->fs_info->running_transaction->use_count++; 192 root->fs_info->running_transaction->use_count++;
190 record_root_in_trans(h, root); 193 record_root_in_trans(h, root);
191 mutex_unlock(&root->fs_info->trans_mutex); 194 mutex_unlock(&root->fs_info->trans_mutex);
@@ -317,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
317 wake_up(&cur_trans->writer_wait); 320 wake_up(&cur_trans->writer_wait);
318 put_transaction(cur_trans); 321 put_transaction(cur_trans);
319 mutex_unlock(&info->trans_mutex); 322 mutex_unlock(&info->trans_mutex);
323
324 if (current->journal_info == trans)
325 current->journal_info = NULL;
320 memset(trans, 0, sizeof(*trans)); 326 memset(trans, 0, sizeof(*trans));
321 kmem_cache_free(btrfs_trans_handle_cachep, trans); 327 kmem_cache_free(btrfs_trans_handle_cachep, trans);
322 328
@@ -743,6 +749,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
743 memcpy(&pending->root_key, &key, sizeof(key)); 749 memcpy(&pending->root_key, &key, sizeof(key));
744fail: 750fail:
745 kfree(new_root_item); 751 kfree(new_root_item);
752 btrfs_unreserve_metadata_space(root, 6);
746 return ret; 753 return ret;
747} 754}
748 755
@@ -1059,6 +1066,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1059 1066
1060 mutex_unlock(&root->fs_info->trans_mutex); 1067 mutex_unlock(&root->fs_info->trans_mutex);
1061 1068
1069 if (current->journal_info == trans)
1070 current->journal_info = NULL;
1071
1062 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1072 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1063 return ret; 1073 return ret;
1064} 1074}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7827841b55cb..4edfdc2acc5f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
137 137
138 mutex_lock(&root->log_mutex); 138 mutex_lock(&root->log_mutex);
139 if (root->log_root) { 139 if (root->log_root) {
140 if (!root->log_start_pid) {
141 root->log_start_pid = current->pid;
142 root->log_multiple_pids = false;
143 } else if (root->log_start_pid != current->pid) {
144 root->log_multiple_pids = true;
145 }
146
140 root->log_batch++; 147 root->log_batch++;
141 atomic_inc(&root->log_writers); 148 atomic_inc(&root->log_writers);
142 mutex_unlock(&root->log_mutex); 149 mutex_unlock(&root->log_mutex);
143 return 0; 150 return 0;
144 } 151 }
152 root->log_multiple_pids = false;
153 root->log_start_pid = current->pid;
145 mutex_lock(&root->fs_info->tree_log_mutex); 154 mutex_lock(&root->fs_info->tree_log_mutex);
146 if (!root->fs_info->log_root_tree) { 155 if (!root->fs_info->log_root_tree) {
147 ret = btrfs_init_log_root_tree(trans, root->fs_info); 156 ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -1985,7 +1994,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1985 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 1994 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
1986 wait_log_commit(trans, root, root->log_transid - 1); 1995 wait_log_commit(trans, root, root->log_transid - 1);
1987 1996
1988 while (1) { 1997 while (root->log_multiple_pids) {
1989 unsigned long batch = root->log_batch; 1998 unsigned long batch = root->log_batch;
1990 mutex_unlock(&root->log_mutex); 1999 mutex_unlock(&root->log_mutex);
1991 schedule_timeout_uninterruptible(1); 2000 schedule_timeout_uninterruptible(1);
@@ -2011,6 +2020,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2011 root->log_batch = 0; 2020 root->log_batch = 0;
2012 root->log_transid++; 2021 root->log_transid++;
2013 log->log_transid = root->log_transid; 2022 log->log_transid = root->log_transid;
2023 root->log_start_pid = 0;
2014 smp_mb(); 2024 smp_mb();
2015 /* 2025 /*
2016 * log tree has been flushed to disk, new modifications of 2026 * log tree has been flushed to disk, new modifications of
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 23e7d36ff325..7eda483d7b5a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
446 goto error; 446 goto error;
447 447
448 device->name = kstrdup(orig_dev->name, GFP_NOFS); 448 device->name = kstrdup(orig_dev->name, GFP_NOFS);
449 if (!device->name) 449 if (!device->name) {
450 kfree(device);
450 goto error; 451 goto error;
452 }
451 453
452 device->devid = orig_dev->devid; 454 device->devid = orig_dev->devid;
453 device->work.func = pending_bios_fn; 455 device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a9d3bf4d2689..b0fc93f95fd0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -260,7 +260,7 @@ err:
260 * attributes are handled directly. 260 * attributes are handled directly.
261 */ 261 */
262struct xattr_handler *btrfs_xattr_handlers[] = { 262struct xattr_handler *btrfs_xattr_handlers[] = {
263#ifdef CONFIG_FS_POSIX_ACL 263#ifdef CONFIG_BTRFS_POSIX_ACL
264 &btrfs_xattr_acl_access_handler, 264 &btrfs_xattr_acl_access_handler,
265 &btrfs_xattr_acl_default_handler, 265 &btrfs_xattr_acl_default_handler,
266#endif 266#endif