aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2009-10-01 23:23:15 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2009-10-01 23:23:15 -0400
commit 0efe5e32c8729ef44b00d9a7203e4c99a6378b27 (patch)
tree 8df3309198b2ab87f549c82c59125d2b106bcdbe /fs/btrfs
parent e6a0a8bfef1094084e53bfaad6d512c23da7a6dd (diff)
parent 9c2693c9243b81802c6860570557165e874779a7 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
  Btrfs: fix data space leak fix
  Btrfs: remove duplicates of filemap_ helpers
  Btrfs: take i_mutex before generic_write_checks
  Btrfs: fix arguments to btrfs_wait_on_page_writeback_range
  Btrfs: fix deadlock with free space handling and user transactions
  Btrfs: fix error cases for ioctl transactions
  Btrfs: Use CONFIG_BTRFS_POSIX_ACL to enable ACL code
  Btrfs: introduce missing kfree
  Btrfs: Fix setting umask when POSIX ACLs are not enabled
  Btrfs: proper -ENOSPC handling
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- fs/btrfs/acl.c 6
-rw-r--r-- fs/btrfs/btrfs_inode.h 8
-rw-r--r-- fs/btrfs/ctree.h 25
-rw-r--r-- fs/btrfs/disk-io.c 10
-rw-r--r-- fs/btrfs/extent-tree.c 391
-rw-r--r-- fs/btrfs/extent_io.c 92
-rw-r--r-- fs/btrfs/extent_io.h 13
-rw-r--r-- fs/btrfs/file.c 33
-rw-r--r-- fs/btrfs/inode.c 235
-rw-r--r-- fs/btrfs/ioctl.c 62
-rw-r--r-- fs/btrfs/ordered-data.c 93
-rw-r--r-- fs/btrfs/ordered-data.h 4
-rw-r--r-- fs/btrfs/super.c 2
-rw-r--r-- fs/btrfs/transaction.c 10
-rw-r--r-- fs/btrfs/volumes.c 4
-rw-r--r-- fs/btrfs/xattr.c 2
16 files changed, 745 insertions, 245 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f128427b995b..69b355ae7f49 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
27#include "btrfs_inode.h" 27#include "btrfs_inode.h"
28#include "xattr.h" 28#include "xattr.h"
29 29
30#ifdef CONFIG_FS_POSIX_ACL 30#ifdef CONFIG_BTRFS_POSIX_ACL
31 31
32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) 32static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
33{ 33{
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
313 .set = btrfs_xattr_acl_access_set, 313 .set = btrfs_xattr_acl_access_set,
314}; 314};
315 315
316#else /* CONFIG_FS_POSIX_ACL */ 316#else /* CONFIG_BTRFS_POSIX_ACL */
317 317
318int btrfs_acl_chmod(struct inode *inode) 318int btrfs_acl_chmod(struct inode *inode)
319{ 319{
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
325 return 0; 325 return 0;
326} 326}
327 327
328#endif /* CONFIG_FS_POSIX_ACL */ 328#endif /* CONFIG_BTRFS_POSIX_ACL */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 82ee56bba299..a54d354cefcb 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -128,6 +128,14 @@ struct btrfs_inode {
128 u64 last_unlink_trans; 128 u64 last_unlink_trans;
129 129
130 /* 130 /*
131 * These two counters are for delalloc metadata reservations. We keep
132 * track of how many extents we've accounted for vs how many extents we
133 * have.
134 */
135 int delalloc_reserved_extents;
136 int delalloc_extents;
137
138 /*
131 * ordered_data_close is set by truncate when a file that used 139 * ordered_data_close is set by truncate when a file that used
132 * to have good data has been truncated to zero. When it is set 140 * to have good data has been truncated to zero. When it is set
133 * the btrfs file release call will add this inode to the 141 * the btrfs file release call will add this inode to the
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4484eb3408af..dd8ced9814c4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -675,18 +675,19 @@ struct btrfs_space_info {
675 current allocations */ 675 current allocations */
676 u64 bytes_readonly; /* total bytes that are read only */ 676 u64 bytes_readonly; /* total bytes that are read only */
677 u64 bytes_super; /* total bytes reserved for the super blocks */ 677 u64 bytes_super; /* total bytes reserved for the super blocks */
678 678 u64 bytes_root; /* the number of bytes needed to commit a
679 /* delalloc accounting */ 679 transaction */
680 u64 bytes_delalloc; /* number of bytes reserved for allocation,
681 this space is not necessarily reserved yet
682 by the allocator */
683 u64 bytes_may_use; /* number of bytes that may be used for 680 u64 bytes_may_use; /* number of bytes that may be used for
684 delalloc */ 681 delalloc/allocations */
682 u64 bytes_delalloc; /* number of bytes currently reserved for
683 delayed allocation */
685 684
686 int full; /* indicates that we cannot allocate any more 685 int full; /* indicates that we cannot allocate any more
687 chunks for this space */ 686 chunks for this space */
688 int force_alloc; /* set if we need to force a chunk alloc for 687 int force_alloc; /* set if we need to force a chunk alloc for
689 this space */ 688 this space */
689 int force_delalloc; /* make people start doing filemap_flush until
690 we're under a threshold */
690 691
691 struct list_head list; 692 struct list_head list;
692 693
@@ -695,6 +696,9 @@ struct btrfs_space_info {
695 spinlock_t lock; 696 spinlock_t lock;
696 struct rw_semaphore groups_sem; 697 struct rw_semaphore groups_sem;
697 atomic_t caching_threads; 698 atomic_t caching_threads;
699
700 int allocating_chunk;
701 wait_queue_head_t wait;
698}; 702};
699 703
700/* 704/*
@@ -2022,7 +2026,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2022void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 2026void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2023void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2027void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2024 2028
2025int btrfs_check_metadata_free_space(struct btrfs_root *root); 2029int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
2030int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
2031int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2032 struct inode *inode, int num_items);
2033int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2034 struct inode *inode, int num_items);
2026int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2035int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2027 u64 bytes); 2036 u64 bytes);
2028void btrfs_free_reserved_data_space(struct btrfs_root *root, 2037void btrfs_free_reserved_data_space(struct btrfs_root *root,
@@ -2357,7 +2366,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
2357int btrfs_sync_fs(struct super_block *sb, int wait); 2366int btrfs_sync_fs(struct super_block *sb, int wait);
2358 2367
2359/* acl.c */ 2368/* acl.c */
2360#ifdef CONFIG_FS_POSIX_ACL 2369#ifdef CONFIG_BTRFS_POSIX_ACL
2361int btrfs_check_acl(struct inode *inode, int mask); 2370int btrfs_check_acl(struct inode *inode, int mask);
2362#else 2371#else
2363#define btrfs_check_acl NULL 2372#define btrfs_check_acl NULL
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 644e796fd643..af0435f79fa6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -822,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
822 822
823int btrfs_write_tree_block(struct extent_buffer *buf) 823int btrfs_write_tree_block(struct extent_buffer *buf)
824{ 824{
825 return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, 825 return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
826 buf->start + buf->len - 1, WB_SYNC_ALL); 826 buf->start + buf->len - 1);
827} 827}
828 828
829int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) 829int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
830{ 830{
831 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, 831 return filemap_fdatawait_range(buf->first_page->mapping,
832 buf->start, buf->start + buf->len - 1); 832 buf->start, buf->start + buf->len - 1);
833} 833}
834 834
835struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, 835struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -1630,7 +1630,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1630 fs_info->sb = sb; 1630 fs_info->sb = sb;
1631 fs_info->max_extent = (u64)-1; 1631 fs_info->max_extent = (u64)-1;
1632 fs_info->max_inline = 8192 * 1024; 1632 fs_info->max_inline = 8192 * 1024;
1633 fs_info->metadata_ratio = 8; 1633 fs_info->metadata_ratio = 0;
1634 1634
1635 fs_info->thread_pool_size = min_t(unsigned long, 1635 fs_info->thread_pool_size = min_t(unsigned long,
1636 num_online_cpus() + 2, 8); 1636 num_online_cpus() + 2, 8);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 993f93ff7ba6..359a754c782c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -68,6 +68,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
68 struct extent_buffer **must_clean); 68 struct extent_buffer **must_clean);
69static int find_next_key(struct btrfs_path *path, int level, 69static int find_next_key(struct btrfs_path *path, int level,
70 struct btrfs_key *key); 70 struct btrfs_key *key);
71static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
72 int dump_block_groups);
71 73
72static noinline int 74static noinline int
73block_group_cache_done(struct btrfs_block_group_cache *cache) 75block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -2765,67 +2767,346 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2765 alloc_target); 2767 alloc_target);
2766} 2768}
2767 2769
2770static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2771{
2772 u64 num_bytes;
2773 int level;
2774
2775 level = BTRFS_MAX_LEVEL - 2;
2776 /*
2777 * NOTE: these calculations are absolutely the worst possible case.
2778 * This assumes that _every_ item we insert will require a new leaf, and
2779 * that the tree has grown to its maximum level size.
2780 */
2781
2782 /*
2783 * for every item we insert we could insert both an extent item and a
2784 * extent ref item. Then for ever item we insert, we will need to cow
2785 * both the original leaf, plus the leaf to the left and right of it.
2786 *
2787 * Unless we are talking about the extent root, then we just want the
2788 * number of items * 2, since we just need the extent item plus its ref.
2789 */
2790 if (root == root->fs_info->extent_root)
2791 num_bytes = num_items * 2;
2792 else
2793 num_bytes = (num_items + (2 * num_items)) * 3;
2794
2795 /*
2796 * num_bytes is total number of leaves we could need times the leaf
2797 * size, and then for every leaf we could end up cow'ing 2 nodes per
2798 * level, down to the leaf level.
2799 */
2800 num_bytes = (num_bytes * root->leafsize) +
2801 (num_bytes * (level * 2)) * root->nodesize;
2802
2803 return num_bytes;
2804}
2805
2768/* 2806/*
2769 * for now this just makes sure we have at least 5% of our metadata space free 2807 * Unreserve metadata space for delalloc. If we have less reserved credits than
2770 * for use. 2808 * we have extents, this function does nothing.
2771 */ 2809 */
2772int btrfs_check_metadata_free_space(struct btrfs_root *root) 2810int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2811 struct inode *inode, int num_items)
2773{ 2812{
2774 struct btrfs_fs_info *info = root->fs_info; 2813 struct btrfs_fs_info *info = root->fs_info;
2775 struct btrfs_space_info *meta_sinfo; 2814 struct btrfs_space_info *meta_sinfo;
2776 u64 alloc_target, thresh; 2815 u64 num_bytes;
2777 int committed = 0, ret; 2816 u64 alloc_target;
2817 bool bug = false;
2778 2818
2779 /* get the space info for where the metadata will live */ 2819 /* get the space info for where the metadata will live */
2780 alloc_target = btrfs_get_alloc_profile(root, 0); 2820 alloc_target = btrfs_get_alloc_profile(root, 0);
2781 meta_sinfo = __find_space_info(info, alloc_target); 2821 meta_sinfo = __find_space_info(info, alloc_target);
2782 if (!meta_sinfo)
2783 goto alloc;
2784 2822
2785again: 2823 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2824 num_items);
2825
2786 spin_lock(&meta_sinfo->lock); 2826 spin_lock(&meta_sinfo->lock);
2787 if (!meta_sinfo->full) 2827 if (BTRFS_I(inode)->delalloc_reserved_extents <=
2788 thresh = meta_sinfo->total_bytes * 80; 2828 BTRFS_I(inode)->delalloc_extents) {
2789 else 2829 spin_unlock(&meta_sinfo->lock);
2790 thresh = meta_sinfo->total_bytes * 95; 2830 return 0;
2831 }
2832
2833 BTRFS_I(inode)->delalloc_reserved_extents--;
2834 BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
2835
2836 if (meta_sinfo->bytes_delalloc < num_bytes) {
2837 bug = true;
2838 meta_sinfo->bytes_delalloc = 0;
2839 } else {
2840 meta_sinfo->bytes_delalloc -= num_bytes;
2841 }
2842 spin_unlock(&meta_sinfo->lock);
2791 2843
2844 BUG_ON(bug);
2845
2846 return 0;
2847}
2848
2849static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2850{
2851 u64 thresh;
2852
2853 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2854 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2855 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2856 meta_sinfo->bytes_may_use;
2857
2858 thresh = meta_sinfo->total_bytes - thresh;
2859 thresh *= 80;
2792 do_div(thresh, 100); 2860 do_div(thresh, 100);
2861 if (thresh <= meta_sinfo->bytes_delalloc)
2862 meta_sinfo->force_delalloc = 1;
2863 else
2864 meta_sinfo->force_delalloc = 0;
2865}
2793 2866
2794 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2867static int maybe_allocate_chunk(struct btrfs_root *root,
2795 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2868 struct btrfs_space_info *info)
2796 meta_sinfo->bytes_super > thresh) { 2869{
2797 struct btrfs_trans_handle *trans; 2870 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2798 if (!meta_sinfo->full) { 2871 struct btrfs_trans_handle *trans;
2799 meta_sinfo->force_alloc = 1; 2872 bool wait = false;
2873 int ret = 0;
2874 u64 min_metadata;
2875 u64 free_space;
2876
2877 free_space = btrfs_super_total_bytes(disk_super);
2878 /*
2879 * we allow the metadata to grow to a max of either 5gb or 5% of the
2880 * space in the volume.
2881 */
2882 min_metadata = min((u64)5 * 1024 * 1024 * 1024,
2883 div64_u64(free_space * 5, 100));
2884 if (info->total_bytes >= min_metadata) {
2885 spin_unlock(&info->lock);
2886 return 0;
2887 }
2888
2889 if (info->full) {
2890 spin_unlock(&info->lock);
2891 return 0;
2892 }
2893
2894 if (!info->allocating_chunk) {
2895 info->force_alloc = 1;
2896 info->allocating_chunk = 1;
2897 init_waitqueue_head(&info->wait);
2898 } else {
2899 wait = true;
2900 }
2901
2902 spin_unlock(&info->lock);
2903
2904 if (wait) {
2905 wait_event(info->wait,
2906 !info->allocating_chunk);
2907 return 1;
2908 }
2909
2910 trans = btrfs_start_transaction(root, 1);
2911 if (!trans) {
2912 ret = -ENOMEM;
2913 goto out;
2914 }
2915
2916 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2917 4096 + 2 * 1024 * 1024,
2918 info->flags, 0);
2919 btrfs_end_transaction(trans, root);
2920 if (ret)
2921 goto out;
2922out:
2923 spin_lock(&info->lock);
2924 info->allocating_chunk = 0;
2925 spin_unlock(&info->lock);
2926 wake_up(&info->wait);
2927
2928 if (ret)
2929 return 0;
2930 return 1;
2931}
2932
2933/*
2934 * Reserve metadata space for delalloc.
2935 */
2936int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2937 struct inode *inode, int num_items)
2938{
2939 struct btrfs_fs_info *info = root->fs_info;
2940 struct btrfs_space_info *meta_sinfo;
2941 u64 num_bytes;
2942 u64 used;
2943 u64 alloc_target;
2944 int flushed = 0;
2945 int force_delalloc;
2946
2947 /* get the space info for where the metadata will live */
2948 alloc_target = btrfs_get_alloc_profile(root, 0);
2949 meta_sinfo = __find_space_info(info, alloc_target);
2950
2951 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2952 num_items);
2953again:
2954 spin_lock(&meta_sinfo->lock);
2955
2956 force_delalloc = meta_sinfo->force_delalloc;
2957
2958 if (unlikely(!meta_sinfo->bytes_root))
2959 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
2960
2961 if (!flushed)
2962 meta_sinfo->bytes_delalloc += num_bytes;
2963
2964 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2965 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2966 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2967 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
2968
2969 if (used > meta_sinfo->total_bytes) {
2970 flushed++;
2971
2972 if (flushed == 1) {
2973 if (maybe_allocate_chunk(root, meta_sinfo))
2974 goto again;
2975 flushed++;
2976 } else {
2800 spin_unlock(&meta_sinfo->lock); 2977 spin_unlock(&meta_sinfo->lock);
2801alloc: 2978 }
2802 trans = btrfs_start_transaction(root, 1);
2803 if (!trans)
2804 return -ENOMEM;
2805 2979
2806 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2980 if (flushed == 2) {
2807 2 * 1024 * 1024, alloc_target, 0); 2981 filemap_flush(inode->i_mapping);
2808 btrfs_end_transaction(trans, root); 2982 goto again;
2809 if (!meta_sinfo) { 2983 } else if (flushed == 3) {
2810 meta_sinfo = __find_space_info(info, 2984 btrfs_start_delalloc_inodes(root);
2811 alloc_target); 2985 btrfs_wait_ordered_extents(root, 0);
2812 }
2813 goto again; 2986 goto again;
2814 } 2987 }
2988 spin_lock(&meta_sinfo->lock);
2989 meta_sinfo->bytes_delalloc -= num_bytes;
2815 spin_unlock(&meta_sinfo->lock); 2990 spin_unlock(&meta_sinfo->lock);
2991 printk(KERN_ERR "enospc, has %d, reserved %d\n",
2992 BTRFS_I(inode)->delalloc_extents,
2993 BTRFS_I(inode)->delalloc_reserved_extents);
2994 dump_space_info(meta_sinfo, 0, 0);
2995 return -ENOSPC;
2996 }
2816 2997
2817 if (!committed) { 2998 BTRFS_I(inode)->delalloc_reserved_extents++;
2818 committed = 1; 2999 check_force_delalloc(meta_sinfo);
2819 trans = btrfs_join_transaction(root, 1); 3000 spin_unlock(&meta_sinfo->lock);
2820 if (!trans) 3001
2821 return -ENOMEM; 3002 if (!flushed && force_delalloc)
2822 ret = btrfs_commit_transaction(trans, root); 3003 filemap_flush(inode->i_mapping);
2823 if (ret) 3004
2824 return ret; 3005 return 0;
3006}
3007
3008/*
3009 * unreserve num_items number of items worth of metadata space. This needs to
3010 * be paired with btrfs_reserve_metadata_space.
3011 *
3012 * NOTE: if you have the option, run this _AFTER_ you do a
3013 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3014 * oprations which will result in more used metadata, so we want to make sure we
3015 * can do that without issue.
3016 */
3017int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3018{
3019 struct btrfs_fs_info *info = root->fs_info;
3020 struct btrfs_space_info *meta_sinfo;
3021 u64 num_bytes;
3022 u64 alloc_target;
3023 bool bug = false;
3024
3025 /* get the space info for where the metadata will live */
3026 alloc_target = btrfs_get_alloc_profile(root, 0);
3027 meta_sinfo = __find_space_info(info, alloc_target);
3028
3029 num_bytes = calculate_bytes_needed(root, num_items);
3030
3031 spin_lock(&meta_sinfo->lock);
3032 if (meta_sinfo->bytes_may_use < num_bytes) {
3033 bug = true;
3034 meta_sinfo->bytes_may_use = 0;
3035 } else {
3036 meta_sinfo->bytes_may_use -= num_bytes;
3037 }
3038 spin_unlock(&meta_sinfo->lock);
3039
3040 BUG_ON(bug);
3041
3042 return 0;
3043}
3044
3045/*
3046 * Reserve some metadata space for use. We'll calculate the worste case number
3047 * of bytes that would be needed to modify num_items number of items. If we
3048 * have space, fantastic, if not, you get -ENOSPC. Please call
3049 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3050 * items you reserved, since whatever metadata you needed should have already
3051 * been allocated.
3052 *
3053 * This will commit the transaction to make more space if we don't have enough
3054 * metadata space. THe only time we don't do this is if we're reserving space
3055 * inside of a transaction, then we will just return -ENOSPC and it is the
3056 * callers responsibility to handle it properly.
3057 */
3058int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3059{
3060 struct btrfs_fs_info *info = root->fs_info;
3061 struct btrfs_space_info *meta_sinfo;
3062 u64 num_bytes;
3063 u64 used;
3064 u64 alloc_target;
3065 int retries = 0;
3066
3067 /* get the space info for where the metadata will live */
3068 alloc_target = btrfs_get_alloc_profile(root, 0);
3069 meta_sinfo = __find_space_info(info, alloc_target);
3070
3071 num_bytes = calculate_bytes_needed(root, num_items);
3072again:
3073 spin_lock(&meta_sinfo->lock);
3074
3075 if (unlikely(!meta_sinfo->bytes_root))
3076 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3077
3078 if (!retries)
3079 meta_sinfo->bytes_may_use += num_bytes;
3080
3081 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3082 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3083 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3084 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3085
3086 if (used > meta_sinfo->total_bytes) {
3087 retries++;
3088 if (retries == 1) {
3089 if (maybe_allocate_chunk(root, meta_sinfo))
3090 goto again;
3091 retries++;
3092 } else {
3093 spin_unlock(&meta_sinfo->lock);
3094 }
3095
3096 if (retries == 2) {
3097 btrfs_start_delalloc_inodes(root);
3098 btrfs_wait_ordered_extents(root, 0);
2825 goto again; 3099 goto again;
2826 } 3100 }
3101 spin_lock(&meta_sinfo->lock);
3102 meta_sinfo->bytes_may_use -= num_bytes;
3103 spin_unlock(&meta_sinfo->lock);
3104
3105 dump_space_info(meta_sinfo, 0, 0);
2827 return -ENOSPC; 3106 return -ENOSPC;
2828 } 3107 }
3108
3109 check_force_delalloc(meta_sinfo);
2829 spin_unlock(&meta_sinfo->lock); 3110 spin_unlock(&meta_sinfo->lock);
2830 3111
2831 return 0; 3112 return 0;
@@ -2888,7 +3169,7 @@ alloc:
2888 spin_unlock(&data_sinfo->lock); 3169 spin_unlock(&data_sinfo->lock);
2889 3170
2890 /* commit the current transaction and try again */ 3171 /* commit the current transaction and try again */
2891 if (!committed) { 3172 if (!committed && !root->fs_info->open_ioctl_trans) {
2892 committed = 1; 3173 committed = 1;
2893 trans = btrfs_join_transaction(root, 1); 3174 trans = btrfs_join_transaction(root, 1);
2894 if (!trans) 3175 if (!trans)
@@ -2916,7 +3197,7 @@ alloc:
2916 BTRFS_I(inode)->reserved_bytes += bytes; 3197 BTRFS_I(inode)->reserved_bytes += bytes;
2917 spin_unlock(&data_sinfo->lock); 3198 spin_unlock(&data_sinfo->lock);
2918 3199
2919 return btrfs_check_metadata_free_space(root); 3200 return 0;
2920} 3201}
2921 3202
2922/* 3203/*
@@ -3015,17 +3296,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3015 BUG_ON(!space_info); 3296 BUG_ON(!space_info);
3016 3297
3017 spin_lock(&space_info->lock); 3298 spin_lock(&space_info->lock);
3018 if (space_info->force_alloc) { 3299 if (space_info->force_alloc)
3019 force = 1; 3300 force = 1;
3020 space_info->force_alloc = 0;
3021 }
3022 if (space_info->full) { 3301 if (space_info->full) {
3023 spin_unlock(&space_info->lock); 3302 spin_unlock(&space_info->lock);
3024 goto out; 3303 goto out;
3025 } 3304 }
3026 3305
3027 thresh = space_info->total_bytes - space_info->bytes_readonly; 3306 thresh = space_info->total_bytes - space_info->bytes_readonly;
3028 thresh = div_factor(thresh, 6); 3307 thresh = div_factor(thresh, 8);
3029 if (!force && 3308 if (!force &&
3030 (space_info->bytes_used + space_info->bytes_pinned + 3309 (space_info->bytes_used + space_info->bytes_pinned +
3031 space_info->bytes_reserved + alloc_bytes) < thresh) { 3310 space_info->bytes_reserved + alloc_bytes) < thresh) {
@@ -3039,7 +3318,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3039 * we keep a reasonable number of metadata chunks allocated in the 3318 * we keep a reasonable number of metadata chunks allocated in the
3040 * FS as well. 3319 * FS as well.
3041 */ 3320 */
3042 if (flags & BTRFS_BLOCK_GROUP_DATA) { 3321 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3043 fs_info->data_chunk_allocations++; 3322 fs_info->data_chunk_allocations++;
3044 if (!(fs_info->data_chunk_allocations % 3323 if (!(fs_info->data_chunk_allocations %
3045 fs_info->metadata_ratio)) 3324 fs_info->metadata_ratio))
@@ -3047,8 +3326,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3047 } 3326 }
3048 3327
3049 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3328 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3329 spin_lock(&space_info->lock);
3050 if (ret) 3330 if (ret)
3051 space_info->full = 1; 3331 space_info->full = 1;
3332 space_info->force_alloc = 0;
3333 spin_unlock(&space_info->lock);
3052out: 3334out:
3053 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3335 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3054 return ret; 3336 return ret;
@@ -4063,21 +4345,32 @@ loop:
4063 return ret; 4345 return ret;
4064} 4346}
4065 4347
4066static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 4348static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4349 int dump_block_groups)
4067{ 4350{
4068 struct btrfs_block_group_cache *cache; 4351 struct btrfs_block_group_cache *cache;
4069 4352
4353 spin_lock(&info->lock);
4070 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4354 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4071 (unsigned long long)(info->total_bytes - info->bytes_used - 4355 (unsigned long long)(info->total_bytes - info->bytes_used -
4072 info->bytes_pinned - info->bytes_reserved), 4356 info->bytes_pinned - info->bytes_reserved -
4357 info->bytes_super),
4073 (info->full) ? "" : "not "); 4358 (info->full) ? "" : "not ");
4074 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4359 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
4075 " may_use=%llu, used=%llu\n", 4360 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
4361 "\n",
4076 (unsigned long long)info->total_bytes, 4362 (unsigned long long)info->total_bytes,
4077 (unsigned long long)info->bytes_pinned, 4363 (unsigned long long)info->bytes_pinned,
4078 (unsigned long long)info->bytes_delalloc, 4364 (unsigned long long)info->bytes_delalloc,
4079 (unsigned long long)info->bytes_may_use, 4365 (unsigned long long)info->bytes_may_use,
4080 (unsigned long long)info->bytes_used); 4366 (unsigned long long)info->bytes_used,
4367 (unsigned long long)info->bytes_root,
4368 (unsigned long long)info->bytes_super,
4369 (unsigned long long)info->bytes_reserved);
4370 spin_unlock(&info->lock);
4371
4372 if (!dump_block_groups)
4373 return;
4081 4374
4082 down_read(&info->groups_sem); 4375 down_read(&info->groups_sem);
4083 list_for_each_entry(cache, &info->block_groups, list) { 4376 list_for_each_entry(cache, &info->block_groups, list) {
@@ -4145,7 +4438,7 @@ again:
4145 printk(KERN_ERR "btrfs allocation failed flags %llu, " 4438 printk(KERN_ERR "btrfs allocation failed flags %llu, "
4146 "wanted %llu\n", (unsigned long long)data, 4439 "wanted %llu\n", (unsigned long long)data,
4147 (unsigned long long)num_bytes); 4440 (unsigned long long)num_bytes);
4148 dump_space_info(sinfo, num_bytes); 4441 dump_space_info(sinfo, num_bytes, 1);
4149 } 4442 }
4150 4443
4151 return ret; 4444 return ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0cb88f8146ea..de1793ba004a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
280 return NULL; 280 return NULL;
281} 281}
282 282
283static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
284 struct extent_state *other)
285{
286 if (tree->ops && tree->ops->merge_extent_hook)
287 tree->ops->merge_extent_hook(tree->mapping->host, new,
288 other);
289}
290
283/* 291/*
284 * utility function to look for merge candidates inside a given range. 292 * utility function to look for merge candidates inside a given range.
285 * Any extents with matching state are merged together into a single 293 * Any extents with matching state are merged together into a single
@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
303 other = rb_entry(other_node, struct extent_state, rb_node); 311 other = rb_entry(other_node, struct extent_state, rb_node);
304 if (other->end == state->start - 1 && 312 if (other->end == state->start - 1 &&
305 other->state == state->state) { 313 other->state == state->state) {
314 merge_cb(tree, state, other);
306 state->start = other->start; 315 state->start = other->start;
307 other->tree = NULL; 316 other->tree = NULL;
308 rb_erase(&other->rb_node, &tree->state); 317 rb_erase(&other->rb_node, &tree->state);
@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
314 other = rb_entry(other_node, struct extent_state, rb_node); 323 other = rb_entry(other_node, struct extent_state, rb_node);
315 if (other->start == state->end + 1 && 324 if (other->start == state->end + 1 &&
316 other->state == state->state) { 325 other->state == state->state) {
326 merge_cb(tree, state, other);
317 other->start = state->start; 327 other->start = state->start;
318 state->tree = NULL; 328 state->tree = NULL;
319 rb_erase(&state->rb_node, &tree->state); 329 rb_erase(&state->rb_node, &tree->state);
320 free_extent_state(state); 330 free_extent_state(state);
331 state = NULL;
321 } 332 }
322 } 333 }
334
323 return 0; 335 return 0;
324} 336}
325 337
326static void set_state_cb(struct extent_io_tree *tree, 338static int set_state_cb(struct extent_io_tree *tree,
327 struct extent_state *state, 339 struct extent_state *state,
328 unsigned long bits) 340 unsigned long bits)
329{ 341{
330 if (tree->ops && tree->ops->set_bit_hook) { 342 if (tree->ops && tree->ops->set_bit_hook) {
331 tree->ops->set_bit_hook(tree->mapping->host, state->start, 343 return tree->ops->set_bit_hook(tree->mapping->host,
332 state->end, state->state, bits); 344 state->start, state->end,
345 state->state, bits);
333 } 346 }
347
348 return 0;
334} 349}
335 350
336static void clear_state_cb(struct extent_io_tree *tree, 351static void clear_state_cb(struct extent_io_tree *tree,
337 struct extent_state *state, 352 struct extent_state *state,
338 unsigned long bits) 353 unsigned long bits)
339{ 354{
340 if (tree->ops && tree->ops->clear_bit_hook) { 355 if (tree->ops && tree->ops->clear_bit_hook)
341 tree->ops->clear_bit_hook(tree->mapping->host, state->start, 356 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
342 state->end, state->state, bits);
343 }
344} 357}
345 358
346/* 359/*
@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
358 int bits) 371 int bits)
359{ 372{
360 struct rb_node *node; 373 struct rb_node *node;
374 int ret;
361 375
362 if (end < start) { 376 if (end < start) {
363 printk(KERN_ERR "btrfs end < start %llu %llu\n", 377 printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -365,11 +379,14 @@ static int insert_state(struct extent_io_tree *tree,
365 (unsigned long long)start); 379 (unsigned long long)start);
366 WARN_ON(1); 380 WARN_ON(1);
367 } 381 }
368 if (bits & EXTENT_DIRTY)
369 tree->dirty_bytes += end - start + 1;
370 state->start = start; 382 state->start = start;
371 state->end = end; 383 state->end = end;
372 set_state_cb(tree, state, bits); 384 ret = set_state_cb(tree, state, bits);
385 if (ret)
386 return ret;
387
388 if (bits & EXTENT_DIRTY)
389 tree->dirty_bytes += end - start + 1;
373 state->state |= bits; 390 state->state |= bits;
374 node = tree_insert(&tree->state, end, &state->rb_node); 391 node = tree_insert(&tree->state, end, &state->rb_node);
375 if (node) { 392 if (node) {
@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
387 return 0; 404 return 0;
388} 405}
389 406
407static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
408 u64 split)
409{
410 if (tree->ops && tree->ops->split_extent_hook)
411 return tree->ops->split_extent_hook(tree->mapping->host,
412 orig, split);
413 return 0;
414}
415
390/* 416/*
391 * split a given extent state struct in two, inserting the preallocated 417 * split a given extent state struct in two, inserting the preallocated
392 * struct 'prealloc' as the newly created second half. 'split' indicates an 418 * struct 'prealloc' as the newly created second half. 'split' indicates an
@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
405 struct extent_state *prealloc, u64 split) 431 struct extent_state *prealloc, u64 split)
406{ 432{
407 struct rb_node *node; 433 struct rb_node *node;
434
435 split_cb(tree, orig, split);
436
408 prealloc->start = orig->start; 437 prealloc->start = orig->start;
409 prealloc->end = split - 1; 438 prealloc->end = split - 1;
410 prealloc->state = orig->state; 439 prealloc->state = orig->state;
@@ -542,8 +571,8 @@ hit_next:
542 if (err) 571 if (err)
543 goto out; 572 goto out;
544 if (state->end <= end) { 573 if (state->end <= end) {
545 set |= clear_state_bit(tree, state, bits, 574 set |= clear_state_bit(tree, state, bits, wake,
546 wake, delete); 575 delete);
547 if (last_end == (u64)-1) 576 if (last_end == (u64)-1)
548 goto out; 577 goto out;
549 start = last_end + 1; 578 start = last_end + 1;
@@ -561,12 +590,11 @@ hit_next:
561 prealloc = alloc_extent_state(GFP_ATOMIC); 590 prealloc = alloc_extent_state(GFP_ATOMIC);
562 err = split_state(tree, state, prealloc, end + 1); 591 err = split_state(tree, state, prealloc, end + 1);
563 BUG_ON(err == -EEXIST); 592 BUG_ON(err == -EEXIST);
564
565 if (wake) 593 if (wake)
566 wake_up(&state->wq); 594 wake_up(&state->wq);
567 595
568 set |= clear_state_bit(tree, prealloc, bits, 596 set |= clear_state_bit(tree, prealloc, bits, wake, delete);
569 wake, delete); 597
570 prealloc = NULL; 598 prealloc = NULL;
571 goto out; 599 goto out;
572 } 600 }
@@ -667,16 +695,23 @@ out:
667 return 0; 695 return 0;
668} 696}
669 697
670static void set_state_bits(struct extent_io_tree *tree, 698static int set_state_bits(struct extent_io_tree *tree,
671 struct extent_state *state, 699 struct extent_state *state,
672 int bits) 700 int bits)
673{ 701{
702 int ret;
703
704 ret = set_state_cb(tree, state, bits);
705 if (ret)
706 return ret;
707
674 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 708 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
675 u64 range = state->end - state->start + 1; 709 u64 range = state->end - state->start + 1;
676 tree->dirty_bytes += range; 710 tree->dirty_bytes += range;
677 } 711 }
678 set_state_cb(tree, state, bits);
679 state->state |= bits; 712 state->state |= bits;
713
714 return 0;
680} 715}
681 716
682static void cache_state(struct extent_state *state, 717static void cache_state(struct extent_state *state,
@@ -758,7 +793,10 @@ hit_next:
758 goto out; 793 goto out;
759 } 794 }
760 795
761 set_state_bits(tree, state, bits); 796 err = set_state_bits(tree, state, bits);
797 if (err)
798 goto out;
799
762 cache_state(state, cached_state); 800 cache_state(state, cached_state);
763 merge_state(tree, state); 801 merge_state(tree, state);
764 if (last_end == (u64)-1) 802 if (last_end == (u64)-1)
@@ -805,7 +843,9 @@ hit_next:
805 if (err) 843 if (err)
806 goto out; 844 goto out;
807 if (state->end <= end) { 845 if (state->end <= end) {
808 set_state_bits(tree, state, bits); 846 err = set_state_bits(tree, state, bits);
847 if (err)
848 goto out;
809 cache_state(state, cached_state); 849 cache_state(state, cached_state);
810 merge_state(tree, state); 850 merge_state(tree, state);
811 if (last_end == (u64)-1) 851 if (last_end == (u64)-1)
@@ -829,11 +869,13 @@ hit_next:
829 this_end = last_start - 1; 869 this_end = last_start - 1;
830 err = insert_state(tree, prealloc, start, this_end, 870 err = insert_state(tree, prealloc, start, this_end,
831 bits); 871 bits);
832 cache_state(prealloc, cached_state);
833 prealloc = NULL;
834 BUG_ON(err == -EEXIST); 872 BUG_ON(err == -EEXIST);
835 if (err) 873 if (err) {
874 prealloc = NULL;
836 goto out; 875 goto out;
876 }
877 cache_state(prealloc, cached_state);
878 prealloc = NULL;
837 start = this_end + 1; 879 start = this_end + 1;
838 goto search_again; 880 goto search_again;
839 } 881 }
@@ -852,7 +894,11 @@ hit_next:
852 err = split_state(tree, state, prealloc, end + 1); 894 err = split_state(tree, state, prealloc, end + 1);
853 BUG_ON(err == -EEXIST); 895 BUG_ON(err == -EEXIST);
854 896
855 set_state_bits(tree, prealloc, bits); 897 err = set_state_bits(tree, prealloc, bits);
898 if (err) {
899 prealloc = NULL;
900 goto out;
901 }
856 cache_state(prealloc, cached_state); 902 cache_state(prealloc, cached_state);
857 merge_state(tree, prealloc); 903 merge_state(tree, prealloc);
858 prealloc = NULL; 904 prealloc = NULL;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 14ed16fd862d..4794ec891fed 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -60,8 +60,13 @@ struct extent_io_ops {
60 struct extent_state *state, int uptodate); 60 struct extent_state *state, int uptodate);
61 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 61 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
62 unsigned long old, unsigned long bits); 62 unsigned long old, unsigned long bits);
63 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, 63 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
64 unsigned long old, unsigned long bits); 64 unsigned long bits);
65 int (*merge_extent_hook)(struct inode *inode,
66 struct extent_state *new,
67 struct extent_state *other);
68 int (*split_extent_hook)(struct inode *inode,
69 struct extent_state *orig, u64 split);
65 int (*write_cache_pages_lock_hook)(struct page *page); 70 int (*write_cache_pages_lock_hook)(struct page *page);
66}; 71};
67 72
@@ -79,10 +84,14 @@ struct extent_state {
79 u64 start; 84 u64 start;
80 u64 end; /* inclusive */ 85 u64 end; /* inclusive */
81 struct rb_node rb_node; 86 struct rb_node rb_node;
87
88 /* ADD NEW ELEMENTS AFTER THIS */
82 struct extent_io_tree *tree; 89 struct extent_io_tree *tree;
83 wait_queue_head_t wq; 90 wait_queue_head_t wq;
84 atomic_t refs; 91 atomic_t refs;
85 unsigned long state; 92 unsigned long state;
93 u64 split_start;
94 u64 split_end;
86 95
87 /* for use by the FS */ 96 /* for use by the FS */
88 u64 private; 97 u64 private;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9ed17dbe5c6e..f19e1259a971 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -123,7 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
124 124
125 end_of_last_block = start_pos + num_bytes - 1; 125 end_of_last_block = start_pos + num_bytes - 1;
126 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); 126 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
127 if (err)
128 return err;
129
127 for (i = 0; i < num_pages; i++) { 130 for (i = 0; i < num_pages; i++) {
128 struct page *p = pages[i]; 131 struct page *p = pages[i];
129 SetPageUptodate(p); 132 SetPageUptodate(p);
@@ -917,21 +920,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
917 start_pos = pos; 920 start_pos = pos;
918 921
919 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 922 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
923
924 /* do the reserve before the mutex lock in case we have to do some
925 * flushing. We wouldn't deadlock, but this is more polite.
926 */
927 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
928 if (err)
929 goto out_nolock;
930
931 mutex_lock(&inode->i_mutex);
932
920 current->backing_dev_info = inode->i_mapping->backing_dev_info; 933 current->backing_dev_info = inode->i_mapping->backing_dev_info;
921 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 934 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
922 if (err) 935 if (err)
923 goto out_nolock; 936 goto out;
937
924 if (count == 0) 938 if (count == 0)
925 goto out_nolock; 939 goto out;
926 940
927 err = file_remove_suid(file); 941 err = file_remove_suid(file);
928 if (err) 942 if (err)
929 goto out_nolock; 943 goto out;
944
930 file_update_time(file); 945 file_update_time(file);
931 946
932 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 947 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
933 948
934 mutex_lock(&inode->i_mutex); 949 /* generic_write_checks can change our pos */
950 start_pos = pos;
951
935 BTRFS_I(inode)->sequence++; 952 BTRFS_I(inode)->sequence++;
936 first_index = pos >> PAGE_CACHE_SHIFT; 953 first_index = pos >> PAGE_CACHE_SHIFT;
937 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 954 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@ -1005,9 +1022,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1005 } 1022 }
1006 1023
1007 if (will_write) { 1024 if (will_write) {
1008 btrfs_fdatawrite_range(inode->i_mapping, pos, 1025 filemap_fdatawrite_range(inode->i_mapping, pos,
1009 pos + write_bytes - 1, 1026 pos + write_bytes - 1);
1010 WB_SYNC_ALL);
1011 } else { 1027 } else {
1012 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1028 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1013 num_pages); 1029 num_pages);
@@ -1028,6 +1044,7 @@ out:
1028 mutex_unlock(&inode->i_mutex); 1044 mutex_unlock(&inode->i_mutex);
1029 if (ret) 1045 if (ret)
1030 err = ret; 1046 err = ret;
1047 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1031 1048
1032out_nolock: 1049out_nolock:
1033 kfree(pages); 1050 kfree(pages);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b9fe06d751c0..112e5aa85892 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1159,6 +1159,83 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1159 return ret; 1159 return ret;
1160} 1160}
1161 1161
1162static int btrfs_split_extent_hook(struct inode *inode,
1163 struct extent_state *orig, u64 split)
1164{
1165 struct btrfs_root *root = BTRFS_I(inode)->root;
1166 u64 size;
1167
1168 if (!(orig->state & EXTENT_DELALLOC))
1169 return 0;
1170
1171 size = orig->end - orig->start + 1;
1172 if (size > root->fs_info->max_extent) {
1173 u64 num_extents;
1174 u64 new_size;
1175
1176 new_size = orig->end - split + 1;
1177 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1178 root->fs_info->max_extent);
1179
1180 /*
1181 * if we break a large extent up then leave delalloc_extents be,
1182 * since we've already accounted for the large extent.
1183 */
1184 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1185 root->fs_info->max_extent) < num_extents)
1186 return 0;
1187 }
1188
1189 BTRFS_I(inode)->delalloc_extents++;
1190
1191 return 0;
1192}
1193
1194/*
1195 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1196 * extents so we can keep track of new extents that are just merged onto old
1197 * extents, such as when we are doing sequential writes, so we can properly
1198 * account for the metadata space we'll need.
1199 */
1200static int btrfs_merge_extent_hook(struct inode *inode,
1201 struct extent_state *new,
1202 struct extent_state *other)
1203{
1204 struct btrfs_root *root = BTRFS_I(inode)->root;
1205 u64 new_size, old_size;
1206 u64 num_extents;
1207
1208 /* not delalloc, ignore it */
1209 if (!(other->state & EXTENT_DELALLOC))
1210 return 0;
1211
1212 old_size = other->end - other->start + 1;
1213 if (new->start < other->start)
1214 new_size = other->end - new->start + 1;
1215 else
1216 new_size = new->end - other->start + 1;
1217
1218 /* we're not bigger than the max, unreserve the space and go */
1219 if (new_size <= root->fs_info->max_extent) {
1220 BTRFS_I(inode)->delalloc_extents--;
1221 return 0;
1222 }
1223
1224 /*
1225 * If we grew by another max_extent, just return, we want to keep that
1226 * reserved amount.
1227 */
1228 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1229 root->fs_info->max_extent);
1230 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1231 root->fs_info->max_extent) > num_extents)
1232 return 0;
1233
1234 BTRFS_I(inode)->delalloc_extents--;
1235
1236 return 0;
1237}
1238
1162/* 1239/*
1163 * extent_io.c set_bit_hook, used to track delayed allocation 1240 * extent_io.c set_bit_hook, used to track delayed allocation
1164 * bytes in this file, and to maintain the list of inodes that 1241 * bytes in this file, and to maintain the list of inodes that
@@ -1167,6 +1244,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1167static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1244static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1168 unsigned long old, unsigned long bits) 1245 unsigned long old, unsigned long bits)
1169{ 1246{
1247
1170 /* 1248 /*
1171 * set_bit and clear bit hooks normally require _irqsave/restore 1249 * set_bit and clear bit hooks normally require _irqsave/restore
1172 * but in this case, we are only testeing for the DELALLOC 1250 * but in this case, we are only testeing for the DELALLOC
@@ -1174,6 +1252,8 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1174 */ 1252 */
1175 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1253 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1176 struct btrfs_root *root = BTRFS_I(inode)->root; 1254 struct btrfs_root *root = BTRFS_I(inode)->root;
1255
1256 BTRFS_I(inode)->delalloc_extents++;
1177 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1257 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1178 spin_lock(&root->fs_info->delalloc_lock); 1258 spin_lock(&root->fs_info->delalloc_lock);
1179 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1259 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1190,22 +1270,27 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1190/* 1270/*
1191 * extent_io.c clear_bit_hook, see set_bit_hook for why 1271 * extent_io.c clear_bit_hook, see set_bit_hook for why
1192 */ 1272 */
1193static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, 1273static int btrfs_clear_bit_hook(struct inode *inode,
1194 unsigned long old, unsigned long bits) 1274 struct extent_state *state, unsigned long bits)
1195{ 1275{
1196 /* 1276 /*
1197 * set_bit and clear bit hooks normally require _irqsave/restore 1277 * set_bit and clear bit hooks normally require _irqsave/restore
1198 * but in this case, we are only testeing for the DELALLOC 1278 * but in this case, we are only testeing for the DELALLOC
1199 * bit, which is only set or cleared with irqs on 1279 * bit, which is only set or cleared with irqs on
1200 */ 1280 */
1201 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1281 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1202 struct btrfs_root *root = BTRFS_I(inode)->root; 1282 struct btrfs_root *root = BTRFS_I(inode)->root;
1203 1283
1284 BTRFS_I(inode)->delalloc_extents--;
1285 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1286
1204 spin_lock(&root->fs_info->delalloc_lock); 1287 spin_lock(&root->fs_info->delalloc_lock);
1205 if (end - start + 1 > root->fs_info->delalloc_bytes) { 1288 if (state->end - state->start + 1 >
1289 root->fs_info->delalloc_bytes) {
1206 printk(KERN_INFO "btrfs warning: delalloc account " 1290 printk(KERN_INFO "btrfs warning: delalloc account "
1207 "%llu %llu\n", 1291 "%llu %llu\n",
1208 (unsigned long long)end - start + 1, 1292 (unsigned long long)
1293 state->end - state->start + 1,
1209 (unsigned long long) 1294 (unsigned long long)
1210 root->fs_info->delalloc_bytes); 1295 root->fs_info->delalloc_bytes);
1211 btrfs_delalloc_free_space(root, inode, (u64)-1); 1296 btrfs_delalloc_free_space(root, inode, (u64)-1);
@@ -1213,9 +1298,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1213 BTRFS_I(inode)->delalloc_bytes = 0; 1298 BTRFS_I(inode)->delalloc_bytes = 0;
1214 } else { 1299 } else {
1215 btrfs_delalloc_free_space(root, inode, 1300 btrfs_delalloc_free_space(root, inode,
1216 end - start + 1); 1301 state->end -
1217 root->fs_info->delalloc_bytes -= end - start + 1; 1302 state->start + 1);
1218 BTRFS_I(inode)->delalloc_bytes -= end - start + 1; 1303 root->fs_info->delalloc_bytes -= state->end -
1304 state->start + 1;
1305 BTRFS_I(inode)->delalloc_bytes -= state->end -
1306 state->start + 1;
1219 } 1307 }
1220 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1308 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1221 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1309 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
@@ -2950,7 +3038,12 @@ again:
2950 goto again; 3038 goto again;
2951 } 3039 }
2952 3040
2953 btrfs_set_extent_delalloc(inode, page_start, page_end); 3041 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
3042 if (ret) {
3043 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3044 goto out_unlock;
3045 }
3046
2954 ret = 0; 3047 ret = 0;
2955 if (offset != PAGE_CACHE_SIZE) { 3048 if (offset != PAGE_CACHE_SIZE) {
2956 kaddr = kmap(page); 3049 kaddr = kmap(page);
@@ -2981,15 +3074,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2981 u64 last_byte; 3074 u64 last_byte;
2982 u64 cur_offset; 3075 u64 cur_offset;
2983 u64 hole_size; 3076 u64 hole_size;
2984 int err; 3077 int err = 0;
2985 3078
2986 if (size <= hole_start) 3079 if (size <= hole_start)
2987 return 0; 3080 return 0;
2988 3081
2989 err = btrfs_check_metadata_free_space(root);
2990 if (err)
2991 return err;
2992
2993 btrfs_truncate_page(inode->i_mapping, inode->i_size); 3082 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2994 3083
2995 while (1) { 3084 while (1) {
@@ -3024,12 +3113,18 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3024 cur_offset, &hint_byte, 1); 3113 cur_offset, &hint_byte, 1);
3025 if (err) 3114 if (err)
3026 break; 3115 break;
3116
3117 err = btrfs_reserve_metadata_space(root, 1);
3118 if (err)
3119 break;
3120
3027 err = btrfs_insert_file_extent(trans, root, 3121 err = btrfs_insert_file_extent(trans, root,
3028 inode->i_ino, cur_offset, 0, 3122 inode->i_ino, cur_offset, 0,
3029 0, hole_size, 0, hole_size, 3123 0, hole_size, 0, hole_size,
3030 0, 0, 0); 3124 0, 0, 0);
3031 btrfs_drop_extent_cache(inode, hole_start, 3125 btrfs_drop_extent_cache(inode, hole_start,
3032 last_byte - 1, 0); 3126 last_byte - 1, 0);
3127 btrfs_unreserve_metadata_space(root, 1);
3033 } 3128 }
3034 free_extent_map(em); 3129 free_extent_map(em);
3035 cur_offset = last_byte; 3130 cur_offset = last_byte;
@@ -3990,11 +4085,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3990 if (!new_valid_dev(rdev)) 4085 if (!new_valid_dev(rdev))
3991 return -EINVAL; 4086 return -EINVAL;
3992 4087
3993 err = btrfs_check_metadata_free_space(root); 4088 /*
4089 * 2 for inode item and ref
4090 * 2 for dir items
4091 * 1 for xattr if selinux is on
4092 */
4093 err = btrfs_reserve_metadata_space(root, 5);
3994 if (err) 4094 if (err)
3995 goto fail; 4095 return err;
3996 4096
3997 trans = btrfs_start_transaction(root, 1); 4097 trans = btrfs_start_transaction(root, 1);
4098 if (!trans)
4099 goto fail;
3998 btrfs_set_trans_block_group(trans, dir); 4100 btrfs_set_trans_block_group(trans, dir);
3999 4101
4000 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4102 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -4032,6 +4134,7 @@ out_unlock:
4032 nr = trans->blocks_used; 4134 nr = trans->blocks_used;
4033 btrfs_end_transaction_throttle(trans, root); 4135 btrfs_end_transaction_throttle(trans, root);
4034fail: 4136fail:
4137 btrfs_unreserve_metadata_space(root, 5);
4035 if (drop_inode) { 4138 if (drop_inode) {
4036 inode_dec_link_count(inode); 4139 inode_dec_link_count(inode);
4037 iput(inode); 4140 iput(inode);
@@ -4052,10 +4155,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4052 u64 objectid; 4155 u64 objectid;
4053 u64 index = 0; 4156 u64 index = 0;
4054 4157
4055 err = btrfs_check_metadata_free_space(root); 4158 /*
4159 * 2 for inode item and ref
4160 * 2 for dir items
4161 * 1 for xattr if selinux is on
4162 */
4163 err = btrfs_reserve_metadata_space(root, 5);
4056 if (err) 4164 if (err)
4057 goto fail; 4165 return err;
4166
4058 trans = btrfs_start_transaction(root, 1); 4167 trans = btrfs_start_transaction(root, 1);
4168 if (!trans)
4169 goto fail;
4059 btrfs_set_trans_block_group(trans, dir); 4170 btrfs_set_trans_block_group(trans, dir);
4060 4171
4061 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4172 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -4096,6 +4207,7 @@ out_unlock:
4096 nr = trans->blocks_used; 4207 nr = trans->blocks_used;
4097 btrfs_end_transaction_throttle(trans, root); 4208 btrfs_end_transaction_throttle(trans, root);
4098fail: 4209fail:
4210 btrfs_unreserve_metadata_space(root, 5);
4099 if (drop_inode) { 4211 if (drop_inode) {
4100 inode_dec_link_count(inode); 4212 inode_dec_link_count(inode);
4101 iput(inode); 4213 iput(inode);
@@ -4118,10 +4230,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4118 if (inode->i_nlink == 0) 4230 if (inode->i_nlink == 0)
4119 return -ENOENT; 4231 return -ENOENT;
4120 4232
4121 btrfs_inc_nlink(inode); 4233 /*
4122 err = btrfs_check_metadata_free_space(root); 4234 * 1 item for inode ref
4235 * 2 items for dir items
4236 */
4237 err = btrfs_reserve_metadata_space(root, 3);
4123 if (err) 4238 if (err)
4124 goto fail; 4239 return err;
4240
4241 btrfs_inc_nlink(inode);
4242
4125 err = btrfs_set_inode_index(dir, &index); 4243 err = btrfs_set_inode_index(dir, &index);
4126 if (err) 4244 if (err)
4127 goto fail; 4245 goto fail;
@@ -4145,6 +4263,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4145 nr = trans->blocks_used; 4263 nr = trans->blocks_used;
4146 btrfs_end_transaction_throttle(trans, root); 4264 btrfs_end_transaction_throttle(trans, root);
4147fail: 4265fail:
4266 btrfs_unreserve_metadata_space(root, 3);
4148 if (drop_inode) { 4267 if (drop_inode) {
4149 inode_dec_link_count(inode); 4268 inode_dec_link_count(inode);
4150 iput(inode); 4269 iput(inode);
@@ -4164,17 +4283,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4164 u64 index = 0; 4283 u64 index = 0;
4165 unsigned long nr = 1; 4284 unsigned long nr = 1;
4166 4285
4167 err = btrfs_check_metadata_free_space(root); 4286 /*
4287 * 2 items for inode and ref
4288 * 2 items for dir items
4289 * 1 for xattr if selinux is on
4290 */
4291 err = btrfs_reserve_metadata_space(root, 5);
4168 if (err) 4292 if (err)
4169 goto out_unlock; 4293 return err;
4170 4294
4171 trans = btrfs_start_transaction(root, 1); 4295 trans = btrfs_start_transaction(root, 1);
4172 btrfs_set_trans_block_group(trans, dir); 4296 if (!trans) {
4173 4297 err = -ENOMEM;
4174 if (IS_ERR(trans)) {
4175 err = PTR_ERR(trans);
4176 goto out_unlock; 4298 goto out_unlock;
4177 } 4299 }
4300 btrfs_set_trans_block_group(trans, dir);
4178 4301
4179 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4302 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4180 if (err) { 4303 if (err) {
@@ -4223,6 +4346,7 @@ out_fail:
4223 btrfs_end_transaction_throttle(trans, root); 4346 btrfs_end_transaction_throttle(trans, root);
4224 4347
4225out_unlock: 4348out_unlock:
4349 btrfs_unreserve_metadata_space(root, 5);
4226 if (drop_on_err) 4350 if (drop_on_err)
4227 iput(inode); 4351 iput(inode);
4228 btrfs_btree_balance_dirty(root, nr); 4352 btrfs_btree_balance_dirty(root, nr);
@@ -4747,6 +4871,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4747 goto out; 4871 goto out;
4748 } 4872 }
4749 4873
4874 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
4875 if (ret) {
4876 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4877 ret = VM_FAULT_SIGBUS;
4878 goto out;
4879 }
4880
4750 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 4881 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
4751again: 4882again:
4752 lock_page(page); 4883 lock_page(page);
@@ -4778,7 +4909,23 @@ again:
4778 goto again; 4909 goto again;
4779 } 4910 }
4780 4911
4781 btrfs_set_extent_delalloc(inode, page_start, page_end); 4912 /*
4913 * XXX - page_mkwrite gets called every time the page is dirtied, even
4914 * if it was already dirty, so for space accounting reasons we need to
4915 * clear any delalloc bits for the range we are fixing to save. There
4916 * is probably a better way to do this, but for now keep consistent with
4917 * prepare_pages in the normal write path.
4918 */
4919 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
4920 EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS);
4921
4922 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
4923 if (ret) {
4924 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4925 ret = VM_FAULT_SIGBUS;
4926 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4927 goto out_unlock;
4928 }
4782 ret = 0; 4929 ret = 0;
4783 4930
4784 /* page is wholly or partially inside EOF */ 4931 /* page is wholly or partially inside EOF */
@@ -4801,6 +4948,7 @@ again:
4801 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4948 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4802 4949
4803out_unlock: 4950out_unlock:
4951 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
4804 if (!ret) 4952 if (!ret)
4805 return VM_FAULT_LOCKED; 4953 return VM_FAULT_LOCKED;
4806 unlock_page(page); 4954 unlock_page(page);
@@ -4917,6 +5065,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4917 return NULL; 5065 return NULL;
4918 ei->last_trans = 0; 5066 ei->last_trans = 0;
4919 ei->logged_trans = 0; 5067 ei->logged_trans = 0;
5068 ei->delalloc_extents = 0;
5069 ei->delalloc_reserved_extents = 0;
4920 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 5070 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4921 INIT_LIST_HEAD(&ei->i_orphan); 5071 INIT_LIST_HEAD(&ei->i_orphan);
4922 INIT_LIST_HEAD(&ei->ordered_operations); 5072 INIT_LIST_HEAD(&ei->ordered_operations);
@@ -5070,7 +5220,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5070 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 5220 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5071 return -ENOTEMPTY; 5221 return -ENOTEMPTY;
5072 5222
5073 ret = btrfs_check_metadata_free_space(root); 5223 /*
5224 * 2 items for dir items
5225 * 1 item for orphan entry
5226 * 1 item for ref
5227 */
5228 ret = btrfs_reserve_metadata_space(root, 4);
5074 if (ret) 5229 if (ret)
5075 return ret; 5230 return ret;
5076 5231
@@ -5185,6 +5340,8 @@ out_fail:
5185 5340
5186 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5341 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5187 up_read(&root->fs_info->subvol_sem); 5342 up_read(&root->fs_info->subvol_sem);
5343
5344 btrfs_unreserve_metadata_space(root, 4);
5188 return ret; 5345 return ret;
5189} 5346}
5190 5347
@@ -5256,11 +5413,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5256 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 5413 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5257 return -ENAMETOOLONG; 5414 return -ENAMETOOLONG;
5258 5415
5259 err = btrfs_check_metadata_free_space(root); 5416 /*
5417 * 2 items for inode item and ref
5418 * 2 items for dir items
5419 * 1 item for xattr if selinux is on
5420 */
5421 err = btrfs_reserve_metadata_space(root, 5);
5260 if (err) 5422 if (err)
5261 goto out_fail; 5423 return err;
5262 5424
5263 trans = btrfs_start_transaction(root, 1); 5425 trans = btrfs_start_transaction(root, 1);
5426 if (!trans)
5427 goto out_fail;
5264 btrfs_set_trans_block_group(trans, dir); 5428 btrfs_set_trans_block_group(trans, dir);
5265 5429
5266 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 5430 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -5341,6 +5505,7 @@ out_unlock:
5341 nr = trans->blocks_used; 5505 nr = trans->blocks_used;
5342 btrfs_end_transaction_throttle(trans, root); 5506 btrfs_end_transaction_throttle(trans, root);
5343out_fail: 5507out_fail:
5508 btrfs_unreserve_metadata_space(root, 5);
5344 if (drop_inode) { 5509 if (drop_inode) {
5345 inode_dec_link_count(inode); 5510 inode_dec_link_count(inode);
5346 iput(inode); 5511 iput(inode);
@@ -5362,6 +5527,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5362 5527
5363 while (num_bytes > 0) { 5528 while (num_bytes > 0) {
5364 alloc_size = min(num_bytes, root->fs_info->max_extent); 5529 alloc_size = min(num_bytes, root->fs_info->max_extent);
5530
5531 ret = btrfs_reserve_metadata_space(root, 1);
5532 if (ret)
5533 goto out;
5534
5365 ret = btrfs_reserve_extent(trans, root, alloc_size, 5535 ret = btrfs_reserve_extent(trans, root, alloc_size,
5366 root->sectorsize, 0, alloc_hint, 5536 root->sectorsize, 0, alloc_hint,
5367 (u64)-1, &ins, 1); 5537 (u64)-1, &ins, 1);
@@ -5381,6 +5551,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5381 num_bytes -= ins.offset; 5551 num_bytes -= ins.offset;
5382 cur_offset += ins.offset; 5552 cur_offset += ins.offset;
5383 alloc_hint = ins.objectid + ins.offset; 5553 alloc_hint = ins.objectid + ins.offset;
5554 btrfs_unreserve_metadata_space(root, 1);
5384 } 5555 }
5385out: 5556out:
5386 if (cur_offset > start) { 5557 if (cur_offset > start) {
@@ -5566,6 +5737,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
5566 .readpage_io_failed_hook = btrfs_io_failed_hook, 5737 .readpage_io_failed_hook = btrfs_io_failed_hook,
5567 .set_bit_hook = btrfs_set_bit_hook, 5738 .set_bit_hook = btrfs_set_bit_hook,
5568 .clear_bit_hook = btrfs_clear_bit_hook, 5739 .clear_bit_hook = btrfs_clear_bit_hook,
5740 .merge_extent_hook = btrfs_merge_extent_hook,
5741 .split_extent_hook = btrfs_split_extent_hook,
5569}; 5742};
5570 5743
5571/* 5744/*
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a8577a7f26ab..9a780c8d0ac8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,7 +239,13 @@ static noinline int create_subvol(struct btrfs_root *root,
239 u64 index = 0; 239 u64 index = 0;
240 unsigned long nr = 1; 240 unsigned long nr = 1;
241 241
242 ret = btrfs_check_metadata_free_space(root); 242 /*
243 * 1 - inode item
244 * 2 - refs
245 * 1 - root item
246 * 2 - dir items
247 */
248 ret = btrfs_reserve_metadata_space(root, 6);
243 if (ret) 249 if (ret)
244 return ret; 250 return ret;
245 251
@@ -340,6 +346,9 @@ fail:
340 err = btrfs_commit_transaction(trans, root); 346 err = btrfs_commit_transaction(trans, root);
341 if (err && !ret) 347 if (err && !ret)
342 ret = err; 348 ret = err;
349
350 btrfs_unreserve_metadata_space(root, 6);
351 btrfs_btree_balance_dirty(root, nr);
343 return ret; 352 return ret;
344} 353}
345 354
@@ -355,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
355 if (!root->ref_cows) 364 if (!root->ref_cows)
356 return -EINVAL; 365 return -EINVAL;
357 366
358 ret = btrfs_check_metadata_free_space(root); 367 /*
368 * 1 - inode item
369 * 2 - refs
370 * 1 - root item
371 * 2 - dir items
372 */
373 ret = btrfs_reserve_metadata_space(root, 6);
359 if (ret) 374 if (ret)
360 goto fail_unlock; 375 goto fail_unlock;
361 376
362 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 377 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
363 if (!pending_snapshot) { 378 if (!pending_snapshot) {
364 ret = -ENOMEM; 379 ret = -ENOMEM;
380 btrfs_unreserve_metadata_space(root, 6);
365 goto fail_unlock; 381 goto fail_unlock;
366 } 382 }
367 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); 383 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
368 if (!pending_snapshot->name) { 384 if (!pending_snapshot->name) {
369 ret = -ENOMEM; 385 ret = -ENOMEM;
370 kfree(pending_snapshot); 386 kfree(pending_snapshot);
387 btrfs_unreserve_metadata_space(root, 6);
371 goto fail_unlock; 388 goto fail_unlock;
372 } 389 }
373 memcpy(pending_snapshot->name, name, namelen); 390 memcpy(pending_snapshot->name, name, namelen);
@@ -1215,15 +1232,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
1215 struct inode *inode = fdentry(file)->d_inode; 1232 struct inode *inode = fdentry(file)->d_inode;
1216 struct btrfs_root *root = BTRFS_I(inode)->root; 1233 struct btrfs_root *root = BTRFS_I(inode)->root;
1217 struct btrfs_trans_handle *trans; 1234 struct btrfs_trans_handle *trans;
1218 int ret = 0; 1235 int ret;
1219 1236
1237 ret = -EPERM;
1220 if (!capable(CAP_SYS_ADMIN)) 1238 if (!capable(CAP_SYS_ADMIN))
1221 return -EPERM; 1239 goto out;
1222 1240
1223 if (file->private_data) { 1241 ret = -EINPROGRESS;
1224 ret = -EINPROGRESS; 1242 if (file->private_data)
1225 goto out; 1243 goto out;
1226 }
1227 1244
1228 ret = mnt_want_write(file->f_path.mnt); 1245 ret = mnt_want_write(file->f_path.mnt);
1229 if (ret) 1246 if (ret)
@@ -1233,12 +1250,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
1233 root->fs_info->open_ioctl_trans++; 1250 root->fs_info->open_ioctl_trans++;
1234 mutex_unlock(&root->fs_info->trans_mutex); 1251 mutex_unlock(&root->fs_info->trans_mutex);
1235 1252
1253 ret = -ENOMEM;
1236 trans = btrfs_start_ioctl_transaction(root, 0); 1254 trans = btrfs_start_ioctl_transaction(root, 0);
1237 if (trans) 1255 if (!trans)
1238 file->private_data = trans; 1256 goto out_drop;
1239 else 1257
1240 ret = -ENOMEM; 1258 file->private_data = trans;
1241 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ 1259 return 0;
1260
1261out_drop:
1262 mutex_lock(&root->fs_info->trans_mutex);
1263 root->fs_info->open_ioctl_trans--;
1264 mutex_unlock(&root->fs_info->trans_mutex);
1265 mnt_drop_write(file->f_path.mnt);
1242out: 1266out:
1243 return ret; 1267 return ret;
1244} 1268}
@@ -1254,24 +1278,20 @@ long btrfs_ioctl_trans_end(struct file *file)
1254 struct inode *inode = fdentry(file)->d_inode; 1278 struct inode *inode = fdentry(file)->d_inode;
1255 struct btrfs_root *root = BTRFS_I(inode)->root; 1279 struct btrfs_root *root = BTRFS_I(inode)->root;
1256 struct btrfs_trans_handle *trans; 1280 struct btrfs_trans_handle *trans;
1257 int ret = 0;
1258 1281
1259 trans = file->private_data; 1282 trans = file->private_data;
1260 if (!trans) { 1283 if (!trans)
1261 ret = -EINVAL; 1284 return -EINVAL;
1262 goto out;
1263 }
1264 btrfs_end_transaction(trans, root);
1265 file->private_data = NULL; 1285 file->private_data = NULL;
1266 1286
1287 btrfs_end_transaction(trans, root);
1288
1267 mutex_lock(&root->fs_info->trans_mutex); 1289 mutex_lock(&root->fs_info->trans_mutex);
1268 root->fs_info->open_ioctl_trans--; 1290 root->fs_info->open_ioctl_trans--;
1269 mutex_unlock(&root->fs_info->trans_mutex); 1291 mutex_unlock(&root->fs_info->trans_mutex);
1270 1292
1271 mnt_drop_write(file->f_path.mnt); 1293 mnt_drop_write(file->f_path.mnt);
1272 1294 return 0;
1273out:
1274 return ret;
1275} 1295}
1276 1296
1277long btrfs_ioctl(struct file *file, unsigned int 1297long btrfs_ioctl(struct file *file, unsigned int
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b5d6d24726b0..897fba835f89 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -458,7 +458,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
458 * start IO on any dirty ones so the wait doesn't stall waiting 458 * start IO on any dirty ones so the wait doesn't stall waiting
459 * for pdflush to find them 459 * for pdflush to find them
460 */ 460 */
461 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); 461 filemap_fdatawrite_range(inode->i_mapping, start, end);
462 if (wait) { 462 if (wait) {
463 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 463 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
464 &entry->flags)); 464 &entry->flags));
@@ -488,17 +488,15 @@ again:
488 /* start IO across the range first to instantiate any delalloc 488 /* start IO across the range first to instantiate any delalloc
489 * extents 489 * extents
490 */ 490 */
491 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 491 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
492 492
493 /* The compression code will leave pages locked but return from 493 /* The compression code will leave pages locked but return from
494 * writepage without setting the page writeback. Starting again 494 * writepage without setting the page writeback. Starting again
495 * with WB_SYNC_ALL will end up waiting for the IO to actually start. 495 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
496 */ 496 */
497 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 497 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
498 498
499 btrfs_wait_on_page_writeback_range(inode->i_mapping, 499 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
500 start >> PAGE_CACHE_SHIFT,
501 orig_end >> PAGE_CACHE_SHIFT);
502 500
503 end = orig_end; 501 end = orig_end;
504 found = 0; 502 found = 0;
@@ -716,89 +714,6 @@ out:
716} 714}
717 715
718 716
719/**
720 * taken from mm/filemap.c because it isn't exported
721 *
722 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
723 * @mapping: address space structure to write
724 * @start: offset in bytes where the range starts
725 * @end: offset in bytes where the range ends (inclusive)
726 * @sync_mode: enable synchronous operation
727 *
728 * Start writeback against all of a mapping's dirty pages that lie
729 * within the byte offsets <start, end> inclusive.
730 *
731 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
732 * opposed to a regular memory cleansing writeback. The difference between
733 * these two operations is that if a dirty page/buffer is encountered, it must
734 * be waited upon, and not just skipped over.
735 */
736int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
737 loff_t end, int sync_mode)
738{
739 struct writeback_control wbc = {
740 .sync_mode = sync_mode,
741 .nr_to_write = mapping->nrpages * 2,
742 .range_start = start,
743 .range_end = end,
744 };
745 return btrfs_writepages(mapping, &wbc);
746}
747
748/**
749 * taken from mm/filemap.c because it isn't exported
750 *
751 * wait_on_page_writeback_range - wait for writeback to complete
752 * @mapping: target address_space
753 * @start: beginning page index
754 * @end: ending page index
755 *
756 * Wait for writeback to complete against pages indexed by start->end
757 * inclusive
758 */
759int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
760 pgoff_t start, pgoff_t end)
761{
762 struct pagevec pvec;
763 int nr_pages;
764 int ret = 0;
765 pgoff_t index;
766
767 if (end < start)
768 return 0;
769
770 pagevec_init(&pvec, 0);
771 index = start;
772 while ((index <= end) &&
773 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
774 PAGECACHE_TAG_WRITEBACK,
775 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
776 unsigned i;
777
778 for (i = 0; i < nr_pages; i++) {
779 struct page *page = pvec.pages[i];
780
781 /* until radix tree lookup accepts end_index */
782 if (page->index > end)
783 continue;
784
785 wait_on_page_writeback(page);
786 if (PageError(page))
787 ret = -EIO;
788 }
789 pagevec_release(&pvec);
790 cond_resched();
791 }
792
793 /* Check for outstanding write errors */
794 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
795 ret = -ENOSPC;
796 if (test_and_clear_bit(AS_EIO, &mapping->flags))
797 ret = -EIO;
798
799 return ret;
800}
801
802/* 717/*
803 * add a given inode to the list of inodes that must be fully on 718 * add a given inode to the list of inodes that must be fully on
804 * disk before a transaction commit finishes. 719 * disk before a transaction commit finishes.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 993a7ea45c70..f82e87488ca8 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -153,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
153int btrfs_ordered_update_i_size(struct inode *inode, 153int btrfs_ordered_update_i_size(struct inode *inode,
154 struct btrfs_ordered_extent *ordered); 154 struct btrfs_ordered_extent *ordered);
155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
156int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
157 pgoff_t start, pgoff_t end);
158int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
159 loff_t end, int sync_mode);
160int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 156int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
161int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 157int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
162int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 158int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67035385444c..9de9b2236419 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -344,7 +344,9 @@ static int btrfs_fill_super(struct super_block *sb,
344 sb->s_export_op = &btrfs_export_ops; 344 sb->s_export_op = &btrfs_export_ops;
345 sb->s_xattr = btrfs_xattr_handlers; 345 sb->s_xattr = btrfs_xattr_handlers;
346 sb->s_time_gran = 1; 346 sb->s_time_gran = 1;
347#ifdef CONFIG_BTRFS_POSIX_ACL
347 sb->s_flags |= MS_POSIXACL; 348 sb->s_flags |= MS_POSIXACL;
349#endif
348 350
349 tree_root = open_ctree(sb, fs_devices, (char *)data); 351 tree_root = open_ctree(sb, fs_devices, (char *)data);
350 352
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 88f866f85e7a..0b8f36d4400a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -186,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
186 h->alloc_exclude_start = 0; 186 h->alloc_exclude_start = 0;
187 h->delayed_ref_updates = 0; 187 h->delayed_ref_updates = 0;
188 188
189 if (!current->journal_info)
190 current->journal_info = h;
191
189 root->fs_info->running_transaction->use_count++; 192 root->fs_info->running_transaction->use_count++;
190 record_root_in_trans(h, root); 193 record_root_in_trans(h, root);
191 mutex_unlock(&root->fs_info->trans_mutex); 194 mutex_unlock(&root->fs_info->trans_mutex);
@@ -317,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
317 wake_up(&cur_trans->writer_wait); 320 wake_up(&cur_trans->writer_wait);
318 put_transaction(cur_trans); 321 put_transaction(cur_trans);
319 mutex_unlock(&info->trans_mutex); 322 mutex_unlock(&info->trans_mutex);
323
324 if (current->journal_info == trans)
325 current->journal_info = NULL;
320 memset(trans, 0, sizeof(*trans)); 326 memset(trans, 0, sizeof(*trans));
321 kmem_cache_free(btrfs_trans_handle_cachep, trans); 327 kmem_cache_free(btrfs_trans_handle_cachep, trans);
322 328
@@ -743,6 +749,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
743 memcpy(&pending->root_key, &key, sizeof(key)); 749 memcpy(&pending->root_key, &key, sizeof(key));
744fail: 750fail:
745 kfree(new_root_item); 751 kfree(new_root_item);
752 btrfs_unreserve_metadata_space(root, 6);
746 return ret; 753 return ret;
747} 754}
748 755
@@ -1059,6 +1066,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1059 1066
1060 mutex_unlock(&root->fs_info->trans_mutex); 1067 mutex_unlock(&root->fs_info->trans_mutex);
1061 1068
1069 if (current->journal_info == trans)
1070 current->journal_info = NULL;
1071
1062 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1072 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1063 return ret; 1073 return ret;
1064} 1074}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 23e7d36ff325..7eda483d7b5a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
446 goto error; 446 goto error;
447 447
448 device->name = kstrdup(orig_dev->name, GFP_NOFS); 448 device->name = kstrdup(orig_dev->name, GFP_NOFS);
449 if (!device->name) 449 if (!device->name) {
450 kfree(device);
450 goto error; 451 goto error;
452 }
451 453
452 device->devid = orig_dev->devid; 454 device->devid = orig_dev->devid;
453 device->work.func = pending_bios_fn; 455 device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a9d3bf4d2689..b0fc93f95fd0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -260,7 +260,7 @@ err:
260 * attributes are handled directly. 260 * attributes are handled directly.
261 */ 261 */
262struct xattr_handler *btrfs_xattr_handlers[] = { 262struct xattr_handler *btrfs_xattr_handlers[] = {
263#ifdef CONFIG_FS_POSIX_ACL 263#ifdef CONFIG_BTRFS_POSIX_ACL
264 &btrfs_xattr_acl_access_handler, 264 &btrfs_xattr_acl_access_handler,
265 &btrfs_xattr_acl_default_handler, 265 &btrfs_xattr_acl_default_handler,
266#endif 266#endif