author    David Sterba <dsterba@suse.cz>  2011-10-24 08:47:57 -0400
committer David Sterba <dsterba@suse.cz>  2011-10-24 08:47:57 -0400
commit    afd582ac8f10382002a72b4d17d9c2db328ed8b8 (patch)
tree      91246c1296c06cc0d5add8d10452e7fb110ed920 /fs
parent    c3b92c8787367a8bb53d57d9789b558f1295cc96 (diff)
parent    016fc6a63e465d5b94e4028f6d05d9703e195428 (diff)
Merge remote-tracking branch 'remotes/josef/for-chris' into btrfs-next-stable
Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/btrfs_inode.h        17
-rw-r--r--  fs/btrfs/ctree.h              56
-rw-r--r--  fs/btrfs/disk-io.c             8
-rw-r--r--  fs/btrfs/extent-tree.c       668
-rw-r--r--  fs/btrfs/extent_io.c         194
-rw-r--r--  fs/btrfs/extent_io.h           3
-rw-r--r--  fs/btrfs/file.c               25
-rw-r--r--  fs/btrfs/free-space-cache.c  902
-rw-r--r--  fs/btrfs/inode-map.c           6
-rw-r--r--  fs/btrfs/inode.c             286
-rw-r--r--  fs/btrfs/ioctl.c              20
-rw-r--r--  fs/btrfs/relocation.c         19
-rw-r--r--  fs/btrfs/super.c             245
-rw-r--r--  fs/btrfs/transaction.c       116
-rw-r--r--  fs/btrfs/volumes.c            39
-rw-r--r--  fs/btrfs/xattr.c              11
16 files changed, 1557 insertions(+), 1058 deletions(-)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd6..5a5d325a3935 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
 	 */
 	u64 delalloc_bytes;
 
-	/* total number of bytes that may be used for this inode for
-	 * delalloc
-	 */
-	u64 reserved_bytes;
-
 	/*
 	 * the size of the file stored in the metadata on disk. data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
 	 */
 	u64 disk_i_size;
 
-	/* flags field from the on disk inode */
-	u32 flags;
-
 	/*
 	 * if this is a directory then index_cnt is the counter for the index
 	 * number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
 	u64 last_unlink_trans;
 
 	/*
+	 * Number of bytes outstanding that are going to need csums.  This is
+	 * used in ENOSPC accounting.
+	 */
+	u64 csum_bytes;
+
+	/* flags field from the on disk inode */
+	u32 flags;
+
+	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such.  outstanding_extents is the number of extent
 	 * items we think we'll end up using, and reserved_extents is the number
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f49..227620993bce 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
 #include <linux/kobject.h>
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
+#include <linux/pagemap.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -772,14 +773,8 @@ struct btrfs_space_info {
 struct btrfs_block_rsv {
 	u64 size;
 	u64 reserved;
-	u64 freed[2];
 	struct btrfs_space_info *space_info;
-	struct list_head list;
 	spinlock_t lock;
-	atomic_t usage;
-	unsigned int priority:8;
-	unsigned int durable:1;
-	unsigned int refill_used:1;
 	unsigned int full:1;
 };
 
@@ -840,10 +835,10 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
-	u64 reserved_pinned;
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
+	u64 cache_generation;
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -899,6 +894,10 @@ struct btrfs_fs_info {
 	spinlock_t block_group_cache_lock;
 	struct rb_root block_group_cache_tree;
 
+	/* keep track of unallocated space */
+	spinlock_t free_chunk_lock;
+	u64 free_chunk_space;
+
 	struct extent_io_tree freed_extents[2];
 	struct extent_io_tree *pinned_extents;
 
@@ -919,11 +918,6 @@ struct btrfs_fs_info {
 
 	struct btrfs_block_rsv empty_block_rsv;
 
-	/* list of block reservations that cross multiple transactions */
-	struct list_head durable_block_rsv_list;
-
-	struct mutex durable_block_rsv_mutex;
-
 	u64 generation;
 	u64 last_trans_committed;
 
@@ -2129,6 +2123,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
 }
 
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+	return mapping_gfp_mask(mapping) & ~__GFP_FS;
+}
+
 /* extent-tree.c */
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 						 unsigned num_items)
@@ -2137,6 +2136,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 		3 * num_items;
 }
 
+/*
+ * Doing a truncate won't result in new nodes or leaves, just what we need for
+ * COW.
+ */
+static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
+						 unsigned num_items)
+{
+	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+		num_items;
+}
+
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
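
The only difference between btrfs_calc_trans_metadata_size() and the new truncate variant is the factor of 3 applied to cover COW splits. For a feel of the numbers, a standalone sketch of the two formulas, assuming 4K leaves/nodes and the usual BTRFS_MAX_LEVEL of 8 (illustrative constants, not values read from a live filesystem):

#include <stdio.h>
#include <stdint.h>

#define LEAFSIZE  4096ULL		/* assumed leaf size */
#define NODESIZE  4096ULL		/* assumed node size */
#define MAX_LEVEL 8			/* BTRFS_MAX_LEVEL */

/* Worst case for a regular update: one leaf plus one node per remaining
 * level, times 3 for splits, per item. */
static uint64_t calc_trans_metadata_size(unsigned num_items)
{
	return (LEAFSIZE + NODESIZE * (MAX_LEVEL - 1)) * 3 * num_items;
}

/* Truncate only COWs the existing path, so the factor of 3 is dropped. */
static uint64_t calc_trunc_metadata_size(unsigned num_items)
{
	return (LEAFSIZE + NODESIZE * (MAX_LEVEL - 1)) * num_items;
}

int main(void)
{
	/* With these sizes one item costs 96K for a regular transaction
	 * reservation but only 32K for a truncate reservation. */
	printf("trans: %llu\n", (unsigned long long)calc_trans_metadata_size(1));
	printf("trunc: %llu\n", (unsigned long long)calc_trunc_metadata_size(1));
	return 0;
}
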
@@ -2196,8 +2206,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 			u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
-				u64 num_bytes, int reserve, int sinfo);
 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2240,25 +2248,20 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
-				 struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
 			u64 num_bytes);
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
+int btrfs_block_rsv_check(struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
-			  u64 min_reserved, int min_factor);
+			  u64 min_reserved);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
 void btrfs_block_rsv_release(struct btrfs_root *root,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes);
-int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
-				    struct btrfs_block_rsv *rsv);
 int btrfs_set_block_group_ro(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2579,11 +2582,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_cleanup(struct btrfs_root *root);
-void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending,
-				u64 *bytes_to_reserve);
-void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending);
 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b3ac662e19..51372a521167 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1648,6 +1648,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
+	spin_lock_init(&fs_info->free_chunk_lock);
 	mutex_init(&fs_info->reloc_mutex);
 
 	init_completion(&fs_info->kobj_unregister);
@@ -1665,8 +1666,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
 	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
 	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
-	INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
-	mutex_init(&fs_info->durable_block_rsv_mutex);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
@@ -1677,6 +1676,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->metadata_ratio = 0;
 	fs_info->defrag_inodes = RB_ROOT;
 	fs_info->trans_no_join = 0;
+	fs_info->free_chunk_space = 0;
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
@@ -2545,8 +2545,6 @@ int close_ctree(struct btrfs_root *root)
 	/* clear out the rbtree of defraggable inodes */
 	btrfs_run_defrag_inodes(root->fs_info);
 
-	btrfs_put_block_group_cache(fs_info);
-
 	/*
 	 * Here come 2 situations when btrfs is broken to flip readonly:
 	 *
@@ -2572,6 +2570,8 @@ int close_ctree(struct btrfs_root *root)
 		printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
+	btrfs_put_block_group_cache(fs_info);
+
 	kthread_stop(root->fs_info->transaction_kthread);
 	kthread_stop(root->fs_info->cleaner_kthread);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a2462f..4eb7d2ba38f8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -52,6 +52,21 @@ enum {
 	CHUNK_ALLOC_LIMITED = 2,
 };
 
+/*
+ * Control how reservations are dealt with.
+ *
+ * RESERVE_FREE - freeing a reservation.
+ * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
+ *   ENOSPC accounting
+ * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
+ *   bytes_may_use as the ENOSPC accounting is done elsewhere
+ */
+enum {
+	RESERVE_FREE = 0,
+	RESERVE_ALLOC = 1,
+	RESERVE_ALLOC_NO_ACCOUNT = 2,
+};
+
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc);
@@ -81,6 +96,8 @@ static int find_next_key(struct btrfs_path *path, int level,
 				 struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 			    int dump_block_groups);
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				       u64 num_bytes, int reserve);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +121,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 	if (atomic_dec_and_test(&cache->count)) {
 		WARN_ON(cache->pinned > 0);
 		WARN_ON(cache->reserved > 0);
-		WARN_ON(cache->reserved_pinned > 0);
 		kfree(cache->free_space_ctl);
 		kfree(cache);
 	}
@@ -465,7 +481,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	 * we likely hold important locks.
 	 */
 	if (trans && (!trans->transaction->in_commit) &&
-	    (root && root != root->fs_info->tree_root)) {
+	    (root && root != root->fs_info->tree_root) &&
+	    btrfs_test_opt(root, SPACE_CACHE)) {
 		spin_lock(&cache->lock);
 		if (cache->cached != BTRFS_CACHE_NO) {
 			spin_unlock(&cache->lock);
@@ -2700,6 +2717,13 @@ again:
 		goto again;
 	}
 
+	/* We've already setup this transaction, go ahead and exit */
+	if (block_group->cache_generation == trans->transid &&
+	    i_size_read(inode)) {
+		dcs = BTRFS_DC_SETUP;
+		goto out_put;
+	}
+
 	/*
 	 * We want to set the generation to 0, that way if anything goes wrong
 	 * from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2773,15 @@ again:
 	if (!ret)
 		dcs = BTRFS_DC_SETUP;
 	btrfs_free_reserved_data_space(inode, num_pages);
+
 out_put:
 	iput(inode);
 out_free:
 	btrfs_release_path(path);
 out:
 	spin_lock(&block_group->lock);
+	if (!ret)
+		block_group->cache_generation = trans->transid;
 	block_group->disk_cache_state = dcs;
 	spin_unlock(&block_group->lock);
 
@@ -3122,16 +3149,13 @@ commit_trans:
 		return -ENOSPC;
 	}
 	data_sinfo->bytes_may_use += bytes;
-	BTRFS_I(inode)->reserved_bytes += bytes;
 	spin_unlock(&data_sinfo->lock);
 
 	return 0;
 }
 
 /*
- * called when we are clearing an delalloc extent from the
- * inode's io_tree or there was an error for whatever reason
- * after calling btrfs_check_data_free_space
+ * Called if we need to clear a data reservation for this inode.
  */
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 {
@@ -3144,7 +3168,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 	data_sinfo = BTRFS_I(inode)->space_info;
 	spin_lock(&data_sinfo->lock);
 	data_sinfo->bytes_may_use -= bytes;
-	BTRFS_I(inode)->reserved_bytes -= bytes;
 	spin_unlock(&data_sinfo->lock);
 }
 
@@ -3165,6 +3188,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 			      struct btrfs_space_info *sinfo, u64 alloc_bytes,
 			      int force)
 {
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
 	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
 	u64 thresh;
@@ -3173,6 +3197,13 @@ static int should_alloc_chunk(struct btrfs_root *root,
 		return 1;
 
 	/*
+	 * We need to take into account the global rsv because for all intents
+	 * and purposes it's used space.  Don't worry about locking the
+	 * global_rsv, it doesn't change except when the transaction commits.
+	 */
+	num_allocated += global_rsv->size;
+
+	/*
 	 * in limited mode, we want to have some free space up to
 	 * about 1% of the FS size.
 	 */
@@ -3303,7 +3334,8 @@ out:
  * shrink metadata reservation for delalloc
  */
 static int shrink_delalloc(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 to_reclaim, int sync)
+			   struct btrfs_root *root, u64 to_reclaim,
+			   bool wait_ordered)
 {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_space_info *space_info;
@@ -3311,7 +3343,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	u64 max_reclaim;
 	u64 reclaimed = 0;
 	long time_left;
-	int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
 	int loops = 0;
 	unsigned long progress;
 
@@ -3319,7 +3351,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	space_info = block_rsv->space_info;
 
 	smp_mb();
-	reserved = space_info->bytes_reserved;
+	reserved = space_info->bytes_may_use;
 	progress = space_info->reservation_progress;
 
 	if (reserved == 0)
@@ -3334,7 +3366,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	}
 
 	max_reclaim = min(reserved, to_reclaim);
-
+	nr_pages = max_t(unsigned long, nr_pages,
+			 max_reclaim >> PAGE_CACHE_SHIFT);
 	while (loops < 1024) {
 		/* have the flusher threads jump in and do some IO */
 		smp_mb();
@@ -3343,9 +3376,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
 
 		spin_lock(&space_info->lock);
-		if (reserved > space_info->bytes_reserved)
-			reclaimed += reserved - space_info->bytes_reserved;
-		reserved = space_info->bytes_reserved;
+		if (reserved > space_info->bytes_may_use)
+			reclaimed += reserved - space_info->bytes_may_use;
+		reserved = space_info->bytes_may_use;
 		spin_unlock(&space_info->lock);
 
 		loops++;
@@ -3356,11 +3389,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		if (trans && trans->transaction->blocked)
 			return -EAGAIN;
 
-		time_left = schedule_timeout_interruptible(1);
+		if (wait_ordered && !trans) {
+			btrfs_wait_ordered_extents(root, 0, 0);
+		} else {
+			time_left = schedule_timeout_interruptible(1);
 
 		/* We were interrupted, exit */
 		if (time_left)
 			break;
+		}
 
 		/* we've kicked the IO a few times, if anything has been freed,
 		 * exit.  There is no sense in looping here for a long time
@@ -3375,35 +3412,39 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		}
 
 	}
-	if (reclaimed >= to_reclaim && !trans)
-		btrfs_wait_ordered_extents(root, 0, 0);
+
 	return reclaimed >= to_reclaim;
 }
 
-/*
- * Retries tells us how many times we've called reserve_metadata_bytes.  The
- * idea is if this is the first call (retries == 0) then we will add to our
- * reserved count if we can't make the allocation in order to hold our place
- * while we go and try and free up space.  That way for retries > 1 we don't try
- * and add space, we just check to see if the amount of unused space is >= the
- * total space, meaning that our reservation is valid.
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - wether or not we can flush to make our reservation
  *
- * However if we don't intend to retry this reservation, pass -1 as retries so
- * that it short circuits this logic.
+ * This will reserve orgi_bytes number of bytes from the space info associated
+ * with the block_rsv.  If there is not enough space it will make an attempt to
+ * flush out space to make room.  It will do this by flushing delalloc if
+ * possible or committing the transaction.  If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
  */
-static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
+static int reserve_metadata_bytes(struct btrfs_root *root,
 				  struct btrfs_block_rsv *block_rsv,
 				  u64 orig_bytes, int flush)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
-	u64 unused;
+	struct btrfs_trans_handle *trans;
+	u64 used;
 	u64 num_bytes = orig_bytes;
 	int retries = 0;
 	int ret = 0;
 	bool committed = false;
 	bool flushing = false;
+	bool wait_ordered = false;
 
+	trans = (struct btrfs_trans_handle *)current->journal_info;
 again:
 	ret = 0;
 	spin_lock(&space_info->lock);
@@ -3431,9 +3472,9 @@ again:
 	}
 
 	ret = -ENOSPC;
-	unused = space_info->bytes_used + space_info->bytes_reserved +
+	used = space_info->bytes_used + space_info->bytes_reserved +
 		space_info->bytes_pinned + space_info->bytes_readonly +
 		space_info->bytes_may_use;
 
 	/*
 	 * The idea here is that we've not already over-reserved the block group
@@ -3442,10 +3483,9 @@ again:
 	 * lets start flushing stuff first and then come back and try to make
 	 * our reservation.
 	 */
-	if (unused <= space_info->total_bytes) {
-		unused = space_info->total_bytes - unused;
-		if (unused >= num_bytes) {
-			space_info->bytes_reserved += orig_bytes;
+	if (used <= space_info->total_bytes) {
+		if (used + orig_bytes <= space_info->total_bytes) {
+			space_info->bytes_may_use += orig_bytes;
 			ret = 0;
 		} else {
 			/*
@@ -3461,10 +3501,60 @@ again:
 			 * amount plus the amount of bytes that we need for this
 			 * reservation.
 			 */
-			num_bytes = unused - space_info->total_bytes +
+			wait_ordered = true;
+			num_bytes = used - space_info->total_bytes +
 				(orig_bytes * (retries + 1));
 		}
 
+	if (ret) {
+		u64 profile = btrfs_get_alloc_profile(root, 0);
+		u64 avail;
+
+		/*
+		 * If we have a lot of space that's pinned, don't bother doing
+		 * the overcommit dance yet and just commit the transaction.
+		 */
+		avail = (space_info->total_bytes - space_info->bytes_used) * 8;
+		do_div(avail, 10);
+		if (space_info->bytes_pinned >= avail && flush && !trans &&
+		    !committed) {
+			space_info->flush = 1;
+			flushing = true;
+			spin_unlock(&space_info->lock);
+			goto commit;
+		}
+
+		spin_lock(&root->fs_info->free_chunk_lock);
+		avail = root->fs_info->free_chunk_space;
+
+		/*
+		 * If we have dup, raid1 or raid10 then only half of the free
+		 * space is actually useable.
+		 */
+		if (profile & (BTRFS_BLOCK_GROUP_DUP |
+			       BTRFS_BLOCK_GROUP_RAID1 |
+			       BTRFS_BLOCK_GROUP_RAID10))
+			avail >>= 1;
+
+		/*
+		 * If we aren't flushing don't let us overcommit too much, say
+		 * 1/8th of the space.  If we can flush, let it overcommit up to
+		 * 1/2 of the space.
+		 */
+		if (flush)
+			avail >>= 3;
+		else
+			avail >>= 1;
+		spin_unlock(&root->fs_info->free_chunk_lock);
+
+		if (used + num_bytes < space_info->total_bytes + avail) {
+			space_info->bytes_may_use += orig_bytes;
+			ret = 0;
+		} else {
+			wait_ordered = true;
+		}
+	}
+
 	/*
 	 * Couldn't make our reservation, save our place so while we're trying
 	 * to reclaim space we can actually use it instead of somebody else
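
To make the overcommit arithmetic above concrete, here is a small userspace sketch; the 8GiB figure, the mirrored flag and the function name are invented, and the shifts are copied verbatim from the hunk:

#include <stdio.h>
#include <stdint.h>

/* Given the unallocated chunk space, how far may a reservation
 * overcommit?  Models the avail computation in the hunk above. */
static uint64_t usable_overcommit(uint64_t free_chunk_space, int mirrored,
				  int can_flush)
{
	uint64_t avail = free_chunk_space;

	if (mirrored)		/* DUP/RAID1/RAID10 keep two copies */
		avail >>= 1;
	/* shifts as they appear in the hunk above */
	if (can_flush)
		avail >>= 3;
	else
		avail >>= 1;
	return avail;
}

int main(void)
{
	/* e.g. 8GiB of unallocated space on a RAID1 filesystem */
	uint64_t free_space = 8ULL << 30;

	printf("no flush: %llu MiB\n",
	       (unsigned long long)(usable_overcommit(free_space, 1, 0) >> 20));
	printf("flush:    %llu MiB\n",
	       (unsigned long long)(usable_overcommit(free_space, 1, 1) >> 20));
	return 0;
}
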
@@ -3484,7 +3574,7 @@ again:
 	 * We do synchronous shrinking since we don't actually unreserve
 	 * metadata until after the IO is completed.
 	 */
-	ret = shrink_delalloc(trans, root, num_bytes, 1);
+	ret = shrink_delalloc(trans, root, num_bytes, wait_ordered);
 	if (ret < 0)
 		goto out;
 
@@ -3496,25 +3586,16 @@ again:
 	 * so go back around and try again.
 	 */
 	if (retries < 2) {
+		wait_ordered = true;
 		retries++;
 		goto again;
 	}
 
-	/*
-	 * Not enough space to be reclaimed, don't bother committing the
-	 * transaction.
-	 */
-	spin_lock(&space_info->lock);
-	if (space_info->bytes_pinned < orig_bytes)
-		ret = -ENOSPC;
-	spin_unlock(&space_info->lock);
-	if (ret)
-		goto out;
-
 	ret = -EAGAIN;
 	if (trans)
 		goto out;
 
+commit:
 	ret = -ENOSPC;
 	if (committed)
 		goto out;
@@ -3542,10 +3623,12 @@ out:
3542static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, 3623static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3543 struct btrfs_root *root) 3624 struct btrfs_root *root)
3544{ 3625{
3545 struct btrfs_block_rsv *block_rsv; 3626 struct btrfs_block_rsv *block_rsv = NULL;
3546 if (root->ref_cows) 3627
3628 if (root->ref_cows || root == root->fs_info->csum_root)
3547 block_rsv = trans->block_rsv; 3629 block_rsv = trans->block_rsv;
3548 else 3630
3631 if (!block_rsv)
3549 block_rsv = root->block_rsv; 3632 block_rsv = root->block_rsv;
3550 3633
3551 if (!block_rsv) 3634 if (!block_rsv)
@@ -3616,7 +3699,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
 	}
 	if (num_bytes) {
 		spin_lock(&space_info->lock);
-		space_info->bytes_reserved -= num_bytes;
+		space_info->bytes_may_use -= num_bytes;
 		space_info->reservation_progress++;
 		spin_unlock(&space_info->lock);
 	}
@@ -3640,9 +3723,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
 {
 	memset(rsv, 0, sizeof(*rsv));
 	spin_lock_init(&rsv->lock);
-	atomic_set(&rsv->usage, 1);
-	rsv->priority = 6;
-	INIT_LIST_HEAD(&rsv->list);
 }
 
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3663,29 +3743,11 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv)
 {
-	if (rsv && atomic_dec_and_test(&rsv->usage)) {
-		btrfs_block_rsv_release(root, rsv, (u64)-1);
-		if (!rsv->durable)
-			kfree(rsv);
-	}
-}
-
-/*
- * make the block_rsv struct be able to capture freed space.
- * the captured space will re-add to the the block_rsv struct
- * after transaction commit
- */
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
-				 struct btrfs_block_rsv *block_rsv)
-{
-	block_rsv->durable = 1;
-	mutex_lock(&fs_info->durable_block_rsv_mutex);
-	list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
-	mutex_unlock(&fs_info->durable_block_rsv_mutex);
+	btrfs_block_rsv_release(root, rsv, (u64)-1);
+	kfree(rsv);
 }
 
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
 			u64 num_bytes)
 {
@@ -3694,7 +3756,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 	if (num_bytes == 0)
 		return 0;
 
-	ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
@@ -3703,55 +3765,52 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct btrfs_block_rsv *block_rsv,
-			  u64 min_reserved, int min_factor)
+int btrfs_block_rsv_check(struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv, int min_factor)
 {
 	u64 num_bytes = 0;
-	int commit_trans = 0;
 	int ret = -ENOSPC;
 
 	if (!block_rsv)
 		return 0;
 
 	spin_lock(&block_rsv->lock);
-	if (min_factor > 0)
-		num_bytes = div_factor(block_rsv->size, min_factor);
-	if (min_reserved > num_bytes)
-		num_bytes = min_reserved;
+	num_bytes = div_factor(block_rsv->size, min_factor);
+	if (block_rsv->reserved >= num_bytes)
+		ret = 0;
+	spin_unlock(&block_rsv->lock);
 
-	if (block_rsv->reserved >= num_bytes) {
+	return ret;
+}
+
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+			   struct btrfs_block_rsv *block_rsv,
+			   u64 min_reserved)
+{
+	u64 num_bytes = 0;
+	int ret = -ENOSPC;
+
+	if (!block_rsv)
+		return 0;
+
+	spin_lock(&block_rsv->lock);
+	num_bytes = min_reserved;
+	if (block_rsv->reserved >= num_bytes)
 		ret = 0;
-	} else {
+	else
 		num_bytes -= block_rsv->reserved;
-		if (block_rsv->durable &&
-		    block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
-			commit_trans = 1;
-	}
 	spin_unlock(&block_rsv->lock);
+
 	if (!ret)
 		return 0;
 
-	if (block_rsv->refill_used) {
-		ret = reserve_metadata_bytes(trans, root, block_rsv,
-					     num_bytes, 0);
-		if (!ret) {
-			block_rsv_add_bytes(block_rsv, num_bytes, 0);
-			return 0;
-		}
-	}
-
-	if (commit_trans) {
-		if (trans)
-			return -EAGAIN;
-		trans = btrfs_join_transaction(root);
-		BUG_ON(IS_ERR(trans));
-		ret = btrfs_commit_transaction(trans, root);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+	if (!ret) {
+		block_rsv_add_bytes(block_rsv, num_bytes, 0);
 		return 0;
 	}
 
-	return -ENOSPC;
+	return ret;
 }
 
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
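
The split above separates a cheap percentage test (btrfs_block_rsv_check) from an absolute-floor refill (btrfs_block_rsv_refill). A toy model of the difference, assuming div_factor() keeps factor tenths of its argument, as it does elsewhere in btrfs:

#include <stdio.h>
#include <stdint.h>

/* btrfs-style div_factor(): keep factor tenths of num. */
static uint64_t div_factor(uint64_t num, int factor)
{
	return num * factor / 10;
}

/* check: only tests whether a percentage of rsv->size is covered */
static int rsv_check(uint64_t size, uint64_t reserved, int min_factor)
{
	return reserved >= div_factor(size, min_factor) ? 0 : -1 /* ENOSPC */;
}

/* refill: compares against an absolute floor; the kernel version would
 * go on to reserve this shortfall */
static uint64_t refill_shortfall(uint64_t reserved, uint64_t min_reserved)
{
	return reserved >= min_reserved ? 0 : min_reserved - reserved;
}

int main(void)
{
	/* A 10MiB rsv with 6MiB reserved passes a 50% check... */
	printf("check(5): %d\n", rsv_check(10 << 20, 6 << 20, 5));
	/* ...but refilling to an 8MiB floor still needs 2MiB more. */
	printf("shortfall: %llu\n",
	       (unsigned long long)refill_shortfall(6 << 20, 8 << 20));
	return 0;
}
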
@@ -3827,12 +3886,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 	if (sinfo->total_bytes > num_bytes) {
 		num_bytes = sinfo->total_bytes - num_bytes;
 		block_rsv->reserved += num_bytes;
-		sinfo->bytes_reserved += num_bytes;
+		sinfo->bytes_may_use += num_bytes;
 	}
 
 	if (block_rsv->reserved >= block_rsv->size) {
 		num_bytes = block_rsv->reserved - block_rsv->size;
-		sinfo->bytes_reserved -= num_bytes;
+		sinfo->bytes_may_use -= num_bytes;
 		sinfo->reservation_progress++;
 		block_rsv->reserved = block_rsv->size;
 		block_rsv->full = 1;
@@ -3848,16 +3907,12 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 	fs_info->chunk_block_rsv.space_info = space_info;
-	fs_info->chunk_block_rsv.priority = 10;
 
 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 	fs_info->global_block_rsv.space_info = space_info;
-	fs_info->global_block_rsv.priority = 10;
-	fs_info->global_block_rsv.refill_used = 1;
 	fs_info->delalloc_block_rsv.space_info = space_info;
 	fs_info->trans_block_rsv.space_info = space_info;
 	fs_info->empty_block_rsv.space_info = space_info;
-	fs_info->empty_block_rsv.priority = 10;
 
 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3865,10 +3920,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
 
-	btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
-
-	btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
-
 	update_global_block_rsv(fs_info);
 }
 
@@ -3883,46 +3934,13 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
 }
 
-int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
-				    struct btrfs_block_rsv *rsv)
-{
-	struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
-	u64 num_bytes;
-	int ret;
-
-	/*
-	 * Truncate should be freeing data, but give us 2 items just in case it
-	 * needs to use some space.  We may want to be smarter about this in the
-	 * future.
-	 */
-	num_bytes = btrfs_calc_trans_metadata_size(root, 2);
-
-	/* We already have enough bytes, just return */
-	if (rsv->reserved >= num_bytes)
-		return 0;
-
-	num_bytes -= rsv->reserved;
-
-	/*
-	 * You should have reserved enough space before hand to do this, so this
-	 * should not fail.
-	 */
-	ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
-	BUG_ON(ret);
-
-	return 0;
-}
-
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root)
 {
 	if (!trans->bytes_reserved)
 		return;
 
-	BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
-	btrfs_block_rsv_release(root, trans->block_rsv,
-				trans->bytes_reserved);
+	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
 	trans->bytes_reserved = 0;
 }
 
@@ -3964,11 +3982,19 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
 
+/**
+ * drop_outstanding_extent - drop an outstanding extent
+ * @inode: the inode we're dropping the extent for
+ *
+ * This is called when we are freeing up an outstanding extent, either called
+ * after an error or after an extent is written.  This will return the number of
+ * reserved extents that need to be freed.  This must be called with
+ * BTRFS_I(inode)->lock held.
+ */
 static unsigned drop_outstanding_extent(struct inode *inode)
 {
 	unsigned dropped_extents = 0;
 
-	spin_lock(&BTRFS_I(inode)->lock);
 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
 	BTRFS_I(inode)->outstanding_extents--;
 
@@ -3978,19 +4004,70 @@ static unsigned drop_outstanding_extent(struct inode *inode)
 	 */
 	if (BTRFS_I(inode)->outstanding_extents >=
 	    BTRFS_I(inode)->reserved_extents)
-		goto out;
+		return 0;
 
 	dropped_extents = BTRFS_I(inode)->reserved_extents -
 		BTRFS_I(inode)->outstanding_extents;
 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
-out:
-	spin_unlock(&BTRFS_I(inode)->lock);
 	return dropped_extents;
 }
 
-static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
+/**
+ * calc_csum_metadata_size - return the amount of metada space that must be
+ *	reserved/free'd for the given bytes.
+ * @inode: the inode we're manipulating
+ * @num_bytes: the number of bytes in question
+ * @reserve: 1 if we are reserving space, 0 if we are freeing space
+ *
+ * This adjusts the number of csum_bytes in the inode and then returns the
+ * correct amount of metadata that must either be reserved or freed.  We
+ * calculate how many checksums we can fit into one leaf and then divide the
+ * number of bytes that will need to be checksumed by this value to figure out
+ * how many checksums will be required.  If we are adding bytes then the number
+ * may go up and we will return the number of additional bytes that must be
+ * reserved.  If it is going down we will return the number of bytes that must
+ * be freed.
+ *
+ * This must be called with BTRFS_I(inode)->lock held.
+ */
+static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
+				   int reserve)
 {
-	return num_bytes >>= 3;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 csum_size;
+	int num_csums_per_leaf;
+	int num_csums;
+	int old_csums;
+
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
+	    BTRFS_I(inode)->csum_bytes == 0)
+		return 0;
+
+	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+	if (reserve)
+		BTRFS_I(inode)->csum_bytes += num_bytes;
+	else
+		BTRFS_I(inode)->csum_bytes -= num_bytes;
+	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+	num_csums_per_leaf = (int)div64_u64(csum_size,
+					    sizeof(struct btrfs_csum_item) +
+					    sizeof(struct btrfs_disk_key));
+	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+	num_csums = num_csums + num_csums_per_leaf - 1;
+	num_csums = num_csums / num_csums_per_leaf;
+
+	old_csums = old_csums + num_csums_per_leaf - 1;
+	old_csums = old_csums / num_csums_per_leaf;
+
+	/* No change, no need to reserve more */
+	if (old_csums == num_csums)
+		return 0;
+
+	if (reserve)
+		return btrfs_calc_trans_metadata_size(root,
+						      num_csums - old_csums);
+
+	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
 }
 
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
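
The ceiling divisions above mean the reservation only grows when the new csum bytes spill into an additional leaf of csum items. A rough userspace model; the sector, leaf and item sizes below are invented stand-ins, not the real on-disk values:

#include <stdio.h>
#include <stdint.h>

#define SECTORSIZE	4096ULL	/* assumed sector size */
#define LEAF_DATA	3995ULL	/* stand-in for BTRFS_LEAF_DATA_SIZE() */
#define ITEM_SIZE	25ULL	/* stand-in for sizeof(struct btrfs_item) */
#define CSUM_SLOT	21ULL	/* stand-in for csum item + disk key */

/* How many extra leaves of csum items do num_bytes of new data need on
 * top of what csum_bytes already requires?  Mirrors the ceil-divides in
 * calc_csum_metadata_size() above. */
static uint64_t extra_csum_leaves(uint64_t csum_bytes, uint64_t num_bytes)
{
	uint64_t per_leaf = (LEAF_DATA - ITEM_SIZE) / CSUM_SLOT;
	uint64_t old = (csum_bytes / SECTORSIZE + per_leaf - 1) / per_leaf;
	uint64_t cur = ((csum_bytes + num_bytes) / SECTORSIZE + per_leaf - 1) /
		       per_leaf;

	return cur - old;	/* 0 means the existing reservation suffices */
}

int main(void)
{
	/* Writing 1MiB on top of 1MiB of outstanding csum bytes: */
	printf("extra leaves: %llu\n",
	       (unsigned long long)extra_csum_leaves(1ULL << 20, 1ULL << 20));
	return 0;
}
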
@@ -3999,9 +4076,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
 	u64 to_reserve = 0;
 	unsigned nr_extents = 0;
+	int flush = 1;
 	int ret;
 
-	if (btrfs_transaction_in_commit(root->fs_info))
+	if (btrfs_is_free_space_inode(root, inode))
+		flush = 0;
+
+	if (flush && btrfs_transaction_in_commit(root->fs_info))
 		schedule_timeout(1);
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4017,18 +4098,29 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
 		to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
 	}
+	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
 	spin_unlock(&BTRFS_I(inode)->lock);
 
-	to_reserve += calc_csum_metadata_size(inode, num_bytes);
-	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
 	if (ret) {
+		u64 to_free = 0;
 		unsigned dropped;
+
+		spin_lock(&BTRFS_I(inode)->lock);
+		dropped = drop_outstanding_extent(inode);
+		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+		spin_unlock(&BTRFS_I(inode)->lock);
+		to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
 		/*
-		 * We don't need the return value since our reservation failed,
-		 * we just need to clean up our counter.
+		 * Somebody could have come in and twiddled with the
+		 * reservation, so if we have to free more than we would have
+		 * reserved from this reservation go ahead and release those
+		 * bytes.
 		 */
-		dropped = drop_outstanding_extent(inode);
-		WARN_ON(dropped > 1);
+		to_free -= to_reserve;
+		if (to_free)
+			btrfs_block_rsv_release(root, block_rsv, to_free);
 		return ret;
 	}
 
@@ -4037,6 +4129,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	return 0;
 }
 
+/**
+ * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
+ * @inode: the inode to release the reservation for
+ * @num_bytes: the number of bytes we're releasing
+ *
+ * This will release the metadata reservation for an inode.  This can be called
+ * once we complete IO for a given set of bytes to release their metadata
+ * reservations.
+ */
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4044,9 +4145,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 	unsigned dropped;
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
+	spin_lock(&BTRFS_I(inode)->lock);
 	dropped = drop_outstanding_extent(inode);
 
-	to_free = calc_csum_metadata_size(inode, num_bytes);
+	to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+	spin_unlock(&BTRFS_I(inode)->lock);
 	if (dropped > 0)
 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
@@ -4054,6 +4157,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 				to_free);
 }
 
+/**
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * @inode: inode we're writing to
+ * @num_bytes: the number of bytes we want to allocate
+ *
+ * This will do the following things
+ *
+ * o reserve space in the data space info for num_bytes
+ * o reserve space in the metadata space info based on number of outstanding
+ *   extents and how much csums will be needed
+ * o add to the inodes ->delalloc_bytes
+ * o add it to the fs_info's delalloc inodes list.
+ *
+ * This will return 0 for success and -ENOSPC if there is no space left.
+ */
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
 {
 	int ret;
@@ -4071,6 +4189,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
 	return 0;
 }
 
+/**
+ * btrfs_delalloc_release_space - release data and metadata space for delalloc
+ * @inode: inode we're releasing space for
+ * @num_bytes: the number of bytes we want to free up
+ *
+ * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
+ * called in the case that we don't need the metadata AND data reservations
+ * anymore.  So if there is an error or we insert an inline extent.
+ *
+ * This function will release the metadata space that was not used and will
+ * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
+ * list if there are no delalloc bytes left.
+ */
 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
 {
 	btrfs_delalloc_release_metadata(inode, num_bytes);
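
The reserve/release pair above is meant to bracket a buffered write. Below is a hedged userspace mock of that calling pattern; the btrfs_* bodies here are stand-ins and do_the_actual_write() is hypothetical:

#include <stdio.h>

typedef unsigned long long u64;
struct inode { u64 reserved; };

static int btrfs_delalloc_reserve_space(struct inode *inode, u64 n)
{
	inode->reserved += n;	/* stand-in: data + metadata reservation */
	return 0;
}

static void btrfs_delalloc_release_space(struct inode *inode, u64 n)
{
	inode->reserved -= n;	/* stand-in: unwind both reservations */
}

static int do_the_actual_write(struct inode *inode, u64 n)
{
	(void)inode;
	return n == 0 ? -1 : 0;	/* hypothetical write step */
}

static int buffered_write(struct inode *inode, u64 num_bytes)
{
	int ret = btrfs_delalloc_reserve_space(inode, num_bytes);

	if (ret)		/* -ENOSPC: nothing was reserved */
		return ret;

	ret = do_the_actual_write(inode, num_bytes);
	if (ret) {
		/* error path: drop the data and metadata reservations */
		btrfs_delalloc_release_space(inode, num_bytes);
		return ret;
	}
	/* success: the metadata half is released later, as ordered IO
	 * completes, via btrfs_delalloc_release_metadata() */
	return 0;
}

int main(void)
{
	struct inode ino = { 0 };

	printf("write: %d, still reserved: %llu\n",
	       buffered_write(&ino, 4096), ino.reserved);
	return 0;
}
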
@@ -4123,7 +4254,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	spin_lock(&cache->space_info->lock);
 	spin_lock(&cache->lock);
 
-	if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
+	if (btrfs_test_opt(root, SPACE_CACHE) &&
 	    cache->disk_cache_state < BTRFS_DC_CLEAR)
 		cache->disk_cache_state = BTRFS_DC_CLEAR;
 
@@ -4135,7 +4266,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		btrfs_set_block_group_used(&cache->item, old_val);
 		cache->reserved -= num_bytes;
 		cache->space_info->bytes_reserved -= num_bytes;
-		cache->space_info->reservation_progress++;
 		cache->space_info->bytes_used += num_bytes;
 		cache->space_info->disk_used += num_bytes * factor;
 		spin_unlock(&cache->lock);
@@ -4187,7 +4317,6 @@ static int pin_down_extent(struct btrfs_root *root,
 	if (reserved) {
 		cache->reserved -= num_bytes;
 		cache->space_info->bytes_reserved -= num_bytes;
-		cache->space_info->reservation_progress++;
 	}
 	spin_unlock(&cache->lock);
 	spin_unlock(&cache->space_info->lock);
@@ -4214,46 +4343,55 @@ int btrfs_pin_extent(struct btrfs_root *root,
4214 return 0; 4343 return 0;
4215} 4344}
4216 4345
4217/* 4346/**
4218 * update size of reserved extents. this function may return -EAGAIN 4347 * btrfs_update_reserved_bytes - update the block_group and space info counters
4219 * if 'reserve' is true or 'sinfo' is false. 4348 * @cache: The cache we are manipulating
4349 * @num_bytes: The number of bytes in question
4350 * @reserve: One of the reservation enums
4351 *
4352 * This is called by the allocator when it reserves space, or by somebody who is
4353 * freeing space that was never actually used on disk. For example if you
4354 * reserve some space for a new leaf in transaction A and before transaction A
4355 * commits you free that leaf, you call this with reserve set to 0 in order to
4356 * clear the reservation.
4357 *
4358 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4359 * ENOSPC accounting. For data we handle the reservation through clearing the
4360 * delalloc bits in the io_tree. We have to do this since we could end up
4361 * allocating less disk space for the amount of data we have reserved in the
4362 * case of compression.
4363 *
4364 * If this is a reservation and the block group has become read only we cannot
4365 * make the reservation and return -EAGAIN, otherwise this function always
4366 * succeeds.
4220 */ 4367 */
4221int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 4368static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4222 u64 num_bytes, int reserve, int sinfo) 4369 u64 num_bytes, int reserve)
4223{ 4370{
4371 struct btrfs_space_info *space_info = cache->space_info;
4224 int ret = 0; 4372 int ret = 0;
4225 if (sinfo) { 4373 spin_lock(&space_info->lock);
4226 struct btrfs_space_info *space_info = cache->space_info; 4374 spin_lock(&cache->lock);
4227 spin_lock(&space_info->lock); 4375 if (reserve != RESERVE_FREE) {
4228 spin_lock(&cache->lock);
4229 if (reserve) {
4230 if (cache->ro) {
4231 ret = -EAGAIN;
4232 } else {
4233 cache->reserved += num_bytes;
4234 space_info->bytes_reserved += num_bytes;
4235 }
4236 } else {
4237 if (cache->ro)
4238 space_info->bytes_readonly += num_bytes;
4239 cache->reserved -= num_bytes;
4240 space_info->bytes_reserved -= num_bytes;
4241 space_info->reservation_progress++;
4242 }
4243 spin_unlock(&cache->lock);
4244 spin_unlock(&space_info->lock);
4245 } else {
4246 spin_lock(&cache->lock);
4247 if (cache->ro) { 4376 if (cache->ro) {
4248 ret = -EAGAIN; 4377 ret = -EAGAIN;
4249 } else { 4378 } else {
4250 if (reserve) 4379 cache->reserved += num_bytes;
4251 cache->reserved += num_bytes; 4380 space_info->bytes_reserved += num_bytes;
4252 else 4381 if (reserve == RESERVE_ALLOC) {
4253 cache->reserved -= num_bytes; 4382 BUG_ON(space_info->bytes_may_use < num_bytes);
4383 space_info->bytes_may_use -= num_bytes;
4384 }
4254 } 4385 }
4255 spin_unlock(&cache->lock); 4386 } else {
4387 if (cache->ro)
4388 space_info->bytes_readonly += num_bytes;
4389 cache->reserved -= num_bytes;
4390 space_info->bytes_reserved -= num_bytes;
4391 space_info->reservation_progress++;
4256 } 4392 }
4393 spin_unlock(&cache->lock);
4394 spin_unlock(&space_info->lock);
4257 return ret; 4395 return ret;
4258} 4396}
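
The new reserve argument drives three distinct counter moves. A minimal
standalone sketch of the same rules, assuming the enum values shown here
(only the names RESERVE_FREE, RESERVE_ALLOC and RESERVE_ALLOC_NO_ACCOUNT
appear in this series; locking is omitted):

	enum { RESERVE_FREE = 0, RESERVE_ALLOC = 1, RESERVE_ALLOC_NO_ACCOUNT = 2 };

	/* Mirrors btrfs_update_reserved_bytes(): reserving against a
	 * read-only group fails with -EAGAIN; RESERVE_ALLOC additionally
	 * migrates the bytes out of bytes_may_use; RESERVE_FREE undoes a
	 * reservation and credits read-only groups via bytes_readonly. */
	static int update_reserved_sketch(u64 *reserved, u64 *may_use,
					  u64 *readonly, int ro,
					  u64 num_bytes, int reserve)
	{
		if (reserve != RESERVE_FREE) {
			if (ro)
				return -EAGAIN;
			*reserved += num_bytes;
			if (reserve == RESERVE_ALLOC)
				*may_use -= num_bytes;
		} else {
			if (ro)
				*readonly += num_bytes;
			*reserved -= num_bytes;
		}
		return 0;
	}
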
4259 4397
@@ -4319,13 +4457,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4319 spin_lock(&cache->lock); 4457 spin_lock(&cache->lock);
4320 cache->pinned -= len; 4458 cache->pinned -= len;
4321 cache->space_info->bytes_pinned -= len; 4459 cache->space_info->bytes_pinned -= len;
4322 if (cache->ro) { 4460 if (cache->ro)
4323 cache->space_info->bytes_readonly += len; 4461 cache->space_info->bytes_readonly += len;
4324 } else if (cache->reserved_pinned > 0) {
4325 len = min(len, cache->reserved_pinned);
4326 cache->reserved_pinned -= len;
4327 cache->space_info->bytes_reserved += len;
4328 }
4329 spin_unlock(&cache->lock); 4462 spin_unlock(&cache->lock);
4330 spin_unlock(&cache->space_info->lock); 4463 spin_unlock(&cache->space_info->lock);
4331 } 4464 }
@@ -4340,11 +4473,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4340{ 4473{
4341 struct btrfs_fs_info *fs_info = root->fs_info; 4474 struct btrfs_fs_info *fs_info = root->fs_info;
4342 struct extent_io_tree *unpin; 4475 struct extent_io_tree *unpin;
4343 struct btrfs_block_rsv *block_rsv;
4344 struct btrfs_block_rsv *next_rsv;
4345 u64 start; 4476 u64 start;
4346 u64 end; 4477 u64 end;
4347 int idx;
4348 int ret; 4478 int ret;
4349 4479
4350 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4480 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4367,30 +4497,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4367 cond_resched(); 4497 cond_resched();
4368 } 4498 }
4369 4499
4370 mutex_lock(&fs_info->durable_block_rsv_mutex);
4371 list_for_each_entry_safe(block_rsv, next_rsv,
4372 &fs_info->durable_block_rsv_list, list) {
4373
4374 idx = trans->transid & 0x1;
4375 if (block_rsv->freed[idx] > 0) {
4376 block_rsv_add_bytes(block_rsv,
4377 block_rsv->freed[idx], 0);
4378 block_rsv->freed[idx] = 0;
4379 }
4380 if (atomic_read(&block_rsv->usage) == 0) {
4381 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4382
4383 if (block_rsv->freed[0] == 0 &&
4384 block_rsv->freed[1] == 0) {
4385 list_del_init(&block_rsv->list);
4386 kfree(block_rsv);
4387 }
4388 } else {
4389 btrfs_block_rsv_release(root, block_rsv, 0);
4390 }
4391 }
4392 mutex_unlock(&fs_info->durable_block_rsv_mutex);
4393
4394 return 0; 4500 return 0;
4395} 4501}
4396 4502
@@ -4668,7 +4774,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4668 struct extent_buffer *buf, 4774 struct extent_buffer *buf,
4669 u64 parent, int last_ref) 4775 u64 parent, int last_ref)
4670{ 4776{
4671 struct btrfs_block_rsv *block_rsv;
4672 struct btrfs_block_group_cache *cache = NULL; 4777 struct btrfs_block_group_cache *cache = NULL;
4673 int ret; 4778 int ret;
4674 4779
@@ -4683,64 +4788,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4683 if (!last_ref) 4788 if (!last_ref)
4684 return; 4789 return;
4685 4790
4686 block_rsv = get_block_rsv(trans, root);
4687 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 4791 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4688 if (block_rsv->space_info != cache->space_info)
4689 goto out;
4690 4792
4691 if (btrfs_header_generation(buf) == trans->transid) { 4793 if (btrfs_header_generation(buf) == trans->transid) {
4692 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4794 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4693 ret = check_ref_cleanup(trans, root, buf->start); 4795 ret = check_ref_cleanup(trans, root, buf->start);
4694 if (!ret) 4796 if (!ret)
4695 goto pin; 4797 goto out;
4696 } 4798 }
4697 4799
4698 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 4800 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4699 pin_down_extent(root, cache, buf->start, buf->len, 1); 4801 pin_down_extent(root, cache, buf->start, buf->len, 1);
4700 goto pin; 4802 goto out;
4701 } 4803 }
4702 4804
4703 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4805 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4704 4806
4705 btrfs_add_free_space(cache, buf->start, buf->len); 4807 btrfs_add_free_space(cache, buf->start, buf->len);
4706 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); 4808 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4707 if (ret == -EAGAIN) {
4708 /* block group became read-only */
4709 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4710 goto out;
4711 }
4712
4713 ret = 1;
4714 spin_lock(&block_rsv->lock);
4715 if (block_rsv->reserved < block_rsv->size) {
4716 block_rsv->reserved += buf->len;
4717 ret = 0;
4718 }
4719 spin_unlock(&block_rsv->lock);
4720
4721 if (ret) {
4722 spin_lock(&cache->space_info->lock);
4723 cache->space_info->bytes_reserved -= buf->len;
4724 cache->space_info->reservation_progress++;
4725 spin_unlock(&cache->space_info->lock);
4726 }
4727 goto out;
4728 }
4729pin:
4730 if (block_rsv->durable && !cache->ro) {
4731 ret = 0;
4732 spin_lock(&cache->lock);
4733 if (!cache->ro) {
4734 cache->reserved_pinned += buf->len;
4735 ret = 1;
4736 }
4737 spin_unlock(&cache->lock);
4738
4739 if (ret) {
4740 spin_lock(&block_rsv->lock);
4741 block_rsv->freed[trans->transid & 0x1] += buf->len;
4742 spin_unlock(&block_rsv->lock);
4743 }
4744 } 4809 }
4745out: 4810out:
4746 /* 4811 /*
@@ -4883,6 +4948,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4883 int last_ptr_loop = 0; 4948 int last_ptr_loop = 0;
4884 int loop = 0; 4949 int loop = 0;
4885 int index = 0; 4950 int index = 0;
4951 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
4952 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
4886 bool found_uncached_bg = false; 4953 bool found_uncached_bg = false;
4887 bool failed_cluster_refill = false; 4954 bool failed_cluster_refill = false;
4888 bool failed_alloc = false; 4955 bool failed_alloc = false;
@@ -5202,8 +5269,8 @@ checks:
5202 search_start - offset); 5269 search_start - offset);
5203 BUG_ON(offset > search_start); 5270 BUG_ON(offset > search_start);
5204 5271
5205 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, 5272 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
5206 (data & BTRFS_BLOCK_GROUP_DATA)); 5273 alloc_type);
5207 if (ret == -EAGAIN) { 5274 if (ret == -EAGAIN) {
5208 btrfs_add_free_space(block_group, offset, num_bytes); 5275 btrfs_add_free_space(block_group, offset, num_bytes);
5209 goto loop; 5276 goto loop;
@@ -5325,7 +5392,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5325 int index = 0; 5392 int index = 0;
5326 5393
5327 spin_lock(&info->lock); 5394 spin_lock(&info->lock);
5328 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5395 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5396 (unsigned long long)info->flags,
5329 (unsigned long long)(info->total_bytes - info->bytes_used - 5397 (unsigned long long)(info->total_bytes - info->bytes_used -
5330 info->bytes_pinned - info->bytes_reserved - 5398 info->bytes_pinned - info->bytes_reserved -
5331 info->bytes_readonly), 5399 info->bytes_readonly),
@@ -5427,7 +5495,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5427 ret = btrfs_discard_extent(root, start, len, NULL); 5495 ret = btrfs_discard_extent(root, start, len, NULL);
5428 5496
5429 btrfs_add_free_space(cache, start, len); 5497 btrfs_add_free_space(cache, start, len);
5430 btrfs_update_reserved_bytes(cache, len, 0, 1); 5498 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5431 btrfs_put_block_group(cache); 5499 btrfs_put_block_group(cache);
5432 5500
5433 trace_btrfs_reserved_extent_free(root, start, len); 5501 trace_btrfs_reserved_extent_free(root, start, len);
@@ -5630,7 +5698,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5630 put_caching_control(caching_ctl); 5698 put_caching_control(caching_ctl);
5631 } 5699 }
5632 5700
5633 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); 5701 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5702 RESERVE_ALLOC_NO_ACCOUNT);
5634 BUG_ON(ret); 5703 BUG_ON(ret);
5635 btrfs_put_block_group(block_group); 5704 btrfs_put_block_group(block_group);
5636 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5705 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5687,8 +5756,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5687 block_rsv = get_block_rsv(trans, root); 5756 block_rsv = get_block_rsv(trans, root);
5688 5757
5689 if (block_rsv->size == 0) { 5758 if (block_rsv->size == 0) {
5690 ret = reserve_metadata_bytes(trans, root, block_rsv, 5759 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5691 blocksize, 0);
5692 /* 5760 /*
5693 * If we couldn't reserve metadata bytes try and use some from 5761 * If we couldn't reserve metadata bytes try and use some from
5694 * the global reserve. 5762 * the global reserve.
@@ -5709,12 +5777,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5709 return block_rsv; 5777 return block_rsv;
5710 if (ret) { 5778 if (ret) {
5711 WARN_ON(1); 5779 WARN_ON(1);
5712 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 5780 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5713 0);
5714 if (!ret) { 5781 if (!ret) {
5715 spin_lock(&block_rsv->lock);
5716 block_rsv->size += blocksize;
5717 spin_unlock(&block_rsv->lock);
5718 return block_rsv; 5782 return block_rsv;
5719 } else if (ret && block_rsv != global_rsv) { 5783 } else if (ret && block_rsv != global_rsv) {
5720 ret = block_rsv_use_bytes(global_rsv, blocksize); 5784 ret = block_rsv_use_bytes(global_rsv, blocksize);
@@ -6592,12 +6656,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6592 cache->bytes_super - btrfs_block_group_used(&cache->item); 6656 cache->bytes_super - btrfs_block_group_used(&cache->item);
6593 6657
6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6658 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6595 sinfo->bytes_may_use + sinfo->bytes_readonly + 6659 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <= 6660 min_allocable_bytes <= sinfo->total_bytes) {
6597 sinfo->total_bytes) {
6598 sinfo->bytes_readonly += num_bytes; 6661 sinfo->bytes_readonly += num_bytes;
6599 sinfo->bytes_reserved += cache->reserved_pinned;
6600 cache->reserved_pinned = 0;
6601 cache->ro = 1; 6662 cache->ro = 1;
6602 ret = 0; 6663 ret = 0;
6603 } 6664 }
@@ -6964,7 +7025,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6964 struct btrfs_space_info, 7025 struct btrfs_space_info,
6965 list); 7026 list);
6966 if (space_info->bytes_pinned > 0 || 7027 if (space_info->bytes_pinned > 0 ||
6967 space_info->bytes_reserved > 0) { 7028 space_info->bytes_reserved > 0 ||
7029 space_info->bytes_may_use > 0) {
6968 WARN_ON(1); 7030 WARN_ON(1);
6969 dump_space_info(space_info, 0, 0); 7031 dump_space_info(space_info, 0, 0);
6970 } 7032 }
@@ -7007,13 +7069,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7007 path->reada = 1; 7069 path->reada = 1;
7008 7070
7009 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 7071 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
7010 if (cache_gen != 0 && 7072 if (btrfs_test_opt(root, SPACE_CACHE) &&
7011 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) 7073 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
7012 need_clear = 1; 7074 need_clear = 1;
7013 if (btrfs_test_opt(root, CLEAR_CACHE)) 7075 if (btrfs_test_opt(root, CLEAR_CACHE))
7014 need_clear = 1; 7076 need_clear = 1;
7015 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
7016 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7017 7077
7018 while (1) { 7078 while (1) {
7019 ret = find_first_block_group(root, path, &key); 7079 ret = find_first_block_group(root, path, &key);
@@ -7268,7 +7328,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7268 spin_unlock(&block_group->lock); 7328 spin_unlock(&block_group->lock);
7269 } 7329 }
7270 /* One for our lookup ref */ 7330 /* One for our lookup ref */
7271 iput(inode); 7331 btrfs_add_delayed_iput(inode);
7272 } 7332 }
7273 7333
7274 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 7334 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f1..0ada0b700b44 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -894,6 +894,194 @@ search_again:
894 goto again; 894 goto again;
895} 895}
896 896
897/**
898 * convert_extent_bit - convert all bits in a given range from one set to another
899 * @tree: the io tree to search
900 * @start: the start offset in bytes
901 * @end: the end offset in bytes (inclusive)
902 * @bits: the bits to set in this range
903 * @clear_bits: the bits to clear in this range
904 * @mask: the allocation mask
905 *
906 * This will go through and set bits for the given range. If any states exist
907 * already in this range they are set with the given bit and cleared of the
908 * clear_bits. This is only meant to be used by things that are mergeable, ie
909 * converting from say DELALLOC to DIRTY. This is not meant to be used with
910 * boundary bits like LOCK.
911 */
912int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
913 int bits, int clear_bits, gfp_t mask)
914{
915 struct extent_state *state;
916 struct extent_state *prealloc = NULL;
917 struct rb_node *node;
918 int err = 0;
919 u64 last_start;
920 u64 last_end;
921
922again:
923 if (!prealloc && (mask & __GFP_WAIT)) {
924 prealloc = alloc_extent_state(mask);
925 if (!prealloc)
926 return -ENOMEM;
927 }
928
929 spin_lock(&tree->lock);
930 /*
931 * this search will find all the extents that end after
932 * our range starts.
933 */
934 node = tree_search(tree, start);
935 if (!node) {
936 prealloc = alloc_extent_state_atomic(prealloc);
937 if (!prealloc)
938 return -ENOMEM;
939 err = insert_state(tree, prealloc, start, end, &bits);
940 prealloc = NULL;
941 BUG_ON(err == -EEXIST);
942 goto out;
943 }
944 state = rb_entry(node, struct extent_state, rb_node);
945hit_next:
946 last_start = state->start;
947 last_end = state->end;
948
949 /*
950 * | ---- desired range ---- |
951 * | state |
952 *
953 * Just lock what we found and keep going
954 */
955 if (state->start == start && state->end <= end) {
956 struct rb_node *next_node;
957
958 set_state_bits(tree, state, &bits);
959 clear_state_bit(tree, state, &clear_bits, 0);
960
961 merge_state(tree, state);
962 if (last_end == (u64)-1)
963 goto out;
964
965 start = last_end + 1;
966 next_node = rb_next(&state->rb_node);
967 if (next_node && start < end && prealloc && !need_resched()) {
968 state = rb_entry(next_node, struct extent_state,
969 rb_node);
970 if (state->start == start)
971 goto hit_next;
972 }
973 goto search_again;
974 }
975
976 /*
977 * | ---- desired range ---- |
978 * | state |
979 * or
980 * | ------------- state -------------- |
981 *
982 * We need to split the extent we found, and may flip bits on
983 * second half.
984 *
985 * If the extent we found extends past our
986 * range, we just split and search again. It'll get split
987 * again the next time though.
988 *
989 * If the extent we found is inside our range, we set the
990 * desired bit on it.
991 */
992 if (state->start < start) {
993 prealloc = alloc_extent_state_atomic(prealloc);
994 if (!prealloc)
995 return -ENOMEM;
996 err = split_state(tree, state, prealloc, start);
997 BUG_ON(err == -EEXIST);
998 prealloc = NULL;
999 if (err)
1000 goto out;
1001 if (state->end <= end) {
1002 set_state_bits(tree, state, &bits);
1003 clear_state_bit(tree, state, &clear_bits, 0);
1004 merge_state(tree, state);
1005 if (last_end == (u64)-1)
1006 goto out;
1007 start = last_end + 1;
1008 }
1009 goto search_again;
1010 }
1011 /*
1012 * | ---- desired range ---- |
1013 * | state | or | state |
1014 *
1015 * There's a hole, we need to insert something in it and
1016 * ignore the extent we found.
1017 */
1018 if (state->start > start) {
1019 u64 this_end;
1020 if (end < last_start)
1021 this_end = end;
1022 else
1023 this_end = last_start - 1;
1024
1025 prealloc = alloc_extent_state_atomic(prealloc);
1026 if (!prealloc)
1027 return -ENOMEM;
1028
1029 /*
1030 * Avoid to free 'prealloc' if it can be merged with
1031 * the later extent.
1032 */
1033 err = insert_state(tree, prealloc, start, this_end,
1034 &bits);
1035 BUG_ON(err == -EEXIST);
1036 if (err) {
1037 free_extent_state(prealloc);
1038 prealloc = NULL;
1039 goto out;
1040 }
1041 prealloc = NULL;
1042 start = this_end + 1;
1043 goto search_again;
1044 }
1045 /*
1046 * | ---- desired range ---- |
1047 * | state |
1048 * We need to split the extent, and set the bit
1049 * on the first half
1050 */
1051 if (state->start <= end && state->end > end) {
1052 prealloc = alloc_extent_state_atomic(prealloc);
1053 if (!prealloc)
1054 return -ENOMEM;
1055
1056 err = split_state(tree, state, prealloc, end + 1);
1057 BUG_ON(err == -EEXIST);
1058
1059 set_state_bits(tree, prealloc, &bits);
1060 clear_state_bit(tree, prealloc, &clear_bits, 0);
1061
1062 merge_state(tree, prealloc);
1063 prealloc = NULL;
1064 goto out;
1065 }
1066
1067 goto search_again;
1068
1069out:
1070 spin_unlock(&tree->lock);
1071 if (prealloc)
1072 free_extent_state(prealloc);
1073
1074 return err;
1075
1076search_again:
1077 if (start > end)
1078 goto out;
1079 spin_unlock(&tree->lock);
1080 if (mask & __GFP_WAIT)
1081 cond_resched();
1082 goto again;
1083}
1084
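
convert_extent_bit() gives callers an atomic trade of one mergeable bit for
another in a single tree walk. A hypothetical call site (the EXTENT_* flags
and the inode io_tree are real; this exact usage is illustrative only):

	/* Trade DELALLOC for DIRTY on [start, end] in one pass, instead of
	 * a clear + set pair that would briefly leave neither bit set. */
	ret = convert_extent_bit(&BTRFS_I(inode)->io_tree, start, end,
				 EXTENT_DIRTY, EXTENT_DELALLOC, GFP_NOFS);
	if (ret)	/* only -ENOMEM is possible */
		return ret;
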
897/* wrappers around set/clear extent bit */ 1085/* wrappers around set/clear extent bit */
898int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1086int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
899 gfp_t mask) 1087 gfp_t mask)
@@ -2136,6 +2324,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 int compressed; 2324 int compressed;
2137 int write_flags; 2325 int write_flags;
2138 unsigned long nr_written = 0; 2326 unsigned long nr_written = 0;
2327 bool fill_delalloc = true;
2139 2328
2140 if (wbc->sync_mode == WB_SYNC_ALL) 2329 if (wbc->sync_mode == WB_SYNC_ALL)
2141 write_flags = WRITE_SYNC; 2330 write_flags = WRITE_SYNC;
@@ -2166,10 +2355,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2166 2355
2167 set_page_extent_mapped(page); 2356 set_page_extent_mapped(page);
2168 2357
2358 if (!tree->ops || !tree->ops->fill_delalloc)
2359 fill_delalloc = false;
2360
2169 delalloc_start = start; 2361 delalloc_start = start;
2170 delalloc_end = 0; 2362 delalloc_end = 0;
2171 page_started = 0; 2363 page_started = 0;
2172 if (!epd->extent_locked) { 2364 if (!epd->extent_locked && fill_delalloc) {
2173 u64 delalloc_to_write = 0; 2365 u64 delalloc_to_write = 0;
2174 /* 2366 /*
2175 * make sure the wbc mapping index is at least updated 2367 * make sure the wbc mapping index is at least updated
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e7929..325a346369da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,7 @@
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 21#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 22#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 23
@@ -214,6 +215,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
214 gfp_t mask); 215 gfp_t mask);
215int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 216int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
216 gfp_t mask); 217 gfp_t mask);
218int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
219 int bits, int clear_bits, gfp_t mask);
217int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 220int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
218 struct extent_state **cached_state, gfp_t mask); 221 struct extent_state **cached_state, gfp_t mask);
219int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 222int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4e57d59edb7..f2e928289600 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1069 int i; 1069 int i;
1070 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1070 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1071 struct inode *inode = fdentry(file)->d_inode; 1071 struct inode *inode = fdentry(file)->d_inode;
1072 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1072 int err = 0; 1073 int err = 0;
1073 int faili = 0; 1074 int faili = 0;
1074 u64 start_pos; 1075 u64 start_pos;
@@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1080again: 1081again:
1081 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1082 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1083 GFP_NOFS); 1084 mask);
1084 if (!pages[i]) { 1085 if (!pages[i]) {
1085 faili = i - 1; 1086 faili = i - 1;
1086 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1615,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1615 goto out; 1616 goto out;
1616 } 1617 }
1617 1618
1618 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1619 if (ret)
1620 goto out;
1621
1622 locked_end = alloc_end - 1; 1619 locked_end = alloc_end - 1;
1623 while (1) { 1620 while (1) {
1624 struct btrfs_ordered_extent *ordered; 1621 struct btrfs_ordered_extent *ordered;
@@ -1664,11 +1661,27 @@ static long btrfs_fallocate(struct file *file, int mode,
1664 if (em->block_start == EXTENT_MAP_HOLE || 1661 if (em->block_start == EXTENT_MAP_HOLE ||
1665 (cur_offset >= inode->i_size && 1662 (cur_offset >= inode->i_size &&
1666 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1663 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1664
1665 /*
1666 * Make sure we have enough space before we do the
1667 * allocation.
1668 */
1669 ret = btrfs_check_data_free_space(inode, last_byte -
1670 cur_offset);
1671 if (ret) {
1672 free_extent_map(em);
1673 break;
1674 }
1675
1667 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 1676 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1668 last_byte - cur_offset, 1677 last_byte - cur_offset,
1669 1 << inode->i_blkbits, 1678 1 << inode->i_blkbits,
1670 offset + len, 1679 offset + len,
1671 &alloc_hint); 1680 &alloc_hint);
1681
1682 /* Let go of our reservation. */
1683 btrfs_free_reserved_data_space(inode, last_byte -
1684 cur_offset);
1672 if (ret < 0) { 1685 if (ret < 0) {
1673 free_extent_map(em); 1686 free_extent_map(em);
1674 break; 1687 break;
@@ -1694,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1694 } 1707 }
1695 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 1708 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1696 &cached_state, GFP_NOFS); 1709 &cached_state, GFP_NOFS);
1697
1698 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1699out: 1710out:
1700 mutex_unlock(&inode->i_mutex); 1711 mutex_unlock(&inode->i_mutex);
1701 return ret; 1712 return ret;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d0..de205d59b74b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/math64.h> 22#include <linux/math64.h>
23#include <linux/ratelimit.h>
23#include "ctree.h" 24#include "ctree.h"
24#include "free-space-cache.h" 25#include "free-space-cache.h"
25#include "transaction.h" 26#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
84 *block_group, struct btrfs_path *path) 85 *block_group, struct btrfs_path *path)
85{ 86{
86 struct inode *inode = NULL; 87 struct inode *inode = NULL;
88 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
87 89
88 spin_lock(&block_group->lock); 90 spin_lock(&block_group->lock);
89 if (block_group->inode) 91 if (block_group->inode)
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 100 return inode;
99 101
100 spin_lock(&block_group->lock); 102 spin_lock(&block_group->lock);
101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { 103 if ((BTRFS_I(inode)->flags & flags) != flags) {
102 printk(KERN_INFO "Old style space inode found, converting.\n"); 104 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; 105 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
106 BTRFS_INODE_NODATACOW;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR; 107 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 } 108 }
106 109
107 if (!btrfs_fs_closing(root->fs_info)) { 110 if (!block_group->iref) {
108 block_group->inode = igrab(inode); 111 block_group->inode = igrab(inode);
109 block_group->iref = 1; 112 block_group->iref = 1;
110 } 113 }
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
122 struct btrfs_free_space_header *header; 125 struct btrfs_free_space_header *header;
123 struct btrfs_inode_item *inode_item; 126 struct btrfs_inode_item *inode_item;
124 struct extent_buffer *leaf; 127 struct extent_buffer *leaf;
128 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
125 int ret; 129 int ret;
126 130
127 ret = btrfs_insert_empty_inode(trans, root, path, ino); 131 ret = btrfs_insert_empty_inode(trans, root, path, ino);
128 if (ret) 132 if (ret)
129 return ret; 133 return ret;
130 134
135 /* We inline crcs for the on-disk free space cache */
136 if (ino != BTRFS_FREE_INO_OBJECTID)
137 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
138
131 leaf = path->nodes[0]; 139 leaf = path->nodes[0];
132 inode_item = btrfs_item_ptr(leaf, path->slots[0], 140 inode_item = btrfs_item_ptr(leaf, path->slots[0],
133 struct btrfs_inode_item); 141 struct btrfs_inode_item);
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
140 btrfs_set_inode_uid(leaf, inode_item, 0); 148 btrfs_set_inode_uid(leaf, inode_item, 0);
141 btrfs_set_inode_gid(leaf, inode_item, 0); 149 btrfs_set_inode_gid(leaf, inode_item, 0);
142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 150 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 151 btrfs_set_inode_flags(leaf, inode_item, flags);
144 BTRFS_INODE_PREALLOC);
145 btrfs_set_inode_nlink(leaf, inode_item, 1); 152 btrfs_set_inode_nlink(leaf, inode_item, 1);
146 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 153 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
147 btrfs_set_inode_block_group(leaf, inode_item, offset); 154 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -196,9 +203,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
196 203
197 rsv = trans->block_rsv; 204 rsv = trans->block_rsv;
198 trans->block_rsv = root->orphan_block_rsv; 205 trans->block_rsv = root->orphan_block_rsv;
199 ret = btrfs_block_rsv_check(trans, root, 206 ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 5);
200 root->orphan_block_rsv,
201 0, 5);
202 if (ret) 207 if (ret)
203 return ret; 208 return ret;
204 209
@@ -242,26 +247,342 @@ static int readahead_cache(struct inode *inode)
242 return 0; 247 return 0;
243} 248}
244 249
250struct io_ctl {
251 void *cur, *orig;
252 struct page *page;
253 struct page **pages;
254 struct btrfs_root *root;
255 unsigned long size;
256 int index;
257 int num_pages;
258 unsigned check_crcs:1;
259};
260
261static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
262 struct btrfs_root *root)
263{
264 memset(io_ctl, 0, sizeof(struct io_ctl));
265 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
266 PAGE_CACHE_SHIFT;
267 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
268 GFP_NOFS);
269 if (!io_ctl->pages)
270 return -ENOMEM;
271 io_ctl->root = root;
272 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
273 io_ctl->check_crcs = 1;
274 return 0;
275}
276
277static void io_ctl_free(struct io_ctl *io_ctl)
278{
279 kfree(io_ctl->pages);
280}
281
282static void io_ctl_unmap_page(struct io_ctl *io_ctl)
283{
284 if (io_ctl->cur) {
285 kunmap(io_ctl->page);
286 io_ctl->cur = NULL;
287 io_ctl->orig = NULL;
288 }
289}
290
291static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
292{
293 WARN_ON(io_ctl->cur);
294 BUG_ON(io_ctl->index >= io_ctl->num_pages);
295 io_ctl->page = io_ctl->pages[io_ctl->index++];
296 io_ctl->cur = kmap(io_ctl->page);
297 io_ctl->orig = io_ctl->cur;
298 io_ctl->size = PAGE_CACHE_SIZE;
299 if (clear)
300 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
301}
302
303static void io_ctl_drop_pages(struct io_ctl *io_ctl)
304{
305 int i;
306
307 io_ctl_unmap_page(io_ctl);
308
309 for (i = 0; i < io_ctl->num_pages; i++) {
310 ClearPageChecked(io_ctl->pages[i]);
311 unlock_page(io_ctl->pages[i]);
312 page_cache_release(io_ctl->pages[i]);
313 }
314}
315
316static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
317 int uptodate)
318{
319 struct page *page;
320 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
321 int i;
322
323 for (i = 0; i < io_ctl->num_pages; i++) {
324 page = find_or_create_page(inode->i_mapping, i, mask);
325 if (!page) {
326 io_ctl_drop_pages(io_ctl);
327 return -ENOMEM;
328 }
329 io_ctl->pages[i] = page;
330 if (uptodate && !PageUptodate(page)) {
331 btrfs_readpage(NULL, page);
332 lock_page(page);
333 if (!PageUptodate(page)) {
334 printk(KERN_ERR "btrfs: error reading free "
335 "space cache\n");
336 io_ctl_drop_pages(io_ctl);
337 return -EIO;
338 }
339 }
340 }
341
342 return 0;
343}
344
345static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
346{
347 u64 *val;
348
349 io_ctl_map_page(io_ctl, 1);
350
351 /*
352 * Skip the csum areas. If we don't check crcs then we just have a
353 * 64bit chunk at the front of the first page.
354 */
355 if (io_ctl->check_crcs) {
356 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
357 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
358 } else {
359 io_ctl->cur += sizeof(u64);
360 io_ctl->size -= sizeof(u64) * 2;
361 }
362
363 val = io_ctl->cur;
364 *val = cpu_to_le64(generation);
365 io_ctl->cur += sizeof(u64);
366}
367
368static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
369{
370 u64 *gen;
371
372 /*
373 * Skip the crc area. If we don't check crcs then we just have a 64bit
374 * chunk at the front of the first page.
375 */
376 if (io_ctl->check_crcs) {
377 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
378 io_ctl->size -= sizeof(u64) +
379 (sizeof(u32) * io_ctl->num_pages);
380 } else {
381 io_ctl->cur += sizeof(u64);
382 io_ctl->size -= sizeof(u64) * 2;
383 }
384
385 gen = io_ctl->cur;
386 if (le64_to_cpu(*gen) != generation) {
387 printk_ratelimited(KERN_ERR "btrfs: space cache generation "
388 "(%Lu) does not match inode (%Lu)\n", *gen,
389 generation);
390 io_ctl_unmap_page(io_ctl);
391 return -EIO;
392 }
393 io_ctl->cur += sizeof(u64);
394 return 0;
395}
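
Both generation helpers skip the same header region, which fixes the layout
of page 0: with inline crcs it is a table of one u32 crc per cache page
followed by the u64 generation; in the old format it is a placeholder crc
slot followed by the generation. A small sketch derived from the arithmetic
above:

	/* Byte offset of the first entry on page 0 (sketch). */
	static unsigned first_entry_offset(int num_pages, int check_crcs)
	{
		if (check_crcs)	/* crc table, then the generation */
			return sizeof(u32) * num_pages + sizeof(u64);
		/* old format: placeholder crc slot, then the generation */
		return sizeof(u64) * 2;
	}
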
396
397static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
398{
399 u32 *tmp;
400 u32 crc = ~(u32)0;
401 unsigned offset = 0;
402
403 if (!io_ctl->check_crcs) {
404 io_ctl_unmap_page(io_ctl);
405 return;
406 }
407
408 if (index == 0)
409 offset = sizeof(u32) * io_ctl->num_pages;
410
411 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
412 PAGE_CACHE_SIZE - offset);
413 btrfs_csum_final(crc, (char *)&crc);
414 io_ctl_unmap_page(io_ctl);
415 tmp = kmap(io_ctl->pages[0]);
416 tmp += index;
417 *tmp = crc;
418 kunmap(io_ctl->pages[0]);
419}
420
421static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
422{
423 u32 *tmp, val;
424 u32 crc = ~(u32)0;
425 unsigned offset = 0;
426
427 if (!io_ctl->check_crcs) {
428 io_ctl_map_page(io_ctl, 0);
429 return 0;
430 }
431
432 if (index == 0)
433 offset = sizeof(u32) * io_ctl->num_pages;
434
435 tmp = kmap(io_ctl->pages[0]);
436 tmp += index;
437 val = *tmp;
438 kunmap(io_ctl->pages[0]);
439
440 io_ctl_map_page(io_ctl, 0);
441 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
442 PAGE_CACHE_SIZE - offset);
443 btrfs_csum_final(crc, (char *)&crc);
444 if (val != crc) {
445 printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
446 "space cache\n");
447 io_ctl_unmap_page(io_ctl);
448 return -EIO;
449 }
450
451 return 0;
452}
453
454static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
455 void *bitmap)
456{
457 struct btrfs_free_space_entry *entry;
458
459 if (!io_ctl->cur)
460 return -ENOSPC;
461
462 entry = io_ctl->cur;
463 entry->offset = cpu_to_le64(offset);
464 entry->bytes = cpu_to_le64(bytes);
465 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
466 BTRFS_FREE_SPACE_EXTENT;
467 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
468 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
469
470 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
471 return 0;
472
473 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
474
475 /* No more pages to map */
476 if (io_ctl->index >= io_ctl->num_pages)
477 return 0;
478
479 /* map the next page */
480 io_ctl_map_page(io_ctl, 1);
481 return 0;
482}
483
484static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
485{
486 if (!io_ctl->cur)
487 return -ENOSPC;
488
489 /*
490 * If we aren't at the start of the current page, unmap this one and
491 * map the next one if any are left.
492 */
493 if (io_ctl->cur != io_ctl->orig) {
494 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
495 if (io_ctl->index >= io_ctl->num_pages)
496 return -ENOSPC;
497 io_ctl_map_page(io_ctl, 0);
498 }
499
500 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
501 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
502 if (io_ctl->index < io_ctl->num_pages)
503 io_ctl_map_page(io_ctl, 0);
504 return 0;
505}
506
507static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
508{
509 /*
510 * If we're not on the boundary we know we've modified the page and we
511 * need to crc the page.
512 */
513 if (io_ctl->cur != io_ctl->orig)
514 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
515 else
516 io_ctl_unmap_page(io_ctl);
517
518 while (io_ctl->index < io_ctl->num_pages) {
519 io_ctl_map_page(io_ctl, 1);
520 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
521 }
522}
523
524static int io_ctl_read_entry(struct io_ctl *io_ctl,
525 struct btrfs_free_space *entry, u8 *type)
526{
527 struct btrfs_free_space_entry *e;
528
529 e = io_ctl->cur;
530 entry->offset = le64_to_cpu(e->offset);
531 entry->bytes = le64_to_cpu(e->bytes);
532 *type = e->type;
533 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
534 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
535
536 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
537 return 0;
538
539 io_ctl_unmap_page(io_ctl);
540
541 if (io_ctl->index >= io_ctl->num_pages)
542 return 0;
543
544 return io_ctl_check_crc(io_ctl, io_ctl->index);
545}
546
547static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
548 struct btrfs_free_space *entry)
549{
550 int ret;
551
552 if (io_ctl->cur && io_ctl->cur != io_ctl->orig)
553 io_ctl_unmap_page(io_ctl);
554
555 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
556 if (ret)
557 return ret;
558
559 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
560 io_ctl_unmap_page(io_ctl);
561
562 return 0;
563}
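
Given that header, the payload capacity follows directly. A rough upper
bound on plain extent entries per cache file (a sketch that assumes 4 KiB
pages and the packed 17-byte on-disk entry of __le64 offset, __le64 bytes
and u8 type):

	#define SKETCH_PAGE_SIZE	4096UL
	#define SKETCH_ENTRY_SIZE	17UL	/* 8 + 8 + 1, packed */

	static unsigned long max_extent_entries(unsigned long num_pages)
	{
		/* page 0 loses the crc table and the generation */
		unsigned long first = SKETCH_PAGE_SIZE -
			(sizeof(u32) * num_pages + sizeof(u64));

		return first / SKETCH_ENTRY_SIZE +
		       (num_pages - 1) * (SKETCH_PAGE_SIZE / SKETCH_ENTRY_SIZE);
	}
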
564
245int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 565int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
246 struct btrfs_free_space_ctl *ctl, 566 struct btrfs_free_space_ctl *ctl,
247 struct btrfs_path *path, u64 offset) 567 struct btrfs_path *path, u64 offset)
248{ 568{
249 struct btrfs_free_space_header *header; 569 struct btrfs_free_space_header *header;
250 struct extent_buffer *leaf; 570 struct extent_buffer *leaf;
251 struct page *page; 571 struct io_ctl io_ctl;
252 struct btrfs_key key; 572 struct btrfs_key key;
573 struct btrfs_free_space *e, *n;
253 struct list_head bitmaps; 574 struct list_head bitmaps;
254 u64 num_entries; 575 u64 num_entries;
255 u64 num_bitmaps; 576 u64 num_bitmaps;
256 u64 generation; 577 u64 generation;
257 pgoff_t index = 0; 578 u8 type;
258 int ret = 0; 579 int ret = 0;
259 580
260 INIT_LIST_HEAD(&bitmaps); 581 INIT_LIST_HEAD(&bitmaps);
261 582
262 /* Nothing in the space cache, goodbye */ 583 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) 584 if (!i_size_read(inode))
264 goto out; 585 return 0;
265 586
266 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 587 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
267 key.offset = offset; 588 key.offset = offset;
@@ -269,11 +590,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
269 590
270 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 591 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
271 if (ret < 0) 592 if (ret < 0)
272 goto out; 593 return 0;
273 else if (ret > 0) { 594 else if (ret > 0) {
274 btrfs_release_path(path); 595 btrfs_release_path(path);
275 ret = 0; 596 return 0;
276 goto out;
277 } 597 }
278 598
279 ret = -1; 599 ret = -1;
@@ -291,169 +611,100 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
291 " not match free space cache generation (%llu)\n", 611 " not match free space cache generation (%llu)\n",
292 (unsigned long long)BTRFS_I(inode)->generation, 612 (unsigned long long)BTRFS_I(inode)->generation,
293 (unsigned long long)generation); 613 (unsigned long long)generation);
294 goto out; 614 return 0;
295 } 615 }
296 616
297 if (!num_entries) 617 if (!num_entries)
298 goto out; 618 return 0;
299 619
620 io_ctl_init(&io_ctl, inode, root);
300 ret = readahead_cache(inode); 621 ret = readahead_cache(inode);
301 if (ret) 622 if (ret)
302 goto out; 623 goto out;
303 624
304 while (1) { 625 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
305 struct btrfs_free_space_entry *entry; 626 if (ret)
306 struct btrfs_free_space *e; 627 goto out;
307 void *addr;
308 unsigned long offset = 0;
309 int need_loop = 0;
310 628
311 if (!num_entries && !num_bitmaps) 629 ret = io_ctl_check_crc(&io_ctl, 0);
312 break; 630 if (ret)
631 goto free_cache;
313 632
314 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 633 ret = io_ctl_check_generation(&io_ctl, generation);
315 if (!page) 634 if (ret)
635 goto free_cache;
636
637 while (num_entries) {
638 e = kmem_cache_zalloc(btrfs_free_space_cachep,
639 GFP_NOFS);
640 if (!e)
316 goto free_cache; 641 goto free_cache;
317 642
318 if (!PageUptodate(page)) { 643 ret = io_ctl_read_entry(&io_ctl, e, &type);
319 btrfs_readpage(NULL, page); 644 if (ret) {
320 lock_page(page); 645 kmem_cache_free(btrfs_free_space_cachep, e);
321 if (!PageUptodate(page)) { 646 goto free_cache;
322 unlock_page(page);
323 page_cache_release(page);
324 printk(KERN_ERR "btrfs: error reading free "
325 "space cache\n");
326 goto free_cache;
327 }
328 } 647 }
329 addr = kmap(page);
330 648
331 if (index == 0) { 649 if (!e->bytes) {
332 u64 *gen; 650 kmem_cache_free(btrfs_free_space_cachep, e);
651 goto free_cache;
652 }
333 653
334 /* 654 if (type == BTRFS_FREE_SPACE_EXTENT) {
335 * We put a bogus crc in the front of the first page in 655 spin_lock(&ctl->tree_lock);
336 * case old kernels try to mount a fs with the new 656 ret = link_free_space(ctl, e);
337 * format to make sure they discard the cache. 657 spin_unlock(&ctl->tree_lock);
338 */ 658 if (ret) {
339 addr += sizeof(u64); 659 printk(KERN_ERR "Duplicate entries in "
340 offset += sizeof(u64); 660 "free space cache, dumping\n");
341 661 kmem_cache_free(btrfs_free_space_cachep, e);
342 gen = addr;
343 if (*gen != BTRFS_I(inode)->generation) {
344 printk(KERN_ERR "btrfs: space cache generation"
345 " (%llu) does not match inode (%llu)\n",
346 (unsigned long long)*gen,
347 (unsigned long long)
348 BTRFS_I(inode)->generation);
349 kunmap(page);
350 unlock_page(page);
351 page_cache_release(page);
352 goto free_cache; 662 goto free_cache;
353 } 663 }
354 addr += sizeof(u64); 664 } else {
355 offset += sizeof(u64); 665 BUG_ON(!num_bitmaps);
356 } 666 num_bitmaps--;
357 entry = addr; 667 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
358 668 if (!e->bitmap) {
359 while (1) { 669 kmem_cache_free(
360 if (!num_entries) 670 btrfs_free_space_cachep, e);
361 break;
362
363 need_loop = 1;
364 e = kmem_cache_zalloc(btrfs_free_space_cachep,
365 GFP_NOFS);
366 if (!e) {
367 kunmap(page);
368 unlock_page(page);
369 page_cache_release(page);
370 goto free_cache; 671 goto free_cache;
371 } 672 }
372 673 spin_lock(&ctl->tree_lock);
373 e->offset = le64_to_cpu(entry->offset); 674 ret = link_free_space(ctl, e);
374 e->bytes = le64_to_cpu(entry->bytes); 675 ctl->total_bitmaps++;
375 if (!e->bytes) { 676 ctl->op->recalc_thresholds(ctl);
376 kunmap(page); 677 spin_unlock(&ctl->tree_lock);
678 if (ret) {
679 printk(KERN_ERR "Duplicate entries in "
680 "free space cache, dumping\n");
377 kmem_cache_free(btrfs_free_space_cachep, e); 681 kmem_cache_free(btrfs_free_space_cachep, e);
378 unlock_page(page);
379 page_cache_release(page);
380 goto free_cache; 682 goto free_cache;
381 } 683 }
382 684 list_add_tail(&e->list, &bitmaps);
383 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
384 spin_lock(&ctl->tree_lock);
385 ret = link_free_space(ctl, e);
386 spin_unlock(&ctl->tree_lock);
387 if (ret) {
388 printk(KERN_ERR "Duplicate entries in "
389 "free space cache, dumping\n");
390 kunmap(page);
391 unlock_page(page);
392 page_cache_release(page);
393 goto free_cache;
394 }
395 } else {
396 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
397 if (!e->bitmap) {
398 kunmap(page);
399 kmem_cache_free(
400 btrfs_free_space_cachep, e);
401 unlock_page(page);
402 page_cache_release(page);
403 goto free_cache;
404 }
405 spin_lock(&ctl->tree_lock);
406 ret = link_free_space(ctl, e);
407 ctl->total_bitmaps++;
408 ctl->op->recalc_thresholds(ctl);
409 spin_unlock(&ctl->tree_lock);
410 if (ret) {
411 printk(KERN_ERR "Duplicate entries in "
412 "free space cache, dumping\n");
413 kunmap(page);
414 unlock_page(page);
415 page_cache_release(page);
416 goto free_cache;
417 }
418 list_add_tail(&e->list, &bitmaps);
419 }
420
421 num_entries--;
422 offset += sizeof(struct btrfs_free_space_entry);
423 if (offset + sizeof(struct btrfs_free_space_entry) >=
424 PAGE_CACHE_SIZE)
425 break;
426 entry++;
427 } 685 }
428 686
429 /* 687 num_entries--;
430 * We read an entry out of this page, we need to move on to the 688 }
431 * next page.
432 */
433 if (need_loop) {
434 kunmap(page);
435 goto next;
436 }
437 689
438 /* 690 /*
439 * We add the bitmaps at the end of the entries in order that 691 * We add the bitmaps at the end of the entries in order that
440 * the bitmap entries are added to the cache. 692 * the bitmap entries are added to the cache.
441 */ 693 */
442 e = list_entry(bitmaps.next, struct btrfs_free_space, list); 694 list_for_each_entry_safe(e, n, &bitmaps, list) {
443 list_del_init(&e->list); 695 list_del_init(&e->list);
444 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 696 ret = io_ctl_read_bitmap(&io_ctl, e);
445 kunmap(page); 697 if (ret)
446 num_bitmaps--; 698 goto free_cache;
447next:
448 unlock_page(page);
449 page_cache_release(page);
450 index++;
451 } 699 }
452 700
701 io_ctl_drop_pages(&io_ctl);
453 ret = 1; 702 ret = 1;
454out: 703out:
704 io_ctl_free(&io_ctl);
455 return ret; 705 return ret;
456free_cache: 706free_cache:
707 io_ctl_drop_pages(&io_ctl);
457 __btrfs_remove_free_space_cache(ctl); 708 __btrfs_remove_free_space_cache(ctl);
458 goto out; 709 goto out;
459} 710}
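
The loader is now strictly ordered: crc-check page 0, verify the
generation, stream the fixed-size entries, then read each queued bitmap
last, since a bitmap consumes a whole page of its own. Condensed sketch of
that order (error handling and list linkage trimmed; names as in this
patch):

	io_ctl_init(&io_ctl, inode, root);
	io_ctl_prepare_pages(&io_ctl, inode, 1);	/* read and lock pages */
	io_ctl_check_crc(&io_ctl, 0);			/* page 0 first */
	io_ctl_check_generation(&io_ctl, generation);
	while (num_entries--)
		io_ctl_read_entry(&io_ctl, e, &type);	/* extents + bitmap stubs */
	list_for_each_entry_safe(e, n, &bitmaps, list)
		io_ctl_read_bitmap(&io_ctl, e);		/* one full page each */
	io_ctl_drop_pages(&io_ctl);
	io_ctl_free(&io_ctl);
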
@@ -465,7 +716,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
465 struct btrfs_root *root = fs_info->tree_root; 716 struct btrfs_root *root = fs_info->tree_root;
466 struct inode *inode; 717 struct inode *inode;
467 struct btrfs_path *path; 718 struct btrfs_path *path;
468 int ret; 719 int ret = 0;
469 bool matched; 720 bool matched;
470 u64 used = btrfs_block_group_used(&block_group->item); 721 u64 used = btrfs_block_group_used(&block_group->item);
471 722
@@ -497,6 +748,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
497 return 0; 748 return 0;
498 } 749 }
499 750
751 /* We may have converted the inode and made the cache invalid. */
752 spin_lock(&block_group->lock);
753 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
754 spin_unlock(&block_group->lock);
755 goto out;
756 }
757 spin_unlock(&block_group->lock);
758
500 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 759 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
501 path, block_group->key.objectid); 760 path, block_group->key.objectid);
502 btrfs_free_path(path); 761 btrfs_free_path(path);
@@ -530,6 +789,19 @@ out:
530 return ret; 789 return ret;
531} 790}
532 791
792/**
793 * __btrfs_write_out_cache - write out cached info to an inode
794 * @root: the root the inode belongs to
795 * @ctl: the free space cache we are going to write out
796 * @block_group: the block_group for this cache if it belongs to a block_group
797 * @trans: the trans handle
798 * @path: the path to use
799 * @offset: the offset for the key we'll insert
800 *
801 * This function writes out a free space cache struct to disk for quick recovery
802 * on mount. This will return 0 if it was successfull in writing the cache out,
803 * and -1 if it was not.
804 */
533int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 805int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
534 struct btrfs_free_space_ctl *ctl, 806 struct btrfs_free_space_ctl *ctl,
535 struct btrfs_block_group_cache *block_group, 807 struct btrfs_block_group_cache *block_group,
@@ -540,42 +812,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
540 struct extent_buffer *leaf; 812 struct extent_buffer *leaf;
541 struct rb_node *node; 813 struct rb_node *node;
542 struct list_head *pos, *n; 814 struct list_head *pos, *n;
543 struct page **pages;
544 struct page *page;
545 struct extent_state *cached_state = NULL; 815 struct extent_state *cached_state = NULL;
546 struct btrfs_free_cluster *cluster = NULL; 816 struct btrfs_free_cluster *cluster = NULL;
547 struct extent_io_tree *unpin = NULL; 817 struct extent_io_tree *unpin = NULL;
818 struct io_ctl io_ctl;
548 struct list_head bitmap_list; 819 struct list_head bitmap_list;
549 struct btrfs_key key; 820 struct btrfs_key key;
550 u64 start, end, len; 821 u64 start, end, len;
551 u64 bytes = 0;
552 u32 crc = ~(u32)0;
553 int index = 0, num_pages = 0;
554 int entries = 0; 822 int entries = 0;
555 int bitmaps = 0; 823 int bitmaps = 0;
556 int ret = -1; 824 int ret;
557 bool next_page = false; 825 int err = -1;
558 bool out_of_space = false;
559 826
560 INIT_LIST_HEAD(&bitmap_list); 827 INIT_LIST_HEAD(&bitmap_list);
561 828
562 node = rb_first(&ctl->free_space_offset);
563 if (!node)
564 return 0;
565
566 if (!i_size_read(inode)) 829 if (!i_size_read(inode))
567 return -1; 830 return -1;
568 831
569 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 832 io_ctl_init(&io_ctl, inode, root);
570 PAGE_CACHE_SHIFT;
571
572 filemap_write_and_wait(inode->i_mapping);
573 btrfs_wait_ordered_range(inode, inode->i_size &
574 ~(root->sectorsize - 1), (u64)-1);
575
576 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
577 if (!pages)
578 return -1;
579 833
580 /* Get the cluster for this block_group if it exists */ 834 /* Get the cluster for this block_group if it exists */
581 if (block_group && !list_empty(&block_group->cluster_list)) 835 if (block_group && !list_empty(&block_group->cluster_list))
@@ -589,30 +843,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
589 */ 843 */
590 unpin = root->fs_info->pinned_extents; 844 unpin = root->fs_info->pinned_extents;
591 845
592 /* 846 /* Lock all pages first so we can lock the extent safely. */
593 * Lock all pages first so we can lock the extent safely. 847 io_ctl_prepare_pages(&io_ctl, inode, 0);
594 *
595 * NOTE: Because we hold the ref the entire time we're going to write to
596 * the page find_get_page should never fail, so we don't do a check
597 * after find_get_page at this point. Just putting this here so people
598 * know and don't freak out.
599 */
600 while (index < num_pages) {
601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
602 if (!page) {
603 int i;
604
605 for (i = 0; i < num_pages; i++) {
606 unlock_page(pages[i]);
607 page_cache_release(pages[i]);
608 }
609 goto out;
610 }
611 pages[index] = page;
612 index++;
613 }
614 848
615 index = 0;
616 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 849 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
617 0, &cached_state, GFP_NOFS); 850 0, &cached_state, GFP_NOFS);
618 851
@@ -623,189 +856,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
623 if (block_group) 856 if (block_group)
624 start = block_group->key.objectid; 857 start = block_group->key.objectid;
625 858
626 /* Write out the extent entries */ 859 node = rb_first(&ctl->free_space_offset);
627 do { 860 if (!node && cluster) {
628 struct btrfs_free_space_entry *entry; 861 node = rb_first(&cluster->root);
629 void *addr, *orig; 862 cluster = NULL;
630 unsigned long offset = 0; 863 }
631 864
632 next_page = false; 865 /* Make sure we can fit our crcs into the first page */
866 if (io_ctl.check_crcs &&
867 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
868 WARN_ON(1);
869 goto out_nospc;
870 }
633 871
634 if (index >= num_pages) { 872 io_ctl_set_generation(&io_ctl, trans->transid);
635 out_of_space = true;
636 break;
637 }
638 873
639 page = pages[index]; 874 /* Write out the extent entries */
875 while (node) {
876 struct btrfs_free_space *e;
640 877
641 orig = addr = kmap(page); 878 e = rb_entry(node, struct btrfs_free_space, offset_index);
642 if (index == 0) { 879 entries++;
643 u64 *gen;
644 880
645 /* 881 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
646 * We're going to put in a bogus crc for this page to 882 e->bitmap);
647 * make sure that old kernels who aren't aware of this 883 if (ret)
648 * format will be sure to discard the cache. 884 goto out_nospc;
649 */
650 addr += sizeof(u64);
651 offset += sizeof(u64);
652 885
653 gen = addr; 886 if (e->bitmap) {
654 *gen = trans->transid; 887 list_add_tail(&e->list, &bitmap_list);
655 addr += sizeof(u64); 888 bitmaps++;
656 offset += sizeof(u64);
657 } 889 }
658 entry = addr; 890 node = rb_next(node);
659 891 if (!node && cluster) {
660 memset(addr, 0, PAGE_CACHE_SIZE - offset); 892 node = rb_first(&cluster->root);
661 while (node && !next_page) { 893 cluster = NULL;
662 struct btrfs_free_space *e;
663
664 e = rb_entry(node, struct btrfs_free_space, offset_index);
665 entries++;
666
667 entry->offset = cpu_to_le64(e->offset);
668 entry->bytes = cpu_to_le64(e->bytes);
669 if (e->bitmap) {
670 entry->type = BTRFS_FREE_SPACE_BITMAP;
671 list_add_tail(&e->list, &bitmap_list);
672 bitmaps++;
673 } else {
674 entry->type = BTRFS_FREE_SPACE_EXTENT;
675 }
676 node = rb_next(node);
677 if (!node && cluster) {
678 node = rb_first(&cluster->root);
679 cluster = NULL;
680 }
681 offset += sizeof(struct btrfs_free_space_entry);
682 if (offset + sizeof(struct btrfs_free_space_entry) >=
683 PAGE_CACHE_SIZE)
684 next_page = true;
685 entry++;
686 } 894 }
895 }
687 896
688 /* 897 /*
689 * We want to add any pinned extents to our free space cache 898 * We want to add any pinned extents to our free space cache
690 * so we don't leak the space 899 * so we don't leak the space
691 */ 900 */
692 while (block_group && !next_page && 901 while (block_group && (start < block_group->key.objectid +
693 (start < block_group->key.objectid + 902 block_group->key.offset)) {
694 block_group->key.offset)) { 903 ret = find_first_extent_bit(unpin, start, &start, &end,
695 ret = find_first_extent_bit(unpin, start, &start, &end, 904 EXTENT_DIRTY);
696 EXTENT_DIRTY); 905 if (ret) {
697 if (ret) { 906 ret = 0;
698 ret = 0; 907 break;
699 break;
700 }
701
702 /* This pinned extent is out of our range */
703 if (start >= block_group->key.objectid +
704 block_group->key.offset)
705 break;
706
707 len = block_group->key.objectid +
708 block_group->key.offset - start;
709 len = min(len, end + 1 - start);
710
711 entries++;
712 entry->offset = cpu_to_le64(start);
713 entry->bytes = cpu_to_le64(len);
714 entry->type = BTRFS_FREE_SPACE_EXTENT;
715
716 start = end + 1;
717 offset += sizeof(struct btrfs_free_space_entry);
718 if (offset + sizeof(struct btrfs_free_space_entry) >=
719 PAGE_CACHE_SIZE)
720 next_page = true;
721 entry++;
722 } 908 }
723 909
724 /* Generate bogus crc value */ 910 /* This pinned extent is out of our range */
725 if (index == 0) { 911 if (start >= block_group->key.objectid +
726 u32 *tmp; 912 block_group->key.offset)
727 crc = btrfs_csum_data(root, orig + sizeof(u64), crc, 913 break;
728 PAGE_CACHE_SIZE - sizeof(u64));
729 btrfs_csum_final(crc, (char *)&crc);
730 crc++;
731 tmp = orig;
732 *tmp = crc;
733 }
734 914
735 kunmap(page); 915 len = block_group->key.objectid +
916 block_group->key.offset - start;
917 len = min(len, end + 1 - start);
736 918
737 bytes += PAGE_CACHE_SIZE; 919 entries++;
920 ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
921 if (ret)
922 goto out_nospc;
738 923
739 index++; 924 start = end + 1;
740 } while (node || next_page); 925 }
741 926
742 /* Write out the bitmaps */ 927 /* Write out the bitmaps */
743 list_for_each_safe(pos, n, &bitmap_list) { 928 list_for_each_safe(pos, n, &bitmap_list) {
744 void *addr;
745 struct btrfs_free_space *entry = 929 struct btrfs_free_space *entry =
746 list_entry(pos, struct btrfs_free_space, list); 930 list_entry(pos, struct btrfs_free_space, list);
747 931
748 if (index >= num_pages) { 932 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
749 out_of_space = true; 933 if (ret)
750 break; 934 goto out_nospc;
751 }
752 page = pages[index];
753
754 addr = kmap(page);
755 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
756 kunmap(page);
757 bytes += PAGE_CACHE_SIZE;
758
759 list_del_init(&entry->list); 935 list_del_init(&entry->list);
760 index++;
761 }
762
763 if (out_of_space) {
764 btrfs_drop_pages(pages, num_pages);
765 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
766 i_size_read(inode) - 1, &cached_state,
767 GFP_NOFS);
768 ret = 0;
769 goto out;
770 } 936 }
771 937
772 /* Zero out the rest of the pages just to make sure */ 938 /* Zero out the rest of the pages just to make sure */
773 while (index < num_pages) { 939 io_ctl_zero_remaining_pages(&io_ctl);
774 void *addr;
775 940
776 page = pages[index]; 941 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
777 addr = kmap(page); 942 0, i_size_read(inode), &cached_state);
778 memset(addr, 0, PAGE_CACHE_SIZE); 943 io_ctl_drop_pages(&io_ctl);
779 kunmap(page);
780 bytes += PAGE_CACHE_SIZE;
781 index++;
782 }
783
784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
785 bytes, &cached_state);
786 btrfs_drop_pages(pages, num_pages);
787 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 944 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
788 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 945 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
789 946
790 if (ret) { 947 if (ret)
791 ret = 0;
792 goto out; 948 goto out;
793 }
794 949
795 BTRFS_I(inode)->generation = trans->transid;
796 950
797 filemap_write_and_wait(inode->i_mapping); 951 ret = filemap_write_and_wait(inode->i_mapping);
952 if (ret)
953 goto out;
798 954
799 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 955 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
800 key.offset = offset; 956 key.offset = offset;
801 key.type = 0; 957 key.type = 0;
802 958
803 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 959 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
804 if (ret < 0) { 960 if (ret < 0) {
805 ret = -1; 961 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 962 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
807 EXTENT_DIRTY | EXTENT_DELALLOC | 963 GFP_NOFS);
808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
809 goto out; 964 goto out;
810 } 965 }
811 leaf = path->nodes[0]; 966 leaf = path->nodes[0];
@@ -816,15 +971,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
816 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 971 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
817 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 972 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
818 found_key.offset != offset) { 973 found_key.offset != offset) {
819 ret = -1; 974 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
820 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 975 inode->i_size - 1,
821 EXTENT_DIRTY | EXTENT_DELALLOC | 976 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
822 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 977 NULL, GFP_NOFS);
823 GFP_NOFS);
824 btrfs_release_path(path); 978 btrfs_release_path(path);
825 goto out; 979 goto out;
826 } 980 }
827 } 981 }
982
983 BTRFS_I(inode)->generation = trans->transid;
828 header = btrfs_item_ptr(leaf, path->slots[0], 984 header = btrfs_item_ptr(leaf, path->slots[0],
829 struct btrfs_free_space_header); 985 struct btrfs_free_space_header);
830 btrfs_set_free_space_entries(leaf, header, entries); 986 btrfs_set_free_space_entries(leaf, header, entries);
@@ -833,16 +989,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 btrfs_mark_buffer_dirty(leaf); 989 btrfs_mark_buffer_dirty(leaf);
834 btrfs_release_path(path); 990 btrfs_release_path(path);
835 991
836 ret = 1; 992 err = 0;
837
838out: 993out:
839 kfree(pages); 994 io_ctl_free(&io_ctl);
840 if (ret != 1) { 995 if (err) {
841 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 996 invalidate_inode_pages2(inode->i_mapping);
842 BTRFS_I(inode)->generation = 0; 997 BTRFS_I(inode)->generation = 0;
843 } 998 }
844 btrfs_update_inode(trans, root, inode); 999 btrfs_update_inode(trans, root, inode);
845 return ret; 1000 return err;
1001
1002out_nospc:
1003 list_for_each_safe(pos, n, &bitmap_list) {
1004 struct btrfs_free_space *entry =
1005 list_entry(pos, struct btrfs_free_space, list);
1006 list_del_init(&entry->list);
1007 }
1008 io_ctl_drop_pages(&io_ctl);
1009 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1010 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1011 goto out;
846} 1012}
847 1013
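The rewrite above funnels all page handling through the io_ctl helpers (io_ctl_add_entry, io_ctl_add_bitmap, io_ctl_zero_remaining_pages, io_ctl_drop_pages) in place of the open-coded kmap/memcpy loops it deletes. A minimal userspace sketch of the packing those helpers imply, assuming 4 KiB pages and the free space cache entry layout visible above (u64 offset, u64 bytes, u8 type); the names mirror the patch, the bodies are illustrative only:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE_MODEL 4096u

struct free_space_entry {               /* mirrors btrfs_free_space_entry */
	uint64_t offset;
	uint64_t bytes;
	uint8_t  type;                  /* 1 == extent, 2 == bitmap */
} __attribute__((packed));

struct io_ctl {                         /* toy stand-in for the patch's io_ctl */
	uint8_t *pages;                 /* flat buffer in place of a page array */
	unsigned num_pages;
	unsigned index;                 /* current page */
	unsigned offset;                /* offset within the current page */
};

/* Returns -1 (the ENOSPC case that jumps to out_nospc) when full. */
static int io_ctl_add_entry(struct io_ctl *io, uint64_t start, uint64_t len)
{
	struct free_space_entry e = { start, len, 1 };

	if (io->offset + sizeof(e) > PAGE_SIZE_MODEL) { /* spill to next page */
		io->index++;
		io->offset = 0;
	}
	if (io->index >= io->num_pages)
		return -1;
	memcpy(io->pages + io->index * PAGE_SIZE_MODEL + io->offset, &e, sizeof(e));
	io->offset += sizeof(e);
	return 0;
}

static void io_ctl_zero_remaining_pages(struct io_ctl *io)
{
	size_t used = (size_t)io->index * PAGE_SIZE_MODEL + io->offset;

	memset(io->pages + used, 0, (size_t)io->num_pages * PAGE_SIZE_MODEL - used);
}

int main(void)
{
	static uint8_t buf[2 * PAGE_SIZE_MODEL];
	struct io_ctl io = { buf, 2, 0, 0 };

	if (io_ctl_add_entry(&io, 4096, 8192))
		return 1;
	io_ctl_zero_remaining_pages(&io);
	printf("page %u, offset %u\n", io.index, io.offset);
	return 0;
}

The page-full test that used to sit inline (offset + sizeof(entry) >= PAGE_CACHE_SIZE in the removed lines) is exactly the spill check such a helper has to make before each copy.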
848int btrfs_write_out_cache(struct btrfs_root *root, 1014int btrfs_write_out_cache(struct btrfs_root *root,
@@ -869,14 +1035,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
869 1035
870 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1036 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
871 path, block_group->key.objectid); 1037 path, block_group->key.objectid);
872 if (ret < 0) { 1038 if (ret) {
873 spin_lock(&block_group->lock); 1039 spin_lock(&block_group->lock);
874 block_group->disk_cache_state = BTRFS_DC_ERROR; 1040 block_group->disk_cache_state = BTRFS_DC_ERROR;
875 spin_unlock(&block_group->lock); 1041 spin_unlock(&block_group->lock);
876 ret = 0; 1042 ret = 0;
877 1043#ifdef DEBUG
 878 printk(KERN_ERR "btrfs: failed to write free space cache " 1044 printk(KERN_ERR "btrfs: failed to write free space cache "
879 "for block group %llu\n", block_group->key.objectid); 1045 "for block group %llu\n", block_group->key.objectid);
1046#endif
880 } 1047 }
881 1048
882 iput(inode); 1049 iput(inode);
@@ -2472,9 +2639,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2472 spin_unlock(&ctl->tree_lock); 2639 spin_unlock(&ctl->tree_lock);
2473 2640
2474 if (bytes >= minlen) { 2641 if (bytes >= minlen) {
2475 int update_ret; 2642 struct btrfs_space_info *space_info;
2476 update_ret = btrfs_update_reserved_bytes(block_group, 2643 int update = 0;
2477 bytes, 1, 1); 2644
2645 space_info = block_group->space_info;
2646 spin_lock(&space_info->lock);
2647 spin_lock(&block_group->lock);
2648 if (!block_group->ro) {
2649 block_group->reserved += bytes;
2650 space_info->bytes_reserved += bytes;
2651 update = 1;
2652 }
2653 spin_unlock(&block_group->lock);
2654 spin_unlock(&space_info->lock);
2478 2655
2479 ret = btrfs_error_discard_extent(fs_info->extent_root, 2656 ret = btrfs_error_discard_extent(fs_info->extent_root,
2480 start, 2657 start,
@@ -2482,9 +2659,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2482 &actually_trimmed); 2659 &actually_trimmed);
2483 2660
2484 btrfs_add_free_space(block_group, start, bytes); 2661 btrfs_add_free_space(block_group, start, bytes);
2485 if (!update_ret) 2662 if (update) {
2486 btrfs_update_reserved_bytes(block_group, 2663 spin_lock(&space_info->lock);
2487 bytes, 0, 1); 2664 spin_lock(&block_group->lock);
2665 if (block_group->ro)
2666 space_info->bytes_readonly += bytes;
2667 block_group->reserved -= bytes;
2668 space_info->bytes_reserved -= bytes;
2669 spin_unlock(&space_info->lock);
2670 spin_unlock(&block_group->lock);
2671 }
2488 2672
2489 if (ret) 2673 if (ret)
2490 break; 2674 break;
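With btrfs_update_reserved_bytes() gone, the trim path pins its range by hand: take space_info->lock, then block_group->lock, bump both reserved counters only while the group is writable, and hand the same bytes back after the discard, crediting bytes_readonly if the group went read-only in between. A compilable toy of that two-lock accounting, with pthread mutexes standing in for the kernel spinlocks:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct space_info  { pthread_mutex_t lock; uint64_t bytes_reserved; uint64_t bytes_readonly; };
struct block_group { pthread_mutex_t lock; struct space_info *space_info; uint64_t reserved; int ro; };

/* Outer lock first (space_info), inner lock second (block_group), both ways. */
static int reserve_for_trim(struct block_group *bg, uint64_t bytes)
{
	int update = 0;

	pthread_mutex_lock(&bg->space_info->lock);
	pthread_mutex_lock(&bg->lock);
	if (!bg->ro) {                       /* read-only groups are left alone */
		bg->reserved += bytes;
		bg->space_info->bytes_reserved += bytes;
		update = 1;
	}
	pthread_mutex_unlock(&bg->lock);
	pthread_mutex_unlock(&bg->space_info->lock);
	return update;
}

static void unreserve_after_trim(struct block_group *bg, uint64_t bytes)
{
	pthread_mutex_lock(&bg->space_info->lock);
	pthread_mutex_lock(&bg->lock);
	if (bg->ro)                          /* group went read-only meanwhile */
		bg->space_info->bytes_readonly += bytes;
	bg->reserved -= bytes;
	bg->space_info->bytes_reserved -= bytes;
	pthread_mutex_unlock(&bg->lock);
	pthread_mutex_unlock(&bg->space_info->lock);
}

int main(void)
{
	struct space_info si = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
	struct block_group bg = { PTHREAD_MUTEX_INITIALIZER, &si, 0, 0 };

	if (reserve_for_trim(&bg, 4096))
		unreserve_after_trim(&bg, 4096);
	printf("reserved=%llu\n", (unsigned long long)si.bytes_reserved);
	return 0;
}

Lock order matters here: both call sites in the hunk take space_info->lock before block_group->lock, so the sketch does the same.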
@@ -2643,9 +2827,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
2643 return 0; 2827 return 0;
2644 2828
2645 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 2829 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2646 if (ret < 0) 2830 if (ret) {
2831 btrfs_delalloc_release_metadata(inode, inode->i_size);
2832#ifdef DEBUG
2647 printk(KERN_ERR "btrfs: failed to write free ino cache " 2833 printk(KERN_ERR "btrfs: failed to write free ino cache "
2648 "for root %llu\n", root->root_key.objectid); 2834 "for root %llu\n", root->root_key.objectid);
2835#endif
2836 }
2649 2837
2650 iput(inode); 2838 iput(inode);
2651 return ret; 2839 return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa871..53dcbdf446cd 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -465,14 +465,16 @@ again:
465 /* Just to make sure we have enough space */ 465 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE; 466 prealloc += 8 * PAGE_CACHE_SIZE;
467 467
468 ret = btrfs_check_data_free_space(inode, prealloc); 468 ret = btrfs_delalloc_reserve_space(inode, prealloc);
469 if (ret) 469 if (ret)
470 goto out_put; 470 goto out_put;
471 471
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint); 473 prealloc, prealloc, &alloc_hint);
474 if (ret) 474 if (ret) {
475 btrfs_delalloc_release_space(inode, prealloc);
475 goto out_put; 476 goto out_put;
477 }
476 btrfs_free_reserved_data_space(inode, prealloc); 478 btrfs_free_reserved_data_space(inode, prealloc);
477 479
478out_put: 480out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b2d004ad66a0..f12747c9447b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1792,12 +1792,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1792 } 1792 }
1793 ret = 0; 1793 ret = 0;
1794out: 1794out:
1795 if (nolock) { 1795 if (root != root->fs_info->tree_root)
1796 if (trans)
1797 btrfs_end_transaction_nolock(trans, root);
1798 } else {
1799 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1796 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1800 if (trans) 1797 if (trans) {
1798 if (nolock)
1799 btrfs_end_transaction_nolock(trans, root);
1800 else
1801 btrfs_end_transaction(trans, root); 1801 btrfs_end_transaction(trans, root);
1802 } 1802 }
1803 1803
@@ -2079,89 +2079,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2079 up_read(&root->fs_info->cleanup_work_sem); 2079 up_read(&root->fs_info->cleanup_work_sem);
2080} 2080}
2081 2081
2082/*
2083 * calculate extra metadata reservation when snapshotting a subvolume
2084 * that contains orphan files.
2085 */
2086void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2087 struct btrfs_pending_snapshot *pending,
2088 u64 *bytes_to_reserve)
2089{
2090 struct btrfs_root *root;
2091 struct btrfs_block_rsv *block_rsv;
2092 u64 num_bytes;
2093 int index;
2094
2095 root = pending->root;
2096 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2097 return;
2098
2099 block_rsv = root->orphan_block_rsv;
2100
2101 /* orphan block reservation for the snapshot */
2102 num_bytes = block_rsv->size;
2103
2104 /*
2105 * after the snapshot is created, COWing tree blocks may use more
2106 * space than it frees. So we should make sure there is enough
2107 * reserved space.
2108 */
2109 index = trans->transid & 0x1;
2110 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2111 num_bytes += block_rsv->size -
2112 (block_rsv->reserved + block_rsv->freed[index]);
2113 }
2114
2115 *bytes_to_reserve += num_bytes;
2116}
2117
2118void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2119 struct btrfs_pending_snapshot *pending)
2120{
2121 struct btrfs_root *root = pending->root;
2122 struct btrfs_root *snap = pending->snap;
2123 struct btrfs_block_rsv *block_rsv;
2124 u64 num_bytes;
2125 int index;
2126 int ret;
2127
2128 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2129 return;
2130
2131 /* refill source subvolume's orphan block reservation */
2132 block_rsv = root->orphan_block_rsv;
2133 index = trans->transid & 0x1;
2134 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2135 num_bytes = block_rsv->size -
2136 (block_rsv->reserved + block_rsv->freed[index]);
2137 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2138 root->orphan_block_rsv,
2139 num_bytes);
2140 BUG_ON(ret);
2141 }
2142
2143 /* setup orphan block reservation for the snapshot */
2144 block_rsv = btrfs_alloc_block_rsv(snap);
2145 BUG_ON(!block_rsv);
2146
2147 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2148 snap->orphan_block_rsv = block_rsv;
2149
2150 num_bytes = root->orphan_block_rsv->size;
2151 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2152 block_rsv, num_bytes);
2153 BUG_ON(ret);
2154
2155#if 0
2156 /* insert orphan item for the snapshot */
2157 WARN_ON(!root->orphan_item_inserted);
2158 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2159 snap->root_key.objectid);
2160 BUG_ON(ret);
2161 snap->orphan_item_inserted = 1;
2162#endif
2163}
2164
2165enum btrfs_orphan_cleanup_state { 2082enum btrfs_orphan_cleanup_state {
2166 ORPHAN_CLEANUP_STARTED = 1, 2083 ORPHAN_CLEANUP_STARTED = 1,
2167 ORPHAN_CLEANUP_DONE = 2, 2084 ORPHAN_CLEANUP_DONE = 2,
@@ -2247,9 +2164,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2247 } 2164 }
2248 spin_unlock(&root->orphan_lock); 2165 spin_unlock(&root->orphan_lock);
2249 2166
2250 if (block_rsv)
2251 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2252
2253 /* grab metadata reservation from transaction handle */ 2167 /* grab metadata reservation from transaction handle */
2254 if (reserve) { 2168 if (reserve) {
2255 ret = btrfs_orphan_reserve_metadata(trans, inode); 2169 ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2316,6 +2230,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2316 struct btrfs_key key, found_key; 2230 struct btrfs_key key, found_key;
2317 struct btrfs_trans_handle *trans; 2231 struct btrfs_trans_handle *trans;
2318 struct inode *inode; 2232 struct inode *inode;
2233 u64 last_objectid = 0;
2319 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2234 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2320 2235
2321 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2236 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2367,41 +2282,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2367 * crossing root thing. we store the inode number in the 2282 * crossing root thing. we store the inode number in the
2368 * offset of the orphan item. 2283 * offset of the orphan item.
2369 */ 2284 */
2285
2286 if (found_key.offset == last_objectid) {
2287 printk(KERN_ERR "btrfs: Error removing orphan entry, "
2288 "stopping orphan cleanup\n");
2289 ret = -EINVAL;
2290 goto out;
2291 }
2292
2293 last_objectid = found_key.offset;
2294
2370 found_key.objectid = found_key.offset; 2295 found_key.objectid = found_key.offset;
2371 found_key.type = BTRFS_INODE_ITEM_KEY; 2296 found_key.type = BTRFS_INODE_ITEM_KEY;
2372 found_key.offset = 0; 2297 found_key.offset = 0;
2373 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2298 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2374 if (IS_ERR(inode)) { 2299 ret = PTR_RET(inode);
2375 ret = PTR_ERR(inode); 2300 if (ret && ret != -ESTALE)
2376 goto out; 2301 goto out;
2377 }
2378
2379 /*
2380 * add this inode to the orphan list so btrfs_orphan_del does
2381 * the proper thing when we hit it
2382 */
2383 spin_lock(&root->orphan_lock);
2384 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2385 spin_unlock(&root->orphan_lock);
2386 2302
2387 /* 2303 /*
2388 * if this is a bad inode, means we actually succeeded in 2304 * Inode is already gone but the orphan item is still there,
2389 * removing the inode, but not the orphan record, which means 2305 * kill the orphan item.
2390 * we need to manually delete the orphan since iput will just
2391 * do a destroy_inode
2392 */ 2306 */
2393 if (is_bad_inode(inode)) { 2307 if (ret == -ESTALE) {
2394 trans = btrfs_start_transaction(root, 0); 2308 trans = btrfs_start_transaction(root, 1);
2395 if (IS_ERR(trans)) { 2309 if (IS_ERR(trans)) {
2396 ret = PTR_ERR(trans); 2310 ret = PTR_ERR(trans);
2397 goto out; 2311 goto out;
2398 } 2312 }
2399 btrfs_orphan_del(trans, inode); 2313 ret = btrfs_del_orphan_item(trans, root,
2314 found_key.objectid);
2315 BUG_ON(ret);
2400 btrfs_end_transaction(trans, root); 2316 btrfs_end_transaction(trans, root);
2401 iput(inode);
2402 continue; 2317 continue;
2403 } 2318 }
2404 2319
2320 /*
2321 * add this inode to the orphan list so btrfs_orphan_del does
2322 * the proper thing when we hit it
2323 */
2324 spin_lock(&root->orphan_lock);
2325 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2326 spin_unlock(&root->orphan_lock);
2327
2405 /* if we have links, this was a truncate, let's do that */ 2328 /* if we have links, this was a truncate, let's do that */
2406 if (inode->i_nlink) { 2329 if (inode->i_nlink) {
2407 if (!S_ISREG(inode->i_mode)) { 2330 if (!S_ISREG(inode->i_mode)) {
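The new last_objectid check is a plain forward-progress guard: if the search keeps returning the same orphan offset, deleting it is not working and another pass would loop forever, so cleanup bails with -EINVAL. The guard reduced to its essentials:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Toy iterator: returns the next orphan "offset", or 0 when done. A broken
 * backend that keeps returning the same key models the failure the patch
 * defends against. */
static uint64_t next_orphan(void)
{
	return 42;		/* stuck: same item every time */
}

static int orphan_cleanup(void)
{
	uint64_t last_objectid = 0;
	uint64_t off;

	while ((off = next_orphan()) != 0) {
		if (off == last_objectid) {
			fprintf(stderr, "error removing orphan entry, stopping\n");
			return -EINVAL;	/* no forward progress */
		}
		last_objectid = off;
		/* ... delete the orphan item, iput the inode, etc ... */
	}
	return 0;
}

int main(void) { return orphan_cleanup() ? 1 : 0; }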
@@ -2835,7 +2758,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2835 u64 ino = btrfs_ino(inode); 2758 u64 ino = btrfs_ino(inode);
2836 u64 dir_ino = btrfs_ino(dir); 2759 u64 dir_ino = btrfs_ino(dir);
2837 2760
2838 trans = btrfs_start_transaction(root, 10); 2761 /*
2762 * 1 for the possible orphan item
2763 * 1 for the dir item
2764 * 1 for the dir index
2765 * 1 for the inode ref
2766 * 1 for the inode ref in the tree log
2767 * 2 for the dir entries in the log
2768 * 1 for the inode
2769 */
2770 trans = btrfs_start_transaction(root, 8);
2839 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2771 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2840 return trans; 2772 return trans;
2841 2773
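The eight-item comment above carries the whole calculation: a transaction reservation is a worst-case per-item metadata cost times the number of tree items the operation may dirty. A back-of-envelope version; the per-item formula below is an illustration, not btrfs's actual btrfs_calc_trans_metadata_size:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: assume each touched item may COW one path of
 * `levels` nodes of `nodesize` bytes. The real kernel helper folds in
 * splits as well; this just shows the shape of the estimate. */
static uint64_t calc_trans_metadata_size(uint64_t nodesize,
					 unsigned levels, unsigned num_items)
{
	return nodesize * levels * num_items;
}

int main(void)
{
	/* 8 items for unlink: orphan, dir item, dir index, inode ref,
	 * log inode ref, 2 log dir entries, the inode itself. */
	printf("unlink reservation ~ %llu bytes\n",
	       (unsigned long long)calc_trans_metadata_size(4096, 8, 8));
	return 0;
}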
@@ -2858,7 +2790,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2858 return ERR_PTR(-ENOMEM); 2790 return ERR_PTR(-ENOMEM);
2859 } 2791 }
2860 2792
2861 trans = btrfs_start_transaction(root, 0); 2793 /* 1 for the orphan item */
2794 trans = btrfs_start_transaction(root, 1);
2862 if (IS_ERR(trans)) { 2795 if (IS_ERR(trans)) {
2863 btrfs_free_path(path); 2796 btrfs_free_path(path);
2864 root->fs_info->enospc_unlink = 0; 2797 root->fs_info->enospc_unlink = 0;
@@ -2963,6 +2896,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2963 err = 0; 2896 err = 0;
2964out: 2897out:
2965 btrfs_free_path(path); 2898 btrfs_free_path(path);
2899 /* Migrate the orphan reservation over */
2900 if (!err)
2901 err = btrfs_block_rsv_migrate(trans->block_rsv,
2902 &root->fs_info->global_block_rsv,
2903 btrfs_calc_trans_metadata_size(root, 1));
2904
2966 if (err) { 2905 if (err) {
2967 btrfs_end_transaction(trans, root); 2906 btrfs_end_transaction(trans, root);
2968 root->fs_info->enospc_unlink = 0; 2907 root->fs_info->enospc_unlink = 0;
@@ -3368,6 +3307,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3368 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3307 pgoff_t index = from >> PAGE_CACHE_SHIFT;
3369 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3308 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3370 struct page *page; 3309 struct page *page;
3310 gfp_t mask = btrfs_alloc_write_mask(mapping);
3371 int ret = 0; 3311 int ret = 0;
3372 u64 page_start; 3312 u64 page_start;
3373 u64 page_end; 3313 u64 page_end;
@@ -3380,7 +3320,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3380 3320
3381 ret = -ENOMEM; 3321 ret = -ENOMEM;
3382again: 3322again:
3383 page = find_or_create_page(mapping, index, GFP_NOFS); 3323 page = find_or_create_page(mapping, index, mask);
3384 if (!page) { 3324 if (!page) {
3385 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3325 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3386 goto out; 3326 goto out;
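This hunk, like the ioctl.c and relocation.c ones below, swaps a hard-coded GFP_NOFS for a per-mapping mask from btrfs_alloc_write_mask(). The helper is added elsewhere in this series; judging by the call sites it takes the mapping's own allocation mask and clears __GFP_FS so the allocation cannot recurse into the filesystem. A userspace model of just that masking idiom, with invented flag values:

#include <stdio.h>

/* Invented flag bits; only the masking pattern matches the kernel helper. */
#define GFP_FS   0x1u   /* allocator may call back into the filesystem */
#define GFP_IO   0x2u
#define GFP_WAIT 0x4u

static unsigned alloc_write_mask(unsigned mapping_gfp)
{
	return mapping_gfp & ~GFP_FS;	/* never recurse into the fs */
}

int main(void)
{
	unsigned mapping_gfp = GFP_FS | GFP_IO | GFP_WAIT;

	printf("%#x -> %#x\n", mapping_gfp, alloc_write_mask(mapping_gfp));
	return 0;
}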
@@ -3613,6 +3553,8 @@ void btrfs_evict_inode(struct inode *inode)
3613{ 3553{
3614 struct btrfs_trans_handle *trans; 3554 struct btrfs_trans_handle *trans;
3615 struct btrfs_root *root = BTRFS_I(inode)->root; 3555 struct btrfs_root *root = BTRFS_I(inode)->root;
3556 struct btrfs_block_rsv *rsv, *global_rsv;
3557 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3616 unsigned long nr; 3558 unsigned long nr;
3617 int ret; 3559 int ret;
3618 3560
@@ -3640,22 +3582,55 @@ void btrfs_evict_inode(struct inode *inode)
3640 goto no_delete; 3582 goto no_delete;
3641 } 3583 }
3642 3584
3585 rsv = btrfs_alloc_block_rsv(root);
3586 if (!rsv) {
3587 btrfs_orphan_del(NULL, inode);
3588 goto no_delete;
3589 }
3590 rsv->size = min_size;
3591 global_rsv = &root->fs_info->global_block_rsv;
3592
3643 btrfs_i_size_write(inode, 0); 3593 btrfs_i_size_write(inode, 0);
3644 3594
3595 /*
3596 * This is a bit simpler than btrfs_truncate since
3597 *
3598 * 1) We've already reserved our space for our orphan item in the
3599 * unlink.
3600 * 2) We're going to delete the inode item, so we don't need to update
3601 * it at all.
3602 *
3603 * So we just need to reserve some slack space in case we add bytes when
3604 * doing the truncate.
3605 */
3645 while (1) { 3606 while (1) {
3646 trans = btrfs_join_transaction(root); 3607 ret = btrfs_block_rsv_refill(root, rsv, min_size);
3647 BUG_ON(IS_ERR(trans)); 3608
3648 trans->block_rsv = root->orphan_block_rsv; 3609 /*
3610 * Try and steal from the global reserve since we will
3611 * likely not use this space anyway, we want to try as
3612 * hard as possible to get this to work.
3613 */
3614 if (ret)
3615 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3649 3616
3650 ret = btrfs_block_rsv_check(trans, root,
3651 root->orphan_block_rsv, 0, 5);
3652 if (ret) { 3617 if (ret) {
3653 BUG_ON(ret != -EAGAIN); 3618 printk(KERN_WARNING "Could not get space for a "
3654 ret = btrfs_commit_transaction(trans, root); 3619 "delete, will truncate on mount %d\n", ret);
3655 BUG_ON(ret); 3620 btrfs_orphan_del(NULL, inode);
3656 continue; 3621 btrfs_free_block_rsv(root, rsv);
3622 goto no_delete;
3623 }
3624
3625 trans = btrfs_start_transaction(root, 0);
3626 if (IS_ERR(trans)) {
3627 btrfs_orphan_del(NULL, inode);
3628 btrfs_free_block_rsv(root, rsv);
3629 goto no_delete;
3657 } 3630 }
3658 3631
3632 trans->block_rsv = rsv;
3633
3659 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3634 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3660 if (ret != -EAGAIN) 3635 if (ret != -EAGAIN)
3661 break; 3636 break;
@@ -3664,14 +3639,17 @@ void btrfs_evict_inode(struct inode *inode)
3664 btrfs_end_transaction(trans, root); 3639 btrfs_end_transaction(trans, root);
3665 trans = NULL; 3640 trans = NULL;
3666 btrfs_btree_balance_dirty(root, nr); 3641 btrfs_btree_balance_dirty(root, nr);
3667
3668 } 3642 }
3669 3643
3644 btrfs_free_block_rsv(root, rsv);
3645
3670 if (ret == 0) { 3646 if (ret == 0) {
3647 trans->block_rsv = root->orphan_block_rsv;
3671 ret = btrfs_orphan_del(trans, inode); 3648 ret = btrfs_orphan_del(trans, inode);
3672 BUG_ON(ret); 3649 BUG_ON(ret);
3673 } 3650 }
3674 3651
3652 trans->block_rsv = &root->fs_info->trans_block_rsv;
3675 if (!(root == root->fs_info->tree_root || 3653 if (!(root == root->fs_info->tree_root ||
3676 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3654 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3677 btrfs_return_ino(root, btrfs_ino(inode)); 3655 btrfs_return_ino(root, btrfs_ino(inode));
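Eviction now runs against its own small reserve: each pass tries btrfs_block_rsv_refill() for min_size first, then falls back to migrating the bytes out of the global reserve, and only if both fail does it log, drop the orphan item, and leave the truncate for the next mount. The decision ladder modeled with plain integers:

#include <stdint.h>
#include <stdio.h>

struct rsv { uint64_t reserved; uint64_t size; };

static int rsv_refill(struct rsv *r, uint64_t min, uint64_t *free_space)
{
	uint64_t need = min > r->reserved ? min - r->reserved : 0;

	if (need > *free_space)
		return -1;			/* ENOSPC stand-in */
	*free_space -= need;
	r->reserved += need;
	return 0;
}

/* Move bytes from one reserve to another without touching free space. */
static int rsv_migrate(struct rsv *from, struct rsv *to, uint64_t bytes)
{
	if (from->reserved < bytes)
		return -1;
	from->reserved -= bytes;
	to->reserved += bytes;
	return 0;
}

int main(void)
{
	uint64_t free_space = 0;		/* the pool is empty */
	struct rsv global = { 1 << 20, 1 << 20 };
	struct rsv evict = { 0, 4096 };
	int ret;

	ret = rsv_refill(&evict, evict.size, &free_space);
	if (ret)				/* steal from the global reserve */
		ret = rsv_migrate(&global, &evict, evict.size);
	if (ret)
		fprintf(stderr, "could not get space for a delete, will truncate on mount\n");
	printf("evict reserve: %llu bytes\n", (unsigned long long)evict.reserved);
	return ret ? 1 : 0;
}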
@@ -6541,6 +6519,7 @@ static int btrfs_truncate(struct inode *inode)
6541 struct btrfs_trans_handle *trans; 6519 struct btrfs_trans_handle *trans;
6542 unsigned long nr; 6520 unsigned long nr;
6543 u64 mask = root->sectorsize - 1; 6521 u64 mask = root->sectorsize - 1;
6522 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6544 6523
6545 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6524 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6546 if (ret) 6525 if (ret)
@@ -6588,19 +6567,23 @@ static int btrfs_truncate(struct inode *inode)
6588 rsv = btrfs_alloc_block_rsv(root); 6567 rsv = btrfs_alloc_block_rsv(root);
6589 if (!rsv) 6568 if (!rsv)
6590 return -ENOMEM; 6569 return -ENOMEM;
6591 btrfs_add_durable_block_rsv(root->fs_info, rsv); 6570 rsv->size = min_size;
6592 6571
6572 /*
6573 * 1 for the truncate slack space
6574 * 1 for the orphan item we're going to add
6575 * 1 for the orphan item deletion
6576 * 1 for updating the inode.
6577 */
6593 trans = btrfs_start_transaction(root, 4); 6578 trans = btrfs_start_transaction(root, 4);
6594 if (IS_ERR(trans)) { 6579 if (IS_ERR(trans)) {
6595 err = PTR_ERR(trans); 6580 err = PTR_ERR(trans);
6596 goto out; 6581 goto out;
6597 } 6582 }
6598 6583
6599 /* 6584 /* Migrate the slack space for the truncate to our reserve */
6600 * Reserve space for the truncate process. Truncate should be adding 6585 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6601 * space, but if there are snapshots it may end up using space. 6586 min_size);
6602 */
6603 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6604 BUG_ON(ret); 6587 BUG_ON(ret);
6605 6588
6606 ret = btrfs_orphan_add(trans, inode); 6589 ret = btrfs_orphan_add(trans, inode);
@@ -6609,21 +6592,6 @@ static int btrfs_truncate(struct inode *inode)
6609 goto out; 6592 goto out;
6610 } 6593 }
6611 6594
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /*
6617 * Ok so we've already migrated our bytes over for the truncate, so here
6618 * just reserve the one slot we need for updating the inode.
6619 */
6620 trans = btrfs_start_transaction(root, 1);
6621 if (IS_ERR(trans)) {
6622 err = PTR_ERR(trans);
6623 goto out;
6624 }
6625 trans->block_rsv = rsv;
6626
6627 /* 6595 /*
6628 * setattr is responsible for setting the ordered_data_close flag, 6596 * setattr is responsible for setting the ordered_data_close flag,
6629 * but that is only tested during the last file release. That 6597 * but that is only tested during the last file release. That
@@ -6645,20 +6613,30 @@ static int btrfs_truncate(struct inode *inode)
6645 btrfs_add_ordered_operation(trans, root, inode); 6613 btrfs_add_ordered_operation(trans, root, inode);
6646 6614
6647 while (1) { 6615 while (1) {
6616 ret = btrfs_block_rsv_refill(root, rsv, min_size);
6617 if (ret) {
6618 /*
6619 * This can only happen with the original transaction we
6620 * started above; every other time we shouldn't have a
6621 * transaction started yet.
6622 */
6623 if (ret == -EAGAIN)
6624 goto end_trans;
6625 err = ret;
6626 break;
6627 }
6628
6648 if (!trans) { 6629 if (!trans) {
6649 trans = btrfs_start_transaction(root, 3); 6630 /* Just need the 1 for updating the inode */
6631 trans = btrfs_start_transaction(root, 1);
6650 if (IS_ERR(trans)) { 6632 if (IS_ERR(trans)) {
6651 err = PTR_ERR(trans); 6633 err = PTR_ERR(trans);
6652 goto out; 6634 goto out;
6653 } 6635 }
6654
6655 ret = btrfs_truncate_reserve_metadata(trans, root,
6656 rsv);
6657 BUG_ON(ret);
6658
6659 trans->block_rsv = rsv;
6660 } 6636 }
6661 6637
6638 trans->block_rsv = rsv;
6639
6662 ret = btrfs_truncate_inode_items(trans, root, inode, 6640 ret = btrfs_truncate_inode_items(trans, root, inode,
6663 inode->i_size, 6641 inode->i_size,
6664 BTRFS_EXTENT_DATA_KEY); 6642 BTRFS_EXTENT_DATA_KEY);
@@ -6673,7 +6651,7 @@ static int btrfs_truncate(struct inode *inode)
6673 err = ret; 6651 err = ret;
6674 break; 6652 break;
6675 } 6653 }
6676 6654end_trans:
6677 nr = trans->blocks_used; 6655 nr = trans->blocks_used;
6678 btrfs_end_transaction(trans, root); 6656 btrfs_end_transaction(trans, root);
6679 trans = NULL; 6657 trans = NULL;
@@ -6755,9 +6733,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6755 ei->last_sub_trans = 0; 6733 ei->last_sub_trans = 0;
6756 ei->logged_trans = 0; 6734 ei->logged_trans = 0;
6757 ei->delalloc_bytes = 0; 6735 ei->delalloc_bytes = 0;
6758 ei->reserved_bytes = 0;
6759 ei->disk_i_size = 0; 6736 ei->disk_i_size = 0;
6760 ei->flags = 0; 6737 ei->flags = 0;
6738 ei->csum_bytes = 0;
6761 ei->index_cnt = (u64)-1; 6739 ei->index_cnt = (u64)-1;
6762 ei->last_unlink_trans = 0; 6740 ei->last_unlink_trans = 0;
6763 6741
@@ -6803,6 +6781,8 @@ void btrfs_destroy_inode(struct inode *inode)
6803 WARN_ON(inode->i_data.nrpages); 6781 WARN_ON(inode->i_data.nrpages);
6804 WARN_ON(BTRFS_I(inode)->outstanding_extents); 6782 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6805 WARN_ON(BTRFS_I(inode)->reserved_extents); 6783 WARN_ON(BTRFS_I(inode)->reserved_extents);
6784 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
6785 WARN_ON(BTRFS_I(inode)->csum_bytes);
6806 6786
6807 /* 6787 /*
6808 * This can happen where we create an inode, but somebody else also 6788 * This can happen where we create an inode, but somebody else also
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dae5dfe41ba5..877727b28d88 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -117,7 +117,7 @@ void btrfs_update_iflags(struct inode *inode)
117/* 117/*
118 * Inherit flags from the parent inode. 118 * Inherit flags from the parent inode.
119 * 119 *
120 * Unlike extN we don't have any flags we don't want to inherit currently. 120 * Currently only the compression flags and the cow flags are inherited.
121 */ 121 */
122void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 122void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
123{ 123{
@@ -128,12 +128,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
128 128
129 flags = BTRFS_I(dir)->flags; 129 flags = BTRFS_I(dir)->flags;
130 130
131 if (S_ISREG(inode->i_mode)) 131 if (flags & BTRFS_INODE_NOCOMPRESS) {
132 flags &= ~BTRFS_INODE_DIRSYNC; 132 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
133 else if (!S_ISDIR(inode->i_mode)) 133 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
134 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); 134 } else if (flags & BTRFS_INODE_COMPRESS) {
135 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
136 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
137 }
138
139 if (flags & BTRFS_INODE_NODATACOW)
140 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
135 141
136 BTRFS_I(inode)->flags = flags;
137 btrfs_update_iflags(inode); 142 btrfs_update_iflags(inode);
138} 143}
139 144
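The rewritten btrfs_inherit_iflags() stops copying the parent's flag word wholesale: the compress/nocompress pair is kept mutually exclusive on the child, with NOCOMPRESS winning, and NODATACOW is passed straight through. The same logic as a standalone program; the flag values are invented for the demo, the real ones live in the on-disk format headers:

#include <stdio.h>

#define INODE_NODATACOW  0x1u
#define INODE_COMPRESS   0x4u
#define INODE_NOCOMPRESS 0x8u	/* values invented for the demo */

static unsigned inherit_iflags(unsigned dir_flags, unsigned inode_flags)
{
	if (dir_flags & INODE_NOCOMPRESS) {
		inode_flags &= ~INODE_COMPRESS;	/* keep the pair exclusive */
		inode_flags |= INODE_NOCOMPRESS;
	} else if (dir_flags & INODE_COMPRESS) {
		inode_flags &= ~INODE_NOCOMPRESS;
		inode_flags |= INODE_COMPRESS;
	}
	if (dir_flags & INODE_NODATACOW)
		inode_flags |= INODE_NODATACOW;
	return inode_flags;
}

int main(void)
{
	unsigned child = inherit_iflags(INODE_NOCOMPRESS | INODE_NODATACOW,
					INODE_COMPRESS);
	printf("child flags: %#x\n", child);	/* NOCOMPRESS + NODATACOW */
	return 0;
}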
@@ -843,6 +848,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
843 int i_done; 848 int i_done;
844 struct btrfs_ordered_extent *ordered; 849 struct btrfs_ordered_extent *ordered;
845 struct extent_state *cached_state = NULL; 850 struct extent_state *cached_state = NULL;
851 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
846 852
847 if (isize == 0) 853 if (isize == 0)
848 return 0; 854 return 0;
@@ -860,7 +866,7 @@ again:
860 for (i = 0; i < num_pages; i++) { 866 for (i = 0; i < num_pages; i++) {
861 struct page *page; 867 struct page *page;
862 page = find_or_create_page(inode->i_mapping, 868 page = find_or_create_page(inode->i_mapping,
863 start_index + i, GFP_NOFS); 869 start_index + i, mask);
864 if (!page) 870 if (!page)
865 break; 871 break;
866 872
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273d..10af6a0e0865 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2041,8 +2041,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2041 BUG_ON(IS_ERR(trans)); 2041 BUG_ON(IS_ERR(trans));
2042 trans->block_rsv = rc->block_rsv; 2042 trans->block_rsv = rc->block_rsv;
2043 2043
2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2044 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
2045 min_reserved, 0);
2046 if (ret) { 2045 if (ret) {
2047 BUG_ON(ret != -EAGAIN); 2046 BUG_ON(ret != -EAGAIN);
2048 ret = btrfs_commit_transaction(trans, root); 2047 ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2151,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2152again: 2151again:
2153 if (!err) { 2152 if (!err) {
2154 num_bytes = rc->merging_rsv_size; 2153 num_bytes = rc->merging_rsv_size;
2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2154 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2156 num_bytes);
2157 if (ret) 2155 if (ret)
2158 err = ret; 2156 err = ret;
2159 } 2157 }
@@ -2427,7 +2425,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2425 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2428 2426
2429 trans->block_rsv = rc->block_rsv; 2427 trans->block_rsv = rc->block_rsv;
2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); 2428 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2431 if (ret) { 2429 if (ret) {
2432 if (ret == -EAGAIN) 2430 if (ret == -EAGAIN)
2433 rc->commit_transaction = 1; 2431 rc->commit_transaction = 1;
@@ -2922,6 +2920,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2922 unsigned long last_index; 2920 unsigned long last_index;
2923 struct page *page; 2921 struct page *page;
2924 struct file_ra_state *ra; 2922 struct file_ra_state *ra;
2923 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2925 int nr = 0; 2924 int nr = 0;
2926 int ret = 0; 2925 int ret = 0;
2927 2926
@@ -2956,7 +2955,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2956 ra, NULL, index, 2955 ra, NULL, index,
2957 last_index + 1 - index); 2956 last_index + 1 - index);
2958 page = find_or_create_page(inode->i_mapping, index, 2957 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS); 2958 mask);
2960 if (!page) { 2959 if (!page) {
2961 btrfs_delalloc_release_metadata(inode, 2960 btrfs_delalloc_release_metadata(inode,
2962 PAGE_CACHE_SIZE); 2961 PAGE_CACHE_SIZE);
@@ -3645,14 +3644,11 @@ int prepare_to_relocate(struct reloc_control *rc)
3645 * btrfs_init_reloc_root will use them when there 3644 * btrfs_init_reloc_root will use them when there
3646 * is no reservation in transaction handle. 3645 * is no reservation in transaction handle.
3647 */ 3646 */
3648 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3647 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3649 rc->extent_root->nodesize * 256); 3648 rc->extent_root->nodesize * 256);
3650 if (ret) 3649 if (ret)
3651 return ret; 3650 return ret;
3652 3651
3653 rc->block_rsv->refill_used = 1;
3654 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3655
3656 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3652 memset(&rc->cluster, 0, sizeof(rc->cluster));
3657 rc->search_start = rc->block_group->key.objectid; 3653 rc->search_start = rc->block_group->key.objectid;
3658 rc->extents_found = 0; 3654 rc->extents_found = 0;
@@ -3777,8 +3773,7 @@ restart:
3777 } 3773 }
3778 } 3774 }
3779 3775
3780 ret = btrfs_block_rsv_check(trans, rc->extent_root, 3776 ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3781 rc->block_rsv, 0, 5);
3782 if (ret < 0) { 3777 if (ret < 0) {
3783 if (ret != -EAGAIN) { 3778 if (ret != -EAGAIN) {
3784 err = ret; 3779 err = ret;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d7..266d1f35465d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,7 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h>
43#include "compat.h" 44#include "compat.h"
44#include "delayed-inode.h" 45#include "delayed-inode.h"
45#include "ctree.h" 46#include "ctree.h"
@@ -58,6 +59,7 @@
58#include <trace/events/btrfs.h> 59#include <trace/events/btrfs.h>
59 60
60static const struct super_operations btrfs_super_ops; 61static const struct super_operations btrfs_super_ops;
62static struct file_system_type btrfs_fs_type;
61 63
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 64static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16]) 65 char nbuf[16])
@@ -162,7 +164,7 @@ enum {
162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err, 167 Opt_inode_cache, Opt_no_space_cache, Opt_err,
166}; 168};
167 169
168static match_table_t tokens = { 170static match_table_t tokens = {
@@ -195,6 +197,7 @@ static match_table_t tokens = {
195 {Opt_subvolrootid, "subvolrootid=%d"}, 197 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"}, 198 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"}, 199 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "no_space_cache"},
198 {Opt_err, NULL}, 201 {Opt_err, NULL},
199}; 202};
200 203
@@ -206,14 +209,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206{ 209{
207 struct btrfs_fs_info *info = root->fs_info; 210 struct btrfs_fs_info *info = root->fs_info;
208 substring_t args[MAX_OPT_ARGS]; 211 substring_t args[MAX_OPT_ARGS];
209 char *p, *num, *orig; 212 char *p, *num, *orig = NULL;
213 u64 cache_gen;
210 int intarg; 214 int intarg;
211 int ret = 0; 215 int ret = 0;
212 char *compress_type; 216 char *compress_type;
213 bool compress_force = false; 217 bool compress_force = false;
214 218
219 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
220 if (cache_gen)
221 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
222
215 if (!options) 223 if (!options)
216 return 0; 224 goto out;
217 225
218 /* 226 /*
219 * strsep changes the string, duplicate it because parse_options 227 * strsep changes the string, duplicate it because parse_options
@@ -360,9 +368,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
360 btrfs_set_opt(info->mount_opt, DISCARD); 368 btrfs_set_opt(info->mount_opt, DISCARD);
361 break; 369 break;
362 case Opt_space_cache: 370 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 371 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break; 372 break;
373 case Opt_no_space_cache:
374 printk(KERN_INFO "btrfs: disabling disk space caching\n");
375 btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
376 break;
366 case Opt_inode_cache: 377 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 378 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 379 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -391,6 +402,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
391 } 402 }
392 } 403 }
393out: 404out:
405 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
406 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
394 kfree(orig); 407 kfree(orig);
395 return ret; 408 return ret;
396} 409}
@@ -411,7 +424,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
411 int intarg; 424 int intarg;
412 425
413 if (!options) 426 if (!options)
414 goto out; 427 return 0;
415 428
416 /* 429 /*
417 * strsep changes the string, duplicate it because parse_options 430 * strsep changes the string, duplicate it because parse_options
@@ -460,26 +473,15 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
460 error = btrfs_scan_one_device(match_strdup(&args[0]), 473 error = btrfs_scan_one_device(match_strdup(&args[0]),
461 flags, holder, fs_devices); 474 flags, holder, fs_devices);
462 if (error) 475 if (error)
463 goto out_free_opts; 476 goto out;
464 break; 477 break;
465 default: 478 default:
466 break; 479 break;
467 } 480 }
468 } 481 }
469 482
470 out_free_opts: 483out:
471 kfree(orig); 484 kfree(orig);
472 out:
473 /*
474 * If no subvolume name is specified we use the default one. Allocate
475 * a copy of the string "." here so that code later in the
476 * mount path doesn't care if it's the default volume or another one.
477 */
478 if (!*subvol_name) {
479 *subvol_name = kstrdup(".", GFP_KERNEL);
480 if (!*subvol_name)
481 return -ENOMEM;
482 }
483 return error; 485 return error;
484} 486}
485 487
@@ -492,7 +494,6 @@ static struct dentry *get_default_root(struct super_block *sb,
492 struct btrfs_path *path; 494 struct btrfs_path *path;
493 struct btrfs_key location; 495 struct btrfs_key location;
494 struct inode *inode; 496 struct inode *inode;
495 struct dentry *dentry;
496 u64 dir_id; 497 u64 dir_id;
497 int new = 0; 498 int new = 0;
498 499
@@ -566,29 +567,7 @@ setup_root:
566 return dget(sb->s_root); 567 return dget(sb->s_root);
567 } 568 }
568 569
569 if (new) { 570 return d_obtain_alias(inode);
570 const struct qstr name = { .name = "/", .len = 1 };
571
572 /*
573 * New inode, we need to make the dentry a sibling of s_root so
574 * everything gets cleaned up properly on unmount.
575 */
576 dentry = d_alloc(sb->s_root, &name);
577 if (!dentry) {
578 iput(inode);
579 return ERR_PTR(-ENOMEM);
580 }
581 d_splice_alias(inode, dentry);
582 } else {
583 /*
584 * We found the inode in cache, just find a dentry for it and
585 * put the reference to the inode we just got.
586 */
587 dentry = d_find_alias(inode);
588 iput(inode);
589 }
590
591 return dentry;
592} 571}
593 572
594static int btrfs_fill_super(struct super_block *sb, 573static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +698,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
719 seq_puts(seq, ",noacl"); 698 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE)) 699 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache"); 700 seq_puts(seq, ",space_cache");
701 else
702 seq_puts(seq, ",no_space_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE)) 703 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache"); 704 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 705 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +734,118 @@ static int btrfs_set_super(struct super_block *s, void *data)
753 return set_anon_super(s, data); 734 return set_anon_super(s, data);
754} 735}
755 736
737/*
738 * This will strip out the subvol=%s argument from an argument string and add
739 * subvolid=0 to make sure we get the actual tree root for path walking to the
740 * subvol we want.
741 */
742static char *setup_root_args(char *args)
743{
744 unsigned copied = 0;
745 unsigned len = strlen(args) + 2;
746 char *pos;
747 char *ret;
748
749 /*
750 * We need the same args as before, but minus
751 *
752 * subvol=a
753 *
754 * and add
755 *
756 * subvolid=0
757 *
758 * which is a difference of 2 characters, so we allocate strlen(args) +
759 * 2 characters.
760 */
761 ret = kzalloc(len * sizeof(char), GFP_NOFS);
762 if (!ret)
763 return NULL;
764 pos = strstr(args, "subvol=");
765
766 /* This shouldn't happen, but just in case... */
767 if (!pos) {
768 kfree(ret);
769 return NULL;
770 }
771
772 /*
773 * The subvol=<> arg is not at the front of the string; copy everything
774 * up to that into ret.
775 */
776 if (pos != args) {
777 *pos = '\0';
778 strcpy(ret, args);
779 copied += strlen(args);
780 pos++;
781 }
782
783 strncpy(ret + copied, "subvolid=0", len - copied);
784
785 /* Length of subvolid=0 */
786 copied += 10;
787
788 /*
789 * If there is no , after the subvol= option then we know there's no
790 * other options and we can just return.
791 */
792 pos = strchr(pos, ',');
793 if (!pos)
794 return ret;
795
796 /* Copy the rest of the arguments into our buffer */
797 strncpy(ret + copied, pos, len - copied);
798 copied += strlen(pos);
799
800 return ret;
801}
802
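Concretely, setup_root_args() turns "device=/dev/sdb,subvol=snap,compress=lzo" into "device=/dev/sdb,subvolid=0,compress=lzo". A userspace port for experimenting with edge cases; calloc/free stand in for kzalloc/kfree, and this version allocates one extra byte so the result is always NUL-terminated, even for a one-character subvolume name where strlen(args) + 2 is an exact fit:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *setup_root_args_model(const char *args)
{
	size_t len = strlen(args) + 2 + 1;	/* +2 growth, +1 terminator */
	size_t copied = 0;
	const char *pos = strstr(args, "subvol=");
	char *ret;

	if (!pos)
		return NULL;
	ret = calloc(len, 1);
	if (!ret)
		return NULL;

	if (pos != args) {			/* copy everything before subvol= */
		memcpy(ret, args, pos - args);
		copied = (size_t)(pos - args);
	}
	strcpy(ret + copied, "subvolid=0");
	copied += strlen("subvolid=0");

	pos = strchr(pos, ',');			/* rest of the options, if any */
	if (pos)
		strcpy(ret + copied, pos);
	return ret;
}

int main(void)
{
	char *s = setup_root_args_model("device=/dev/sdb,subvol=snap,compress=lzo");

	printf("%s\n", s);	/* device=/dev/sdb,subvolid=0,compress=lzo */
	free(s);
	return 0;
}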
803static struct dentry *mount_subvol(const char *subvol_name, int flags,
804 const char *device_name, char *data)
805{
806 struct super_block *s;
807 struct dentry *root;
808 struct vfsmount *mnt;
809 struct mnt_namespace *ns_private;
810 char *newargs;
811 struct path path;
812 int error;
813
814 newargs = setup_root_args(data);
815 if (!newargs)
816 return ERR_PTR(-ENOMEM);
817 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
818 newargs);
819 kfree(newargs);
820 if (IS_ERR(mnt))
821 return ERR_CAST(mnt);
822
823 ns_private = create_mnt_ns(mnt);
824 if (IS_ERR(ns_private)) {
825 mntput(mnt);
826 return ERR_CAST(ns_private);
827 }
828
829 /*
830 * This will trigger the automount of the subvol so we can just
831 * drop the mnt we have here and return the dentry that we
832 * found.
833 */
834 error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
835 LOOKUP_FOLLOW, &path);
836 put_mnt_ns(ns_private);
837 if (error)
838 return ERR_PTR(error);
839
840 /* Get a ref to the sb and the dentry we found and return it */
841 s = path.mnt->mnt_sb;
842 atomic_inc(&s->s_active);
843 root = dget(path.dentry);
844 path_put(&path);
845 down_write(&s->s_umount);
846
847 return root;
848}
756 849
757/* 850/*
758 * Find a superblock for the given device / mount point. 851 * Find a superblock for the given device / mount point.
@@ -784,13 +877,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
784 if (error) 877 if (error)
785 return ERR_PTR(error); 878 return ERR_PTR(error);
786 879
880 if (subvol_name) {
881 root = mount_subvol(subvol_name, flags, device_name, data);
882 kfree(subvol_name);
883 return root;
884 }
885
787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 886 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
788 if (error) 887 if (error)
789 goto error_free_subvol_name; 888 return ERR_PTR(error);
790 889
791 error = btrfs_open_devices(fs_devices, mode, fs_type); 890 error = btrfs_open_devices(fs_devices, mode, fs_type);
792 if (error) 891 if (error)
793 goto error_free_subvol_name; 892 return ERR_PTR(error);
794 893
795 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { 894 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
796 error = -EACCES; 895 error = -EACCES;
@@ -815,14 +914,15 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
815 914
816 bdev = fs_devices->latest_bdev; 915 bdev = fs_devices->latest_bdev;
817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 916 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
818 if (IS_ERR(s)) 917 if (IS_ERR(s)) {
819 goto error_s; 918 error = PTR_ERR(s);
919 goto error_close_devices;
920 }
820 921
821 if (s->s_root) { 922 if (s->s_root) {
822 if ((flags ^ s->s_flags) & MS_RDONLY) { 923 if ((flags ^ s->s_flags) & MS_RDONLY) {
823 deactivate_locked_super(s); 924 deactivate_locked_super(s);
824 error = -EBUSY; 925 return ERR_PTR(-EBUSY);
825 goto error_close_devices;
826 } 926 }
827 927
828 btrfs_close_devices(fs_devices); 928 btrfs_close_devices(fs_devices);
@@ -837,64 +937,25 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
837 flags & MS_SILENT ? 1 : 0); 937 flags & MS_SILENT ? 1 : 0);
838 if (error) { 938 if (error) {
839 deactivate_locked_super(s); 939 deactivate_locked_super(s);
840 goto error_free_subvol_name; 940 return ERR_PTR(error);
841 } 941 }
842 942
843 btrfs_sb(s)->fs_info->bdev_holder = fs_type; 943 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
844 s->s_flags |= MS_ACTIVE; 944 s->s_flags |= MS_ACTIVE;
845 } 945 }
846 946
847 /* if they gave us a subvolume name bind mount into that */ 947 root = get_default_root(s, subvol_objectid);
848 if (strcmp(subvol_name, ".")) { 948 if (IS_ERR(root)) {
849 struct dentry *new_root; 949 deactivate_locked_super(s);
850 950 return root;
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
858 mutex_lock(&root->d_inode->i_mutex);
859 new_root = lookup_one_len(subvol_name, root,
860 strlen(subvol_name));
861 mutex_unlock(&root->d_inode->i_mutex);
862
863 if (IS_ERR(new_root)) {
864 dput(root);
865 deactivate_locked_super(s);
866 error = PTR_ERR(new_root);
867 goto error_free_subvol_name;
868 }
869 if (!new_root->d_inode) {
870 dput(root);
871 dput(new_root);
872 deactivate_locked_super(s);
873 error = -ENXIO;
874 goto error_free_subvol_name;
875 }
876 dput(root);
877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
885 } 951 }
886 952
887 kfree(subvol_name);
888 return root; 953 return root;
889 954
890error_s:
891 error = PTR_ERR(s);
892error_close_devices: 955error_close_devices:
893 btrfs_close_devices(fs_devices); 956 btrfs_close_devices(fs_devices);
894 kfree(fs_info); 957 kfree(fs_info);
895 kfree(tree_root); 958 kfree(tree_root);
896error_free_subvol_name:
897 kfree(subvol_name);
898 return ERR_PTR(error); 959 return ERR_PTR(error);
899} 960}
900 961
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a155..29bef63e23ba 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -275,7 +275,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
275 */ 275 */
276 if (num_items > 0 && root != root->fs_info->chunk_root) { 276 if (num_items > 0 && root != root->fs_info->chunk_root) {
277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278 ret = btrfs_block_rsv_add(NULL, root, 278 ret = btrfs_block_rsv_add(root,
279 &root->fs_info->trans_block_rsv, 279 &root->fs_info->trans_block_rsv,
280 num_bytes); 280 num_bytes);
281 if (ret) 281 if (ret)
@@ -418,8 +418,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 418 struct btrfs_root *root)
419{ 419{
420 int ret; 420 int ret;
421 ret = btrfs_block_rsv_check(trans, root, 421
422 &root->fs_info->global_block_rsv, 0, 5); 422 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
423 return ret ? 1 : 0; 423 return ret ? 1 : 0;
424} 424}
425 425
@@ -427,17 +427,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
427 struct btrfs_root *root) 427 struct btrfs_root *root)
428{ 428{
429 struct btrfs_transaction *cur_trans = trans->transaction; 429 struct btrfs_transaction *cur_trans = trans->transaction;
430 struct btrfs_block_rsv *rsv = trans->block_rsv;
430 int updates; 431 int updates;
431 432
432 smp_mb(); 433 smp_mb();
433 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 434 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
434 return 1; 435 return 1;
435 436
437 /*
438 * We need to do this in case we're deleting csums so the global block
439 * rsv gets used instead of the csum block rsv.
440 */
441 trans->block_rsv = NULL;
442
436 updates = trans->delayed_ref_updates; 443 updates = trans->delayed_ref_updates;
437 trans->delayed_ref_updates = 0; 444 trans->delayed_ref_updates = 0;
438 if (updates) 445 if (updates)
439 btrfs_run_delayed_refs(trans, root, updates); 446 btrfs_run_delayed_refs(trans, root, updates);
440 447
448 trans->block_rsv = rsv;
449
441 return should_end_transaction(trans, root); 450 return should_end_transaction(trans, root);
442} 451}
443 452
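Stashing trans->block_rsv in a local, clearing it across btrfs_run_delayed_refs(), and restoring it afterwards is a save/override/restore idiom: a NULL block_rsv makes reservation consumers fall back to the global reserve, which is what deleting csums needs here. The shape of it, with toy types:

#include <stdio.h>

struct rsv { const char *name; };

static struct rsv global_rsv = { "global" };

static void run_delayed_refs(struct rsv *block_rsv)
{
	/* NULL means "fall back to the global reserve". */
	struct rsv *use = block_rsv ? block_rsv : &global_rsv;

	printf("charging reserve: %s\n", use->name);
}

int main(void)
{
	struct rsv trans_rsv = { "transaction" };
	struct rsv *block_rsv = &trans_rsv;	/* trans->block_rsv */
	struct rsv *saved = block_rsv;		/* save */

	block_rsv = NULL;			/* override: use global */
	run_delayed_refs(block_rsv);
	block_rsv = saved;			/* restore */
	run_delayed_refs(block_rsv);
	return 0;
}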
@@ -453,6 +462,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
453 return 0; 462 return 0;
454 } 463 }
455 464
465 btrfs_trans_release_metadata(trans, root);
466 trans->block_rsv = NULL;
456 while (count < 4) { 467 while (count < 4) {
457 unsigned long cur = trans->delayed_ref_updates; 468 unsigned long cur = trans->delayed_ref_updates;
458 trans->delayed_ref_updates = 0; 469 trans->delayed_ref_updates = 0;
@@ -473,8 +484,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
473 count++; 484 count++;
474 } 485 }
475 486
476 btrfs_trans_release_metadata(trans, root);
477
478 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 487 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
479 should_end_transaction(trans, root)) { 488 should_end_transaction(trans, root)) {
480 trans->transaction->blocked = 1; 489 trans->transaction->blocked = 1;
@@ -562,50 +571,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562int btrfs_write_marked_extents(struct btrfs_root *root, 571int btrfs_write_marked_extents(struct btrfs_root *root,
563 struct extent_io_tree *dirty_pages, int mark) 572 struct extent_io_tree *dirty_pages, int mark)
564{ 573{
565 int ret;
566 int err = 0; 574 int err = 0;
567 int werr = 0; 575 int werr = 0;
568 struct page *page; 576 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
569 struct inode *btree_inode = root->fs_info->btree_inode;
570 u64 start = 0; 577 u64 start = 0;
571 u64 end; 578 u64 end;
572 unsigned long index;
573
574 while (1) {
575 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
576 mark);
577 if (ret)
578 break;
579 while (start <= end) {
580 cond_resched();
581
582 index = start >> PAGE_CACHE_SHIFT;
583 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
584 page = find_get_page(btree_inode->i_mapping, index);
585 if (!page)
586 continue;
587
588 btree_lock_page_hook(page);
589 if (!page->mapping) {
590 unlock_page(page);
591 page_cache_release(page);
592 continue;
593 }
594 579
595 if (PageWriteback(page)) { 580 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
596 if (PageDirty(page)) 581 mark)) {
597 wait_on_page_writeback(page); 582 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
598 else { 583 GFP_NOFS);
599 unlock_page(page); 584 err = filemap_fdatawrite_range(mapping, start, end);
600 page_cache_release(page); 585 if (err)
601 continue; 586 werr = err;
602 } 587 cond_resched();
603 } 588 start = end + 1;
604 err = write_one_page(page, 0);
605 if (err)
606 werr = err;
607 page_cache_release(page);
608 }
609 } 589 }
610 if (err) 590 if (err)
611 werr = err; 591 werr = err;
@@ -621,39 +601,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
621int btrfs_wait_marked_extents(struct btrfs_root *root, 601int btrfs_wait_marked_extents(struct btrfs_root *root,
622 struct extent_io_tree *dirty_pages, int mark) 602 struct extent_io_tree *dirty_pages, int mark)
623{ 603{
624 int ret;
625 int err = 0; 604 int err = 0;
626 int werr = 0; 605 int werr = 0;
627 struct page *page; 606 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
628 struct inode *btree_inode = root->fs_info->btree_inode;
629 u64 start = 0; 607 u64 start = 0;
630 u64 end; 608 u64 end;
631 unsigned long index;
632
633 while (1) {
634 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
635 mark);
636 if (ret)
637 break;
638 609
639 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 610 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
640 while (start <= end) { 611 EXTENT_NEED_WAIT)) {
641 index = start >> PAGE_CACHE_SHIFT; 612 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
642 start = (u64)(index + 1) << PAGE_CACHE_SHIFT; 613 err = filemap_fdatawait_range(mapping, start, end);
643 page = find_get_page(btree_inode->i_mapping, index); 614 if (err)
644 if (!page) 615 werr = err;
645 continue; 616 cond_resched();
646 if (PageDirty(page)) { 617 start = end + 1;
647 btree_lock_page_hook(page);
648 wait_on_page_writeback(page);
649 err = write_one_page(page, 0);
650 if (err)
651 werr = err;
652 }
653 wait_on_page_writeback(page);
654 page_cache_release(page);
655 cond_resched();
656 }
657 } 618 }
658 if (err) 619 if (err)
659 werr = err; 620 werr = err;
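The rewritten pair works at range granularity: the write side converts each dirty range's mark to EXTENT_NEED_WAIT as it submits filemap_fdatawrite_range(), and the wait side consumes exactly the EXTENT_NEED_WAIT ranges via filemap_fdatawait_range(), so it never blocks on writeback it did not start. The handshake over a toy range set:

#include <stdio.h>

#define DIRTY     0x1u
#define NEED_WAIT 0x2u

struct range { unsigned long start, end; unsigned bits; };

/* Phase 1: start writeback, converting DIRTY -> NEED_WAIT per range. */
static void write_marked(struct range *r, int n)
{
	for (int i = 0; i < n; i++) {
		if (!(r[i].bits & DIRTY))
			continue;
		r[i].bits = (r[i].bits & ~DIRTY) | NEED_WAIT;
		printf("fdatawrite [%lu,%lu]\n", r[i].start, r[i].end);
	}
}

/* Phase 2: wait only on what phase 1 actually submitted. */
static void wait_marked(struct range *r, int n)
{
	for (int i = 0; i < n; i++) {
		if (!(r[i].bits & NEED_WAIT))
			continue;
		r[i].bits &= ~NEED_WAIT;
		printf("fdatawait  [%lu,%lu]\n", r[i].start, r[i].end);
	}
}

int main(void)
{
	struct range ranges[] = { { 0, 4095, DIRTY }, { 8192, 12287, DIRTY } };

	write_marked(ranges, 2);
	wait_marked(ranges, 2);
	return 0;
}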
@@ -911,10 +872,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
911 } 872 }
912 873
913 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 874 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
914 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
915 875
916 if (to_reserve > 0) { 876 if (to_reserve > 0) {
917 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 877 ret = btrfs_block_rsv_add(root, &pending->block_rsv,
918 to_reserve); 878 to_reserve);
919 if (ret) { 879 if (ret) {
920 pending->error = ret; 880 pending->error = ret;
@@ -1002,7 +962,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1002 BUG_ON(IS_ERR(pending->snap)); 962 BUG_ON(IS_ERR(pending->snap));
1003 963
1004 btrfs_reloc_post_snapshot(trans, pending); 964 btrfs_reloc_post_snapshot(trans, pending);
1005 btrfs_orphan_post_snapshot(trans, pending);
1006fail: 965fail:
1007 kfree(new_root_item); 966 kfree(new_root_item);
1008 trans->block_rsv = rsv; 967 trans->block_rsv = rsv;
@@ -1043,7 +1002,7 @@ static void update_super_roots(struct btrfs_root *root)
1043 super->root = root_item->bytenr; 1002 super->root = root_item->bytenr;
1044 super->generation = root_item->generation; 1003 super->generation = root_item->generation;
1045 super->root_level = root_item->level; 1004 super->root_level = root_item->level;
1046 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 1005 if (btrfs_test_opt(root, SPACE_CACHE))
1047 super->cache_generation = root_item->generation; 1006 super->cache_generation = root_item->generation;
1048} 1007}
1049 1008
@@ -1168,14 +1127,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1168 1127
1169 btrfs_run_ordered_operations(root, 0); 1128 btrfs_run_ordered_operations(root, 0);
1170 1129
1130 btrfs_trans_release_metadata(trans, root);
1131 trans->block_rsv = NULL;
1132
1171 /* make a pass through all the delayed refs we have so far 1133 /* make a pass through all the delayed refs we have so far
1172 * any running procs may add more while we are here 1134 * any running procs may add more while we are here
1173 */ 1135 */
1174 ret = btrfs_run_delayed_refs(trans, root, 0); 1136 ret = btrfs_run_delayed_refs(trans, root, 0);
1175 BUG_ON(ret); 1137 BUG_ON(ret);
1176 1138
1177 btrfs_trans_release_metadata(trans, root);
1178
1179 cur_trans = trans->transaction; 1139 cur_trans = trans->transaction;
1180 /* 1140 /*
1181 * set the flushing flag so procs in this transaction have to 1141 * set the flushing flag so procs in this transaction have to
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da61..e138af710de2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1013,8 +1013,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1013 } 1013 }
1014 BUG_ON(ret); 1014 BUG_ON(ret);
1015 1015
1016 if (device->bytes_used > 0) 1016 if (device->bytes_used > 0) {
1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1017 u64 len = btrfs_dev_extent_length(leaf, extent);
1018 device->bytes_used -= len;
1019 spin_lock(&root->fs_info->free_chunk_lock);
1020 root->fs_info->free_chunk_space += len;
1021 spin_unlock(&root->fs_info->free_chunk_lock);
1022 }
1018 ret = btrfs_del_item(trans, root, path); 1023 ret = btrfs_del_item(trans, root, path);
1019 1024
1020out: 1025out:
@@ -1356,6 +1361,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1356 if (ret) 1361 if (ret)
1357 goto error_undo; 1362 goto error_undo;
1358 1363
1364 spin_lock(&root->fs_info->free_chunk_lock);
1365 root->fs_info->free_chunk_space = device->total_bytes -
1366 device->bytes_used;
1367 spin_unlock(&root->fs_info->free_chunk_lock);
1368
1359 device->in_fs_metadata = 0; 1369 device->in_fs_metadata = 0;
1360 btrfs_scrub_cancel_dev(root, device); 1370 btrfs_scrub_cancel_dev(root, device);
1361 1371
@@ -1691,6 +1701,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1691 root->fs_info->fs_devices->num_can_discard++; 1701 root->fs_info->fs_devices->num_can_discard++;
1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1702 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 1703
1704 spin_lock(&root->fs_info->free_chunk_lock);
1705 root->fs_info->free_chunk_space += device->total_bytes;
1706 spin_unlock(&root->fs_info->free_chunk_lock);
1707
1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1708 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 root->fs_info->fs_devices->rotating = 1; 1709 root->fs_info->fs_devices->rotating = 1;
1696 1710
@@ -2192,8 +2206,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2192 lock_chunks(root); 2206 lock_chunks(root);
2193 2207
2194 device->total_bytes = new_size; 2208 device->total_bytes = new_size;
2195 if (device->writeable) 2209 if (device->writeable) {
2196 device->fs_devices->total_rw_bytes -= diff; 2210 device->fs_devices->total_rw_bytes -= diff;
2211 spin_lock(&root->fs_info->free_chunk_lock);
2212 root->fs_info->free_chunk_space -= diff;
2213 spin_unlock(&root->fs_info->free_chunk_lock);
2214 }
2197 unlock_chunks(root); 2215 unlock_chunks(root);
2198 2216
2199again: 2217again:
@@ -2257,6 +2275,9 @@ again:
2257 device->total_bytes = old_size; 2275 device->total_bytes = old_size;
2258 if (device->writeable) 2276 if (device->writeable)
2259 device->fs_devices->total_rw_bytes += diff; 2277 device->fs_devices->total_rw_bytes += diff;
2278 spin_lock(&root->fs_info->free_chunk_lock);
2279 root->fs_info->free_chunk_space += diff;
2280 spin_unlock(&root->fs_info->free_chunk_lock);
2260 unlock_chunks(root); 2281 unlock_chunks(root);
2261 goto done; 2282 goto done;
2262 } 2283 }
@@ -2615,6 +2636,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2615 index++; 2636 index++;
2616 } 2637 }
2617 2638
2639 spin_lock(&extent_root->fs_info->free_chunk_lock);
2640 extent_root->fs_info->free_chunk_space -= (stripe_size *
2641 map->num_stripes);
2642 spin_unlock(&extent_root->fs_info->free_chunk_lock);
2643
2618 index = 0; 2644 index = 0;
2619 stripe = &chunk->stripe; 2645 stripe = &chunk->stripe;
2620 while (index < map->num_stripes) { 2646 while (index < map->num_stripes) {
@@ -3616,8 +3642,13 @@ static int read_one_dev(struct btrfs_root *root,
3616 fill_device_from_item(leaf, dev_item, device); 3642 fill_device_from_item(leaf, dev_item, device);
3617 device->dev_root = root->fs_info->dev_root; 3643 device->dev_root = root->fs_info->dev_root;
3618 device->in_fs_metadata = 1; 3644 device->in_fs_metadata = 1;
3619 if (device->writeable) 3645 if (device->writeable) {
3620 device->fs_devices->total_rw_bytes += device->total_bytes; 3646 device->fs_devices->total_rw_bytes += device->total_bytes;
3647 spin_lock(&root->fs_info->free_chunk_lock);
3648 root->fs_info->free_chunk_space += device->total_bytes -
3649 device->bytes_used;
3650 spin_unlock(&root->fs_info->free_chunk_lock);
3651 }
3621 ret = 0; 3652 ret = 0;
3622 return ret; 3653 return ret;
3623} 3654}
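Every volumes.c hunk above maintains a single counter, fs_info->free_chunk_space: the bytes on writeable devices not yet allocated to a chunk, i.e. the sum of total_bytes - bytes_used, always adjusted under free_chunk_lock (read_one_dev seeds it at mount, one writeable device at a time). The scattered updates are easier to audit against one model; here is a compilable userspace sketch with a mutex standing in for the spinlock (names illustrative, not the btrfs API):

	#include <pthread.h>
	#include <stdio.h>

	/* free_chunk_space models the unallocated bytes across all writeable
	 * devices: sum(total_bytes - bytes_used). */
	static pthread_mutex_t free_chunk_lock = PTHREAD_MUTEX_INITIALIZER;
	static long long free_chunk_space;

	static void adjust(long long delta)
	{
		pthread_mutex_lock(&free_chunk_lock);
		free_chunk_space += delta;
		pthread_mutex_unlock(&free_chunk_lock);
	}

	/* Each hunk in the diff maps to one adjustment. */
	static void device_added(long long total)	/* btrfs_init_new_device */
	{
		adjust(total);
	}

	static void chunk_allocated(long long stripe_size, int num_stripes)
	{
		adjust(-stripe_size * num_stripes);	/* __finish_chunk_alloc */
	}

	static void dev_extent_freed(long long len)	/* btrfs_free_dev_extent */
	{
		adjust(len);
	}

	static void device_shrunk(long long diff)	/* btrfs_shrink_device */
	{
		adjust(-diff);
	}

	int main(void)
	{
		device_added(1LL << 30);
		chunk_allocated(256LL << 20, 2);
		dev_extent_freed(256LL << 20);
		device_shrunk(128LL << 20);
		printf("free_chunk_space=%lld\n", free_chunk_space);
		return 0;
	}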
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 69565e5fc6a0..a76e41c04b71 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
127again: 127again:
128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), 128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
129 name, name_len, value, size); 129 name, name_len, value, size);
130 /*
131 * If we're setting an xattr to a new value but the new value is say
132 * exactly BTRFS_MAX_XATTR_SIZE, we could end up getting EOVERFLOW
133 * back from split_leaf. This is because it thinks we'll be extending
134 * the existing item size, but we're asking for enough space to add the
135 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
136 * the rest of the function figure it out.
137 */
138 if (ret == -EOVERFLOW)
139 ret = -EEXIST;
140
130 if (ret == -EEXIST) { 141 if (ret == -EEXIST) {
131 if (flags & XATTR_CREATE) 142 if (flags & XATTR_CREATE)
132 goto out; 143 goto out;
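The comment added in this hunk is the whole fix: btrfs_insert_xattr_item asks for room for a brand-new item, but split_leaf sees an existing item under the same key and assumes the insert is an extension of it; with the new value at or near BTRFS_MAX_XATTR_SIZE the supposed extension cannot fit in the leaf, so it returns -EOVERFLOW where the insert path would otherwise have reported -EEXIST. Mapping one errno onto the other routes the call into the existing replace path, which, as the 'again' label suggests, drops the stale item and retries the insert. A compact, compilable model of that control flow (the helpers are hypothetical stand-ins, not the btrfs functions):

	#include <errno.h>
	#include <stdio.h>

	#define XATTR_CREATE 0x1

	static int item_exists;		/* stands in for an existing xattr item */

	/* Model insert: a same-named item makes the leaf-space check fail
	 * with -EOVERFLOW rather than the expected -EEXIST. */
	static int insert_xattr_item(void)
	{
		return item_exists ? -EOVERFLOW : 0;
	}

	static int do_setxattr(int flags)
	{
		int ret;
	again:
		ret = insert_xattr_item();
		if (ret == -EOVERFLOW)
			ret = -EEXIST;	/* the item is really there */
		if (ret == -EEXIST) {
			if (flags & XATTR_CREATE)
				return ret;	/* caller wanted a fresh xattr */
			item_exists = 0;	/* replace path: drop old item */
			goto again;		/* and retry the insert */
		}
		return ret;
	}

	int main(void)
	{
		item_exists = 1;
		printf("replace: %d\n", do_setxattr(0));	  /* 0 */
		item_exists = 1;
		printf("create:  %d\n", do_setxattr(XATTR_CREATE));/* -EEXIST */
		return 0;
	}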