Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/acl.c               |   11
-rw-r--r--  fs/btrfs/btrfs_inode.h       |    3
-rw-r--r--  fs/btrfs/compression.c       |   17
-rw-r--r--  fs/btrfs/ctree.c             |  159
-rw-r--r--  fs/btrfs/ctree.h             |   32
-rw-r--r--  fs/btrfs/delayed-ref.c       |    6
-rw-r--r--  fs/btrfs/dir-item.c          |   45
-rw-r--r--  fs/btrfs/disk-io.c           |  217
-rw-r--r--  fs/btrfs/extent-tree.c       |  354
-rw-r--r--  fs/btrfs/extent_io.c         |   87
-rw-r--r--  fs/btrfs/extent_io.h         |    3
-rw-r--r--  fs/btrfs/extent_map.c        |    2
-rw-r--r--  fs/btrfs/file-item.c         |    5
-rw-r--r--  fs/btrfs/file.c              |  391
-rw-r--r--  fs/btrfs/free-space-cache.c  |  713
-rw-r--r--  fs/btrfs/free-space-cache.h  |    2
-rw-r--r--  fs/btrfs/inode-map.c         |    3
-rw-r--r--  fs/btrfs/inode.c             |  557
-rw-r--r--  fs/btrfs/ioctl.c             |  112
-rw-r--r--  fs/btrfs/ordered-data.c      |    8
-rw-r--r--  fs/btrfs/relocation.c        |   10
-rw-r--r--  fs/btrfs/root-tree.c         |   24
-rw-r--r--  fs/btrfs/super.c             |   66
-rw-r--r--  fs/btrfs/transaction.c       |   64
-rw-r--r--  fs/btrfs/transaction.h       |    4
-rw-r--r--  fs/btrfs/tree-log.c          |   57
-rw-r--r--  fs/btrfs/volumes.c           |  235
-rw-r--r--  fs/btrfs/volumes.h           |   12
-rw-r--r--  fs/btrfs/xattr.c             |   35
-rw-r--r--  fs/btrfs/zlib.c              |    3
30 files changed, 2031 insertions(+), 1206 deletions(-)
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 9c949348510b..5d505aaa72fb 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -170,7 +170,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
     int ret;
     struct posix_acl *acl = NULL;

-    if (!is_owner_or_cap(dentry->d_inode))
+    if (!inode_owner_or_capable(dentry->d_inode))
         return -EPERM;

     if (!IS_POSIXACL(dentry->d_inode))
@@ -178,16 +178,17 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,

     if (value) {
         acl = posix_acl_from_xattr(value, size);
-        if (acl == NULL) {
-            value = NULL;
-            size = 0;
+        if (acl) {
+            ret = posix_acl_valid(acl);
+            if (ret)
+                goto out;
         } else if (IS_ERR(acl)) {
             return PTR_ERR(acl);
         }
     }

     ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
-
+out:
     posix_acl_release(acl);

     return ret;
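
posix_acl_from_xattr() has a three-way contract that the rewritten branch above now honors fully: NULL means no ACL was supplied, an ERR_PTR-encoded value means the xattr failed to parse, and a real pointer is an ACL that must still pass posix_acl_valid() before it is applied. A minimal userspace sketch of that calling convention (parse_thing() and the simplified ERR_PTR macros here are illustrative stand-ins, not btrfs code):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO    4095
#define ERR_PTR(err) ((void *)(long)(err))
#define IS_ERR(ptr)  ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)
#define PTR_ERR(ptr) ((long)(ptr))

struct thing { int value; };

/* NULL: nothing present; ERR_PTR(-E...): parse failed; else: valid object */
static struct thing *parse_thing(const char *buf)
{
    struct thing *t;

    if (!buf)
        return NULL;
    if (buf[0] != 'v')
        return ERR_PTR(-EINVAL);
    t = malloc(sizeof(*t));
    if (!t)
        return ERR_PTR(-ENOMEM);
    t->value = 42;
    return t;
}

int main(void)
{
    struct thing *t = parse_thing("value");

    if (t == NULL) {
        puts("nothing to do");          /* absence is not an error */
    } else if (IS_ERR(t)) {
        printf("error: %ld\n", PTR_ERR(t));
        return 1;
    } else {
        printf("ok: %d\n", t->value);   /* still validate before use */
        free(t);
    }
    return 0;
}

The old code collapsed the NULL case into value = NULL / size = 0 and skipped validation of a successfully parsed ACL entirely, which is what this hunk fixes.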
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ccc991c542df..57c3bb2884ce 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -136,9 +136,8 @@ struct btrfs_inode {
136 * items we think we'll end up using, and reserved_extents is the number 136 * items we think we'll end up using, and reserved_extents is the number
137 * of extent items we've reserved metadata for. 137 * of extent items we've reserved metadata for.
138 */ 138 */
139 spinlock_t accounting_lock;
140 atomic_t outstanding_extents; 139 atomic_t outstanding_extents;
141 int reserved_extents; 140 atomic_t reserved_extents;
142 141
143 /* 142 /*
144 * ordered_data_close is set by truncate when a file that used 143 * ordered_data_close is set by truncate when a file that used
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 4d2110eafe29..41d1d7c70e29 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -340,6 +340,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,

     WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
     cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+    if (!cb)
+        return -ENOMEM;
     atomic_set(&cb->pending_bios, 0);
     cb->errors = 0;
     cb->inode = inode;
@@ -354,6 +356,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
     bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;

     bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+    if (!bio) {
+        kfree(cb);
+        return -ENOMEM;
+    }
     bio->bi_private = cb;
     bio->bi_end_io = end_compressed_bio_write;
     atomic_inc(&cb->pending_bios);
@@ -657,8 +663,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
             atomic_inc(&cb->pending_bios);

             if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-                btrfs_lookup_bio_sums(root, inode, comp_bio,
-                                      sums);
+                ret = btrfs_lookup_bio_sums(root, inode,
+                                            comp_bio, sums);
+                BUG_ON(ret);
             }
             sums += (comp_bio->bi_size + root->sectorsize - 1) /
                     root->sectorsize;
@@ -683,8 +690,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
     ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
     BUG_ON(ret);

-    if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
-        btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+    if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+        ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+        BUG_ON(ret);
+    }

     ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
     BUG_ON(ret);
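
The two new error paths in btrfs_submit_compressed_write() follow the usual C unwinding discipline: each allocation that can fail returns -ENOMEM, and a later failure must release everything acquired earlier, hence the kfree(cb) before the second return. A self-contained sketch of that pattern under assumed names (ctx_setup() is hypothetical, not btrfs API):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct ctx { int *meta; char *buf; };

/* Acquire two resources; on a late failure, release the earlier one. */
static int ctx_setup(struct ctx *c, size_t len)
{
    c->meta = malloc(sizeof(*c->meta));
    if (!c->meta)
        return -ENOMEM;                 /* nothing to unwind yet */

    c->buf = malloc(len);               /* plays the role of the bio */
    if (!c->buf) {
        free(c->meta);                  /* mirrors the kfree(cb) above */
        c->meta = NULL;
        return -ENOMEM;
    }
    return 0;
}

int main(void)
{
    struct ctx c;

    if (ctx_setup(&c, 4096))
        return 1;
    puts("both resources acquired");
    free(c.buf);
    free(c.meta);
    return 0;
}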
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b5baff0dccfe..84d7ca1fe0ba 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -147,10 +147,11 @@ noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 {
     struct extent_buffer *eb;
-    spin_lock(&root->node_lock);
-    eb = root->node;
+
+    rcu_read_lock();
+    eb = rcu_dereference(root->node);
     extent_buffer_get(eb);
-    spin_unlock(&root->node_lock);
+    rcu_read_unlock();
     return eb;
 }

@@ -165,14 +166,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
     while (1) {
         eb = btrfs_root_node(root);
         btrfs_tree_lock(eb);
-
-        spin_lock(&root->node_lock);
-        if (eb == root->node) {
-            spin_unlock(&root->node_lock);
+        if (eb == root->node)
             break;
-        }
-        spin_unlock(&root->node_lock);
-
         btrfs_tree_unlock(eb);
         free_extent_buffer(eb);
     }
@@ -458,10 +453,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
         else
             parent_start = 0;

-        spin_lock(&root->node_lock);
-        root->node = cow;
         extent_buffer_get(cow);
-        spin_unlock(&root->node_lock);
+        rcu_assign_pointer(root->node, cow);

         btrfs_free_tree_block(trans, root, buf, parent_start,
                               last_ref);
@@ -542,6 +535,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,

     ret = __btrfs_cow_block(trans, root, buf, parent,
                             parent_slot, cow_ret, search_start, 0);
+
+    trace_btrfs_cow_block(root, buf, *cow_ret);
+
     return ret;
 }

@@ -686,6 +682,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
         if (!cur) {
             cur = read_tree_block(root, blocknr,
                                   blocksize, gen);
+            if (!cur)
+                return -EIO;
         } else if (!uptodate) {
             btrfs_read_buffer(cur, gen);
         }
@@ -732,122 +730,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
     return btrfs_item_offset_nr(leaf, nr - 1);
 }

-/*
- * extra debugging checks to make sure all the items in a key are
- * well formed and in the proper order
- */
-static int check_node(struct btrfs_root *root, struct btrfs_path *path,
-                      int level)
-{
-    struct extent_buffer *parent = NULL;
-    struct extent_buffer *node = path->nodes[level];
-    struct btrfs_disk_key parent_key;
-    struct btrfs_disk_key node_key;
-    int parent_slot;
-    int slot;
-    struct btrfs_key cpukey;
-    u32 nritems = btrfs_header_nritems(node);
-
-    if (path->nodes[level + 1])
-        parent = path->nodes[level + 1];
-
-    slot = path->slots[level];
-    BUG_ON(nritems == 0);
-    if (parent) {
-        parent_slot = path->slots[level + 1];
-        btrfs_node_key(parent, &parent_key, parent_slot);
-        btrfs_node_key(node, &node_key, 0);
-        BUG_ON(memcmp(&parent_key, &node_key,
-                      sizeof(struct btrfs_disk_key)));
-        BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-               btrfs_header_bytenr(node));
-    }
-    BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
-    if (slot != 0) {
-        btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
-        btrfs_node_key(node, &node_key, slot);
-        BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
-    }
-    if (slot < nritems - 1) {
-        btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
-        btrfs_node_key(node, &node_key, slot);
-        BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
-    }
-    return 0;
-}
-
-/*
- * extra checking to make sure all the items in a leaf are
- * well formed and in the proper order
- */
-static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
-                      int level)
-{
-    struct extent_buffer *leaf = path->nodes[level];
-    struct extent_buffer *parent = NULL;
-    int parent_slot;
-    struct btrfs_key cpukey;
-    struct btrfs_disk_key parent_key;
-    struct btrfs_disk_key leaf_key;
-    int slot = path->slots[0];
-
-    u32 nritems = btrfs_header_nritems(leaf);
-
-    if (path->nodes[level + 1])
-        parent = path->nodes[level + 1];
-
-    if (nritems == 0)
-        return 0;
-
-    if (parent) {
-        parent_slot = path->slots[level + 1];
-        btrfs_node_key(parent, &parent_key, parent_slot);
-        btrfs_item_key(leaf, &leaf_key, 0);
-
-        BUG_ON(memcmp(&parent_key, &leaf_key,
-                      sizeof(struct btrfs_disk_key)));
-        BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-               btrfs_header_bytenr(leaf));
-    }
-    if (slot != 0 && slot < nritems - 1) {
-        btrfs_item_key(leaf, &leaf_key, slot);
-        btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
-        if (comp_keys(&leaf_key, &cpukey) <= 0) {
-            btrfs_print_leaf(root, leaf);
-            printk(KERN_CRIT "slot %d offset bad key\n", slot);
-            BUG_ON(1);
-        }
-        if (btrfs_item_offset_nr(leaf, slot - 1) !=
-            btrfs_item_end_nr(leaf, slot)) {
-            btrfs_print_leaf(root, leaf);
-            printk(KERN_CRIT "slot %d offset bad\n", slot);
-            BUG_ON(1);
-        }
-    }
-    if (slot < nritems - 1) {
-        btrfs_item_key(leaf, &leaf_key, slot);
-        btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
-        BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
-        if (btrfs_item_offset_nr(leaf, slot) !=
-            btrfs_item_end_nr(leaf, slot + 1)) {
-            btrfs_print_leaf(root, leaf);
-            printk(KERN_CRIT "slot %d offset bad\n", slot);
-            BUG_ON(1);
-        }
-    }
-    BUG_ON(btrfs_item_offset_nr(leaf, 0) +
-           btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
-    return 0;
-}
-
-static noinline int check_block(struct btrfs_root *root,
-                                struct btrfs_path *path, int level)
-{
-    return 0;
-    if (level == 0)
-        return check_leaf(root, path, level);
-    return check_node(root, path, level);
-}
-
 /*
  * search for key in the extent_buffer.  The items start at offset p,
@@ -1046,9 +928,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
             goto enospc;
         }

-        spin_lock(&root->node_lock);
-        root->node = child;
-        spin_unlock(&root->node_lock);
+        rcu_assign_pointer(root->node, child);

         add_root_to_dirty_list(root);
         btrfs_tree_unlock(child);
@@ -1188,7 +1068,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
         }
     }
     /* double check we haven't messed things up */
-    check_block(root, path, level);
     if (orig_ptr !=
         btrfs_node_blockptr(path->nodes[level], path->slots[level]))
         BUG();
@@ -1798,12 +1677,6 @@ cow_done:
         if (!cow)
             btrfs_unlock_up_safe(p, level + 1);

-        ret = check_block(root, p, level);
-        if (ret) {
-            ret = -1;
-            goto done;
-        }
-
         ret = bin_search(b, key, level, &slot);

         if (level != 0) {
@@ -2130,10 +2003,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,

     btrfs_mark_buffer_dirty(c);

-    spin_lock(&root->node_lock);
     old = root->node;
-    root->node = c;
-    spin_unlock(&root->node_lock);
+    rcu_assign_pointer(root->node, c);

     /* the super has an extra ref to root->node */
     free_extent_buffer(old);
@@ -3840,7 +3711,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
     unsigned long ptr;

     path = btrfs_alloc_path();
-    BUG_ON(!path);
+    if (!path)
+        return -ENOMEM;
     ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
     if (!ret) {
         leaf = path->nodes[0];
@@ -4217,6 +4089,7 @@ find_next_key:
         }
         btrfs_set_path_blocking(path);
         cur = read_node_slot(root, cur, slot);
+        BUG_ON(!cur);

         btrfs_tree_lock(cur);

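
The node_lock spinlock around root->node is replaced by RCU throughout this file: the writer fully initializes the new root and publishes it with rcu_assign_pointer(), while readers bracket their access with rcu_read_lock() and fetch the pointer via rcu_dereference(), taking a reference without ever blocking the writer. A rough userspace analogue of the publish/subscribe ordering, using C11 release/acquire in place of the kernel's RCU primitives (real RCU also defers freeing the old node, which this sketch does not model):

#include <stdatomic.h>
#include <stdio.h>

struct node { int height; };

static _Atomic(struct node *) root;

/* Writer: initialize the node fully, then publish with release so any
 * reader that observes the pointer also observes its contents. */
static void publish_root(struct node *n)
{
    atomic_store_explicit(&root, n, memory_order_release);
}

/* Reader: acquire pairs with the writer's release, standing in for
 * rcu_dereference() under rcu_read_lock(). */
static struct node *read_root(void)
{
    return atomic_load_explicit(&root, memory_order_acquire);
}

int main(void)
{
    static struct node level1 = { .height = 1 };

    publish_root(&level1);
    printf("root height = %d\n", read_root()->height);
    return 0;
}

btrfs_lock_root_node() still needs its retry loop: between reading the root and locking it, a writer may have published a new root, so the reader rechecks eb == root->node under the tree lock.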
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7f78cc78fdd0..2e61fe1b6b8c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -28,6 +28,7 @@
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <linux/kobject.h>
+#include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
@@ -40,6 +41,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
+extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;

 #define BTRFS_MAGIC "_BHRfS_M"
@@ -738,8 +740,10 @@ struct btrfs_space_info {
      */
     unsigned long reservation_progress;

-    int full;           /* indicates that we cannot allocate any more
+    int full:1;         /* indicates that we cannot allocate any more
                            chunks for this space */
+    int chunk_alloc:1;  /* set if we are allocating a chunk */
+
     int force_alloc;    /* set if we need to force a chunk alloc for
                            this space */

@@ -782,9 +786,6 @@ struct btrfs_free_cluster {
     /* first extent starting offset */
     u64 window_start;

-    /* if this cluster simply points at a bitmap in the block group */
-    bool points_to_bitmap;
-
     struct btrfs_block_group_cache *block_group;
     /*
      * when a cluster is allocated from a block group, we put the
@@ -1283,6 +1284,9 @@ struct btrfs_root {
 #define BTRFS_INODE_NODUMP      (1 << 8)
 #define BTRFS_INODE_NOATIME     (1 << 9)
 #define BTRFS_INODE_DIRSYNC     (1 << 10)
+#define BTRFS_INODE_COMPRESS    (1 << 11)
+
+#define BTRFS_INODE_ROOT_ITEM_INIT  (1 << 31)

 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
@@ -2157,6 +2161,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
                       u64 root_objectid, u64 owner, u64 offset);

 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+                                u64 num_bytes, int reserve, int sinfo);
 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2227,10 +2233,12 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
 int btrfs_error_unpin_extent_range(struct btrfs_root *root,
                                    u64 start, u64 end);
 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
-                               u64 num_bytes);
+                               u64 num_bytes, u64 *actual_bytes);
 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 type);
+int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);

+int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                      int level, int *slot);
@@ -2355,6 +2363,8 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 int btrfs_set_root_node(struct btrfs_root_item *item,
                         struct extent_buffer *node);
+void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
+
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, const char *name,
@@ -2392,6 +2402,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
                                           struct btrfs_path *path, u64 dir,
                                           const char *name, u16 name_len,
                                           int mod);
+int verify_dir_item(struct btrfs_root *root,
+                    struct extent_buffer *leaf,
+                    struct btrfs_dir_item *dir_item);

 /* orphan.c */
 int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -2528,7 +2541,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
                        struct inode *inode);
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
-void btrfs_orphan_cleanup(struct btrfs_root *root);
+int btrfs_orphan_cleanup(struct btrfs_root *root);
 void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
                                struct btrfs_pending_snapshot *pending,
                                u64 *bytes_to_reserve);
@@ -2536,7 +2549,7 @@ void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
                                 struct btrfs_pending_snapshot *pending);
 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root);
-int btrfs_cont_expand(struct inode *inode, loff_t size);
+int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
 int btrfs_invalidate_inodes(struct btrfs_root *root);
 void btrfs_add_delayed_iput(struct inode *inode);
 void btrfs_run_delayed_iputs(struct btrfs_root *root);
@@ -2565,6 +2578,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
                               struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
+void btrfs_drop_pages(struct page **pages, size_t num_pages);
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+                      struct page **pages, size_t num_pages,
+                      loff_t pos, size_t write_bytes,
+                      struct extent_state **cached);

 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e807b143b857..bce28f653899 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -483,6 +483,8 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
     INIT_LIST_HEAD(&head_ref->cluster);
     mutex_init(&head_ref->mutex);

+    trace_btrfs_delayed_ref_head(ref, head_ref, action);
+
     existing = tree_insert(&delayed_refs->root, &ref->rb_node);

     if (existing) {
@@ -537,6 +539,8 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
     }
     full_ref->level = level;

+    trace_btrfs_delayed_tree_ref(ref, full_ref, action);
+
     existing = tree_insert(&delayed_refs->root, &ref->rb_node);

     if (existing) {
@@ -591,6 +595,8 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
     full_ref->objectid = owner;
     full_ref->offset = offset;

+    trace_btrfs_delayed_data_ref(ref, full_ref, action);
+
     existing = tree_insert(&delayed_refs->root, &ref->rb_node);

     if (existing) {
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f0cad5ae5be7..c62f02f6ae69 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -151,7 +151,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
         ret = PTR_ERR(dir_item);
         if (ret == -EEXIST)
             goto second_insert;
-        goto out;
+        goto out_free;
     }

     leaf = path->nodes[0];
@@ -170,7 +170,7 @@ second_insert:
     /* FIXME, use some real flag for selecting the extra index */
     if (root == root->fs_info->tree_root) {
         ret = 0;
-        goto out;
+        goto out_free;
     }
     btrfs_release_path(root, path);

@@ -180,7 +180,7 @@ second_insert:
                                           name, name_len);
     if (IS_ERR(dir_item)) {
         ret2 = PTR_ERR(dir_item);
-        goto out;
+        goto out_free;
     }
     leaf = path->nodes[0];
     btrfs_cpu_key_to_disk(&disk_key, location);
@@ -192,7 +192,9 @@ second_insert:
     name_ptr = (unsigned long)(dir_item + 1);
     write_extent_buffer(leaf, name, name_ptr, name_len);
     btrfs_mark_buffer_dirty(leaf);
-out:
+
+out_free:
+
     btrfs_free_path(path);
     if (ret)
         return ret;
@@ -377,6 +379,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,

     leaf = path->nodes[0];
     dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+    if (verify_dir_item(root, leaf, dir_item))
+        return NULL;
+
     total_len = btrfs_item_size_nr(leaf, path->slots[0]);
     while (cur < total_len) {
         this_len = sizeof(*dir_item) +
@@ -429,3 +434,35 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
     }
     return ret;
 }
+
+int verify_dir_item(struct btrfs_root *root,
+                    struct extent_buffer *leaf,
+                    struct btrfs_dir_item *dir_item)
+{
+    u16 namelen = BTRFS_NAME_LEN;
+    u8 type = btrfs_dir_type(leaf, dir_item);
+
+    if (type >= BTRFS_FT_MAX) {
+        printk(KERN_CRIT "btrfs: invalid dir item type: %d\n",
+               (int)type);
+        return 1;
+    }
+
+    if (type == BTRFS_FT_XATTR)
+        namelen = XATTR_NAME_MAX;
+
+    if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
+        printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n",
+               (unsigned)btrfs_dir_data_len(leaf, dir_item));
+        return 1;
+    }
+
+    /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
+    if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) {
+        printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n",
+               (unsigned)btrfs_dir_data_len(leaf, dir_item));
+        return 1;
+    }
+
+    return 0;
+}
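
verify_dir_item() treats every length field read from disk as untrusted input: the type must be a known value, the name length must fit the per-type maximum, and the data length must fit within the hard limit for the block size. A compact userspace sketch of the same bounds-checking idea (the record layout and limits below are invented for illustration, not the btrfs on-disk format):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Invented on-disk record: every length field is untrusted. */
struct disk_record {
    uint8_t  type;
    uint16_t name_len;
    char     name[];
};

enum { TYPE_MAX = 8, NAME_MAX_LEN = 255 };

static int verify_record(const struct disk_record *rec, size_t item_size)
{
    if (rec->type >= TYPE_MAX)
        return 1;                       /* unknown type */
    if (rec->name_len > NAME_MAX_LEN)
        return 1;                       /* absurd name length */
    if (sizeof(*rec) + rec->name_len > item_size)
        return 1;                       /* name would run past the item */
    return 0;
}

int main(void)
{
    unsigned char buf[64] = { 0 };
    struct disk_record *rec = (struct disk_record *)buf;

    rec->type = 1;
    rec->name_len = 200;    /* larger than the 64-byte item: rejected */
    puts(verify_record(rec, sizeof(buf)) ? "corrupt" : "ok");
    return 0;
}

The key property is that validation happens before anything dereferences name[], so a corrupted leaf can at worst be reported, never walked past its end.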
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 100b07f021b4..68c84c8c24bd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,6 +29,7 @@
 #include <linux/crc32c.h>
 #include <linux/slab.h>
 #include <linux/migrate.h>
+#include <asm/unaligned.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -198,7 +199,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)

 void btrfs_csum_final(u32 crc, char *result)
 {
-    *(__le32 *)result = ~cpu_to_le32(crc);
+    put_unaligned_le32(~crc, result);
 }

 /*
@@ -323,6 +324,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
     int num_copies = 0;
     int mirror_num = 0;

+    clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
     io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
     while (1) {
         ret = read_extent_buffer_pages(io_tree, eb, start, 1,
@@ -331,6 +333,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
             !verify_parent_transid(io_tree, eb, parent_transid))
             return ret;

+        /*
+         * This buffer's crc is fine, but its contents are corrupted, so
+         * there is no reason to read the other copies, they won't be
+         * any less wrong.
+         */
+        if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+            return ret;
+
         num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
                                       eb->start, eb->len);
         if (num_copies == 1)
@@ -419,6 +429,73 @@ static int check_tree_block_fsid(struct btrfs_root *root,
     return ret;
 }

+#define CORRUPT(reason, eb, root, slot)                             \
+    printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu,"         \
+           "root=%llu, slot=%d\n", reason,                          \
+           (unsigned long long)btrfs_header_bytenr(eb),             \
+           (unsigned long long)root->objectid, slot)
+
+static noinline int check_leaf(struct btrfs_root *root,
+                               struct extent_buffer *leaf)
+{
+    struct btrfs_key key;
+    struct btrfs_key leaf_key;
+    u32 nritems = btrfs_header_nritems(leaf);
+    int slot;
+
+    if (nritems == 0)
+        return 0;
+
+    /* Check the 0 item */
+    if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+        BTRFS_LEAF_DATA_SIZE(root)) {
+        CORRUPT("invalid item offset size pair", leaf, root, 0);
+        return -EIO;
+    }
+
+    /*
+     * Check to make sure each items keys are in the correct order and their
+     * offsets make sense.  We only have to loop through nritems-1 because
+     * we check the current slot against the next slot, which verifies the
+     * next slot's offset+size makes sense and that the current's slot
+     * offset is correct.
+     */
+    for (slot = 0; slot < nritems - 1; slot++) {
+        btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
+        btrfs_item_key_to_cpu(leaf, &key, slot + 1);
+
+        /* Make sure the keys are in the right order */
+        if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
+            CORRUPT("bad key order", leaf, root, slot);
+            return -EIO;
+        }
+
+        /*
+         * Make sure the offset and ends are right, remember that the
+         * item data starts at the end of the leaf and grows towards the
+         * front.
+         */
+        if (btrfs_item_offset_nr(leaf, slot) !=
+            btrfs_item_end_nr(leaf, slot + 1)) {
+            CORRUPT("slot offset bad", leaf, root, slot);
+            return -EIO;
+        }
+
+        /*
+         * Check to make sure that we don't point outside of the leaf,
+         * just in case all the items are consistent to each other, but
+         * all point outside of the leaf.
+         */
+        if (btrfs_item_end_nr(leaf, slot) >
+            BTRFS_LEAF_DATA_SIZE(root)) {
+            CORRUPT("slot end outside of leaf", leaf, root, slot);
+            return -EIO;
+        }
+    }
+
+    return 0;
+}
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
 {
@@ -485,8 +562,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
     btrfs_set_buffer_lockdep_class(eb, found_level);

     ret = csum_tree_block(root, eb, 1);
-    if (ret)
+    if (ret) {
+        ret = -EIO;
+        goto err;
+    }
+
+    /*
+     * If this is a leaf block and it is corrupt, set the corrupt bit so
+     * that we don't try and read the other copies of this block, just
+     * return -EIO.
+     */
+    if (found_level == 0 && check_leaf(root, eb)) {
+        set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
         ret = -EIO;
+    }

     end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
     end = eb->start + end - 1;
@@ -847,7 +936,6 @@ static const struct address_space_operations btree_aops = {
     .writepages     = btree_writepages,
     .releasepage    = btree_releasepage,
     .invalidatepage = btree_invalidatepage,
-    .sync_page      = block_sync_page,
 #ifdef CONFIG_MIGRATION
     .migratepage    = btree_migratepage,
 #endif
@@ -1160,7 +1248,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
                  root, fs_info, location->objectid);

     path = btrfs_alloc_path();
-    BUG_ON(!path);
+    if (!path) {
+        kfree(root);
+        return ERR_PTR(-ENOMEM);
+    }
     ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
     if (ret == 0) {
         l = path->nodes[0];
@@ -1184,8 +1275,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
     root->commit_root = btrfs_root_node(root);
     BUG_ON(!root->node);
 out:
-    if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
+    if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
         root->ref_cows = 1;
+        btrfs_check_and_init_root_item(&root->root_item);
+    }

     return root;
 }
@@ -1331,82 +1424,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 }

 /*
- * this unplugs every device on the box, and it is only used when page
- * is null
- */
-static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-    struct btrfs_device *device;
-    struct btrfs_fs_info *info;
-
-    info = (struct btrfs_fs_info *)bdi->unplug_io_data;
-    list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
-        if (!device->bdev)
-            continue;
-
-        bdi = blk_get_backing_dev_info(device->bdev);
-        if (bdi->unplug_io_fn)
-            bdi->unplug_io_fn(bdi, page);
-    }
-}
-
-static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-    struct inode *inode;
-    struct extent_map_tree *em_tree;
-    struct extent_map *em;
-    struct address_space *mapping;
-    u64 offset;
-
-    /* the generic O_DIRECT read code does this */
-    if (1 || !page) {
-        __unplug_io_fn(bdi, page);
-        return;
-    }
-
-    /*
-     * page->mapping may change at any time.  Get a consistent copy
-     * and use that for everything below
-     */
-    smp_mb();
-    mapping = page->mapping;
-    if (!mapping)
-        return;
-
-    inode = mapping->host;
-
-    /*
-     * don't do the expensive searching for a small number of
-     * devices
-     */
-    if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
-        __unplug_io_fn(bdi, page);
-        return;
-    }
-
-    offset = page_offset(page);
-
-    em_tree = &BTRFS_I(inode)->extent_tree;
-    read_lock(&em_tree->lock);
-    em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
-    read_unlock(&em_tree->lock);
-    if (!em) {
-        __unplug_io_fn(bdi, page);
-        return;
-    }
-
-    if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-        free_extent_map(em);
-        __unplug_io_fn(bdi, page);
-        return;
-    }
-    offset = offset - em->start;
-    btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
-                      em->block_start + offset, page);
-    free_extent_map(em);
-}
-
-/*
  * If this fails, caller must call bdi_destroy() to get rid of the
  * bdi again.
  */
@@ -1420,8 +1437,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
         return err;

     bdi->ra_pages = default_backing_dev_info.ra_pages;
-    bdi->unplug_io_fn = btrfs_unplug_io_fn;
-    bdi->unplug_io_data = info;
     bdi->congested_fn = btrfs_congested_fn;
     bdi->congested_data = info;
     return 0;
@@ -1632,6 +1647,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         goto fail_bdi;
     }

+    fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
+
     INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
     INIT_LIST_HEAD(&fs_info->trans_list);
     INIT_LIST_HEAD(&fs_info->dead_roots);
@@ -1762,6 +1779,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,

     btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);

+    /*
+     * In the long term, we'll store the compression type in the super
+     * block, and it'll be used for per file compression control.
+     */
+    fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
     ret = btrfs_parse_options(tree_root, options);
     if (ret) {
         err = ret;
@@ -1967,6 +1990,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     fs_info->metadata_alloc_profile = (u64)-1;
     fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;

+    ret = btrfs_init_space_info(fs_info);
+    if (ret) {
+        printk(KERN_ERR "Failed to initialize space info: %d\n", ret);
+        goto fail_block_groups;
+    }
+
     ret = btrfs_read_block_groups(extent_root);
     if (ret) {
         printk(KERN_ERR "Failed to read block groups: %d\n", ret);
@@ -2058,9 +2087,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,

     if (!(sb->s_flags & MS_RDONLY)) {
         down_read(&fs_info->cleanup_work_sem);
-        btrfs_orphan_cleanup(fs_info->fs_root);
-        btrfs_orphan_cleanup(fs_info->tree_root);
+        err = btrfs_orphan_cleanup(fs_info->fs_root);
+        if (!err)
+            err = btrfs_orphan_cleanup(fs_info->tree_root);
         up_read(&fs_info->cleanup_work_sem);
+        if (err) {
+            close_ctree(tree_root);
+            return ERR_PTR(err);
+        }
     }

     return tree_root;
@@ -2435,8 +2469,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)

         root_objectid = gang[ret - 1]->root_key.objectid + 1;
         for (i = 0; i < ret; i++) {
+            int err;
+
             root_objectid = gang[i]->root_key.objectid;
-            btrfs_orphan_cleanup(gang[i]);
+            err = btrfs_orphan_cleanup(gang[i]);
+            if (err)
+                return err;
         }
         root_objectid++;
     }
@@ -2947,7 +2985,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
             break;

         /* opt_discard */
-        ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+        if (btrfs_test_opt(root, DISCARD))
+            ret = btrfs_error_discard_extent(root, start,
+                                             end + 1 - start,
+                                             NULL);

         clear_extent_dirty(unpin, start, end, GFP_NOFS);
         btrfs_error_unpin_extent_range(root, start, end);
@@ -3016,7 +3057,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
         btrfs_destroy_pinned_extent(root,
                                     root->fs_info->pinned_extents);

-        t->use_count = 0;
+        atomic_set(&t->use_count, 0);
         list_del_init(&t->list);
         memset(t, 0, sizeof(*t));
         kmem_cache_free(btrfs_transaction_cachep, t);
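
The new check_leaf() in this file encodes the leaf geometry invariant: item data is packed at the tail of the block and grows toward the front, so item 0 must end exactly at BTRFS_LEAF_DATA_SIZE, each slot's data must start where the next slot's data ends, and keys must strictly increase. A simplified standalone model of those three checks (integer keys and made-up sizes, not the real on-disk format):

#include <stdio.h>

enum { LEAF_DATA_SIZE = 100 };

struct item { int key; unsigned int offset, size; };

static unsigned int item_end(const struct item *it)
{
    return it->offset + it->size;
}

/* Returns 0 when the items tile the data area from the tail with
 * strictly increasing keys, mimicking check_leaf() above. */
static int leaf_ok(const struct item *items, int nritems)
{
    int slot;

    if (nritems == 0)
        return 0;
    if (item_end(&items[0]) != LEAF_DATA_SIZE)
        return -1;                      /* slot 0 must touch the end */
    for (slot = 0; slot < nritems - 1; slot++) {
        if (items[slot].key >= items[slot + 1].key)
            return -1;                  /* keys out of order */
        if (items[slot].offset != item_end(&items[slot + 1]))
            return -1;                  /* hole or overlap in item data */
        if (item_end(&items[slot]) > LEAF_DATA_SIZE)
            return -1;                  /* data points past the leaf */
    }
    return 0;
}

int main(void)
{
    /* key 1 owns bytes [80,100), key 5 owns [50,80) */
    struct item good[] = { { 1, 80, 20 }, { 5, 50, 30 } };
    struct item bad[]  = { { 1, 80, 20 }, { 5, 40, 30 } };  /* hole at [70,80) */

    printf("good=%d bad=%d\n", leaf_ok(good, 2), leaf_ok(bad, 2));
    return 0;
}

Pairing this with the EXTENT_BUFFER_CORRUPT bit is what lets the read path give up early: a leaf whose checksum is fine but whose layout is inconsistent will be equally wrong on every mirror, so retrying other copies is pointless.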
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7b3089b5c2df..31f33ba56fe8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,11 +33,28 @@
 #include "locking.h"
 #include "free-space-cache.h"

+/* control flags for do_chunk_alloc's force field
+ * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
+ * if we really need one.
+ *
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
+ * CHUNK_ALLOC_LIMITED means to only try and allocate one
+ * if we have very few chunks already allocated.  This is
+ * used as part of the clustering code to help make sure
+ * we have a good pool of storage to cluster in, without
+ * filling the FS with empty chunks
+ *
+ */
+enum {
+    CHUNK_ALLOC_NO_FORCE = 0,
+    CHUNK_ALLOC_FORCE = 1,
+    CHUNK_ALLOC_LIMITED = 2,
+};
+
 static int update_block_group(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               u64 bytenr, u64 num_bytes, int alloc);
-static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
-                                 u64 num_bytes, int reserve, int sinfo);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
@@ -442,7 +459,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
      * allocate blocks for the tree root we can't do the fast caching since
      * we likely hold important locks.
      */
-    if (!trans->transaction->in_commit &&
+    if (trans && (!trans->transaction->in_commit) &&
         (root && root != root->fs_info->tree_root)) {
         spin_lock(&cache->lock);
         if (cache->cached != BTRFS_CACHE_NO) {
@@ -471,7 +488,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
     if (load_cache_only)
         return 0;

-    caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
+    caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
     BUG_ON(!caching_ctl);

     INIT_LIST_HEAD(&caching_ctl->list);
@@ -1740,39 +1757,45 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
     return ret;
 }

-static void btrfs_issue_discard(struct block_device *bdev,
+static int btrfs_issue_discard(struct block_device *bdev,
                                 u64 start, u64 len)
 {
-    blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
+    return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
 }

 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
-                                u64 num_bytes)
+                                u64 num_bytes, u64 *actual_bytes)
 {
     int ret;
-    u64 map_length = num_bytes;
+    u64 discarded_bytes = 0;
     struct btrfs_multi_bio *multi = NULL;

-    if (!btrfs_test_opt(root, DISCARD))
-        return 0;

     /* Tell the block device(s) that the sectors can be discarded */
-    ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-                          bytenr, &map_length, &multi, 0);
+    ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
+                          bytenr, &num_bytes, &multi, 0);
     if (!ret) {
         struct btrfs_bio_stripe *stripe = multi->stripes;
         int i;

-        if (map_length > num_bytes)
-            map_length = num_bytes;

         for (i = 0; i < multi->num_stripes; i++, stripe++) {
-            btrfs_issue_discard(stripe->dev->bdev,
-                                stripe->physical,
-                                map_length);
+            ret = btrfs_issue_discard(stripe->dev->bdev,
+                                      stripe->physical,
+                                      stripe->length);
+            if (!ret)
+                discarded_bytes += stripe->length;
+            else if (ret != -EOPNOTSUPP)
+                break;
         }
         kfree(multi);
     }
+    if (discarded_bytes && ret == -EOPNOTSUPP)
+        ret = 0;
+
+    if (actual_bytes)
+        *actual_bytes = discarded_bytes;
+

     return ret;
 }
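
The reworked discard path accumulates per-stripe results instead of ignoring them: a stripe that trims successfully adds to discarded_bytes, -EOPNOTSUPP from one device is tolerated as long as something was trimmed, and any other error aborts the loop. A sketch of that partial-success accounting (trim_range() is an invented stand-in for blkdev_issue_discard()):

#include <errno.h>
#include <stdio.h>

/* Hypothetical device list; device 1 does not support discard. */
static int trim_range(int dev, unsigned long long len)
{
    (void)len;
    return dev == 1 ? -EOPNOTSUPP : 0;
}

static int discard_stripes(const unsigned long long *len, int n,
                           unsigned long long *actual_bytes)
{
    unsigned long long discarded = 0;
    int ret = 0;

    for (int i = 0; i < n; i++) {
        ret = trim_range(i, len[i]);
        if (!ret)
            discarded += len[i];
        else if (ret != -EOPNOTSUPP)
            break;                      /* real error: stop */
    }
    /* some stripes worked: don't report "unsupported" overall */
    if (discarded && ret == -EOPNOTSUPP)
        ret = 0;
    if (actual_bytes)
        *actual_bytes = discarded;
    return ret;
}

int main(void)
{
    unsigned long long lens[] = { 4096, 4096, 8192 }, done;
    int ret = discard_stripes(lens, 3, &done);

    printf("ret=%d discarded=%llu\n", ret, done);   /* ret=0, 12288 */
    return 0;
}

Reporting actual bytes via *actual_bytes is what allows the new btrfs_trim_fs()/FITRIM path to tell userspace how much was really trimmed.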
@@ -3015,7 +3038,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
     found->bytes_readonly = 0;
     found->bytes_may_use = 0;
     found->full = 0;
-    found->force_alloc = 0;
+    found->force_alloc = CHUNK_ALLOC_NO_FORCE;
+    found->chunk_alloc = 0;
     *space_info = found;
     list_add_rcu(&found->list, &info->space_info);
     atomic_set(&found->caching_threads, 0);
@@ -3146,7 +3170,7 @@ again:
     if (!data_sinfo->full && alloc_chunk) {
         u64 alloc_target;

-        data_sinfo->force_alloc = 1;
+        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
         spin_unlock(&data_sinfo->lock);
 alloc:
         alloc_target = btrfs_get_alloc_profile(root, 1);
@@ -3156,7 +3180,8 @@ alloc:

         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
                              bytes + 2 * 1024 * 1024,
-                             alloc_target, 0);
+                             alloc_target,
+                             CHUNK_ALLOC_NO_FORCE);
         btrfs_end_transaction(trans, root);
         if (ret < 0) {
             if (ret != -ENOSPC)
@@ -3235,31 +3260,56 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
     rcu_read_lock();
     list_for_each_entry_rcu(found, head, list) {
         if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
-            found->force_alloc = 1;
+            found->force_alloc = CHUNK_ALLOC_FORCE;
     }
     rcu_read_unlock();
 }

 static int should_alloc_chunk(struct btrfs_root *root,
-                              struct btrfs_space_info *sinfo, u64 alloc_bytes)
+                              struct btrfs_space_info *sinfo, u64 alloc_bytes,
+                              int force)
 {
     u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+    u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
     u64 thresh;

-    if (sinfo->bytes_used + sinfo->bytes_reserved +
-        alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+    if (force == CHUNK_ALLOC_FORCE)
+        return 1;
+
+    /*
+     * in limited mode, we want to have some free space up to
+     * about 1% of the FS size.
+     */
+    if (force == CHUNK_ALLOC_LIMITED) {
+        thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+        thresh = max_t(u64, 64 * 1024 * 1024,
+                       div_factor_fine(thresh, 1));
+
+        if (num_bytes - num_allocated < thresh)
+            return 1;
+    }
+
+    /*
+     * we have two similar checks here, one based on percentage
+     * and one based on a hard number of 256MB.  The idea
+     * is that if we have a good amount of free
+     * room, don't allocate a chunk.  A good amount is
+     * less than 80% utilized of the chunks we have allocated,
+     * or more than 256MB free
+     */
+    if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
         return 0;

-    if (sinfo->bytes_used + sinfo->bytes_reserved +
-        alloc_bytes < div_factor(num_bytes, 8))
+    if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
         return 0;

     thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+
+    /* 256MB or 5% of the FS */
     thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));

     if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
         return 0;
-
     return 1;
 }

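
Read as a decision procedure, should_alloc_chunk() now distinguishes three modes: FORCE always allocates; LIMITED allocates while free space in this profile is below max(64MB, about 1% of the filesystem); otherwise a chunk is allocated only when less than 256MB would remain free and the existing chunks are at least 80% utilized. A userspace distillation of that logic (sizes in bytes; the final 5%-of-FS small-filesystem check is omitted for brevity, and div_factor(x, 8) is modeled directly as 80%):

#include <stdio.h>

enum { ALLOC_NO_FORCE, ALLOC_FORCE, ALLOC_LIMITED };

static int should_alloc_chunk(unsigned long long fs_bytes,
                              unsigned long long num_bytes,     /* space in profile */
                              unsigned long long num_allocated, /* used + reserved */
                              unsigned long long alloc_bytes, int force)
{
    unsigned long long thresh;

    if (force == ALLOC_FORCE)
        return 1;

    if (force == ALLOC_LIMITED) {
        thresh = fs_bytes / 100;                /* ~1% of the FS */
        if (thresh < 64ULL << 20)
            thresh = 64ULL << 20;               /* floor of 64MB */
        if (num_bytes - num_allocated < thresh)
            return 1;
    }

    /* plenty of room left (>256MB free or <80% utilized): skip */
    if (num_allocated + alloc_bytes + (256ULL << 20) < num_bytes)
        return 0;
    if (num_allocated + alloc_bytes < num_bytes * 8 / 10)
        return 0;

    return 1;
}

int main(void)
{
    /* 1TB FS, 1GB profile, 900MB allocated: tight enough to allocate */
    printf("%d\n", should_alloc_chunk(1ULL << 40, 1ULL << 30,
                                      900ULL << 20, 1ULL << 20,
                                      ALLOC_NO_FORCE));
    return 0;
}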
@@ -3269,10 +3319,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 {
     struct btrfs_space_info *space_info;
     struct btrfs_fs_info *fs_info = extent_root->fs_info;
+    int wait_for_alloc = 0;
     int ret = 0;

-    mutex_lock(&fs_info->chunk_mutex);
-
     flags = btrfs_reduce_alloc_profile(extent_root, flags);

     space_info = __find_space_info(extent_root->fs_info, flags);
@@ -3283,21 +3332,40 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
     }
     BUG_ON(!space_info);

+again:
     spin_lock(&space_info->lock);
     if (space_info->force_alloc)
-        force = 1;
+        force = space_info->force_alloc;
     if (space_info->full) {
         spin_unlock(&space_info->lock);
-        goto out;
+        return 0;
     }

-    if (!force && !should_alloc_chunk(extent_root, space_info,
-                                      alloc_bytes)) {
+    if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
         spin_unlock(&space_info->lock);
-        goto out;
+        return 0;
+    } else if (space_info->chunk_alloc) {
+        wait_for_alloc = 1;
+    } else {
+        space_info->chunk_alloc = 1;
     }
+
     spin_unlock(&space_info->lock);

+    mutex_lock(&fs_info->chunk_mutex);
+
+    /*
+     * The chunk_mutex is held throughout the entirety of a chunk
+     * allocation, so once we've acquired the chunk_mutex we know that the
+     * other guy is done and we need to recheck and see if we should
+     * allocate.
+     */
+    if (wait_for_alloc) {
+        mutex_unlock(&fs_info->chunk_mutex);
+        wait_for_alloc = 0;
+        goto again;
+    }
+
     /*
      * If we have mixed data/metadata chunks we want to make sure we keep
      * allocating mixed chunks instead of individual chunks.
@@ -3323,9 +3391,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
         space_info->full = 1;
     else
         ret = 1;
-    space_info->force_alloc = 0;
+
+    space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+    space_info->chunk_alloc = 0;
     spin_unlock(&space_info->lock);
-out:
     mutex_unlock(&extent_root->fs_info->chunk_mutex);
     return ret;
 }
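
The locking protocol above is a single-flight pattern: the space_info spinlock guards a chunk_alloc flag, exactly one caller sets it and performs the allocation under chunk_mutex, and latecomers block on the mutex until the winner finishes, then loop back and re-evaluate, since the winner may have marked the space full. A pthread model of the same protocol, with invented names and the allocation reduced to a printf:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;  /* the spinlock */
static pthread_mutex_t alloc_mutex = PTHREAD_MUTEX_INITIALIZER; /* chunk_mutex */
static int full, chunk_alloc;

static void *alloc_chunk(void *arg)
{
again:
    pthread_mutex_lock(&state_lock);
    if (full) {                         /* someone already filled it */
        pthread_mutex_unlock(&state_lock);
        return NULL;
    }
    if (chunk_alloc) {                  /* allocation in flight */
        pthread_mutex_unlock(&state_lock);
        pthread_mutex_lock(&alloc_mutex);   /* wait for it to finish */
        pthread_mutex_unlock(&alloc_mutex);
        goto again;                     /* then recheck the state */
    }
    chunk_alloc = 1;
    pthread_mutex_unlock(&state_lock);

    pthread_mutex_lock(&alloc_mutex);
    printf("thread %ld allocates the chunk\n", (long)arg);
    pthread_mutex_lock(&state_lock);
    full = 1;
    chunk_alloc = 0;
    pthread_mutex_unlock(&state_lock);
    pthread_mutex_unlock(&alloc_mutex);
    return NULL;
}

int main(void)
{
    pthread_t t[4];

    for (long i = 0; i < 4; i++)
        pthread_create(&t[i], NULL, alloc_chunk, (void *)i);
    for (int i = 0; i < 4; i++)
        pthread_join(t[i], NULL);
    return 0;       /* exactly one "allocates the chunk" line prints */
}

As in the kernel version, a waiter can briefly grab the mutex before the winner does; the goto-again recheck is what makes that race benign.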
@@ -3996,6 +4065,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3996 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4065 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3997 u64 to_reserve; 4066 u64 to_reserve;
3998 int nr_extents; 4067 int nr_extents;
4068 int reserved_extents;
3999 int ret; 4069 int ret;
4000 4070
4001 if (btrfs_transaction_in_commit(root->fs_info)) 4071 if (btrfs_transaction_in_commit(root->fs_info))
@@ -4003,25 +4073,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4003 4073
4004 num_bytes = ALIGN(num_bytes, root->sectorsize); 4074 num_bytes = ALIGN(num_bytes, root->sectorsize);
4005 4075
4006 spin_lock(&BTRFS_I(inode)->accounting_lock);
4007 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; 4076 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
4008 if (nr_extents > BTRFS_I(inode)->reserved_extents) { 4077 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
4009 nr_extents -= BTRFS_I(inode)->reserved_extents; 4078
4079 if (nr_extents > reserved_extents) {
4080 nr_extents -= reserved_extents;
4010 to_reserve = calc_trans_metadata_size(root, nr_extents); 4081 to_reserve = calc_trans_metadata_size(root, nr_extents);
4011 } else { 4082 } else {
4012 nr_extents = 0; 4083 nr_extents = 0;
4013 to_reserve = 0; 4084 to_reserve = 0;
4014 } 4085 }
4015 spin_unlock(&BTRFS_I(inode)->accounting_lock); 4086
4016 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4087 to_reserve += calc_csum_metadata_size(inode, num_bytes);
4017 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4088 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4018 if (ret) 4089 if (ret)
4019 return ret; 4090 return ret;
4020 4091
4021 spin_lock(&BTRFS_I(inode)->accounting_lock); 4092 atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
4022 BTRFS_I(inode)->reserved_extents += nr_extents;
4023 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 4093 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
4024 spin_unlock(&BTRFS_I(inode)->accounting_lock);
4025 4094
4026 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4095 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4027 4096
@@ -4036,20 +4105,30 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4036 struct btrfs_root *root = BTRFS_I(inode)->root; 4105 struct btrfs_root *root = BTRFS_I(inode)->root;
4037 u64 to_free; 4106 u64 to_free;
4038 int nr_extents; 4107 int nr_extents;
4108 int reserved_extents;
4039 4109
4040 num_bytes = ALIGN(num_bytes, root->sectorsize); 4110 num_bytes = ALIGN(num_bytes, root->sectorsize);
4041 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 4111 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
4042 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0); 4112 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
4043 4113
4044 spin_lock(&BTRFS_I(inode)->accounting_lock); 4114 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
4045 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); 4115 do {
4046 if (nr_extents < BTRFS_I(inode)->reserved_extents) { 4116 int old, new;
4047 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; 4117
4048 BTRFS_I(inode)->reserved_extents -= nr_extents; 4118 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
4049 } else { 4119 if (nr_extents >= reserved_extents) {
4050 nr_extents = 0; 4120 nr_extents = 0;
4051 } 4121 break;
4052 spin_unlock(&BTRFS_I(inode)->accounting_lock); 4122 }
4123 old = reserved_extents;
4124 nr_extents = reserved_extents - nr_extents;
4125 new = reserved_extents - nr_extents;
4126 old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
4127 reserved_extents, new);
4128 if (likely(old == reserved_extents))
4129 break;
4130 reserved_extents = old;
4131 } while (1);
4053 4132
4054 to_free = calc_csum_metadata_size(inode, num_bytes); 4133 to_free = calc_csum_metadata_size(inode, num_bytes);
4055 if (nr_extents > 0) 4134 if (nr_extents > 0)
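
The reserved_extents bookkeeping drops the per-inode spinlock in favour of a
cmpxchg retry loop: compute how far reserved exceeds outstanding, try to swing
reserved down in one atomic step, and start over if somebody raced in between.
The same shape in portable C11 atomics; this is a standalone model, not the
kernel's atomic_t API, and the initial counts are made up:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int outstanding_extents = 3;
static atomic_int reserved_extents = 5;

static int release_excess(void)
{
        int reserved = atomic_load(&reserved_extents);

        for (;;) {
                int outstanding = atomic_load(&outstanding_extents);
                int to_free;

                if (outstanding >= reserved)
                        return 0;               /* nothing over-reserved */

                to_free = reserved - outstanding;
                /* try to move reserved down to outstanding in one shot;
                 * on failure, 'reserved' is refreshed with the current
                 * value and we go around again */
                if (atomic_compare_exchange_strong(&reserved_extents,
                                                   &reserved,
                                                   reserved - to_free))
                        return to_free;
        }
}

int main(void)
{
        printf("freed %d extents worth of reservation\n", release_excess());
        return 0;
}
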
@@ -4223,8 +4302,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
4223 * update size of reserved extents. this function may return -EAGAIN 4302 * update size of reserved extents. this function may return -EAGAIN
4224 * if 'reserve' is true or 'sinfo' is false. 4303 * if 'reserve' is true or 'sinfo' is false.
4225 */ 4304 */
4226static int update_reserved_bytes(struct btrfs_block_group_cache *cache, 4305int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4227 u64 num_bytes, int reserve, int sinfo) 4306 u64 num_bytes, int reserve, int sinfo)
4228{ 4307{
4229 int ret = 0; 4308 int ret = 0;
4230 if (sinfo) { 4309 if (sinfo) {
@@ -4363,7 +4442,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4363 if (ret) 4442 if (ret)
4364 break; 4443 break;
4365 4444
4366 ret = btrfs_discard_extent(root, start, end + 1 - start); 4445 if (btrfs_test_opt(root, DISCARD))
4446 ret = btrfs_discard_extent(root, start,
4447 end + 1 - start, NULL);
4367 4448
4368 clear_extent_dirty(unpin, start, end, GFP_NOFS); 4449 clear_extent_dirty(unpin, start, end, GFP_NOFS);
4369 unpin_extent_range(root, start, end); 4450 unpin_extent_range(root, start, end);
@@ -4704,10 +4785,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4704 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4785 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4705 4786
4706 btrfs_add_free_space(cache, buf->start, buf->len); 4787 btrfs_add_free_space(cache, buf->start, buf->len);
4707 ret = update_reserved_bytes(cache, buf->len, 0, 0); 4788 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0);
4708 if (ret == -EAGAIN) { 4789 if (ret == -EAGAIN) {
4709 /* block group became read-only */ 4790 /* block group became read-only */
4710 update_reserved_bytes(cache, buf->len, 0, 1); 4791 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4711 goto out; 4792 goto out;
4712 } 4793 }
4713 4794
@@ -4744,6 +4825,11 @@ pin:
4744 } 4825 }
4745 } 4826 }
4746out: 4827out:
4828 /*
 4829 * We're deleting the buffer, so clear the corrupt flag; it no longer
 4830 * matters.
4831 */
4832 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
4747 btrfs_put_block_group(cache); 4833 btrfs_put_block_group(cache);
4748} 4834}
4749 4835
@@ -5191,7 +5277,7 @@ checks:
5191 search_start - offset); 5277 search_start - offset);
5192 BUG_ON(offset > search_start); 5278 BUG_ON(offset > search_start);
5193 5279
5194 ret = update_reserved_bytes(block_group, num_bytes, 1, 5280 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
5195 (data & BTRFS_BLOCK_GROUP_DATA)); 5281 (data & BTRFS_BLOCK_GROUP_DATA));
5196 if (ret == -EAGAIN) { 5282 if (ret == -EAGAIN) {
5197 btrfs_add_free_space(block_group, offset, num_bytes); 5283 btrfs_add_free_space(block_group, offset, num_bytes);
@@ -5282,11 +5368,13 @@ loop:
5282 5368
5283 if (allowed_chunk_alloc) { 5369 if (allowed_chunk_alloc) {
5284 ret = do_chunk_alloc(trans, root, num_bytes + 5370 ret = do_chunk_alloc(trans, root, num_bytes +
5285 2 * 1024 * 1024, data, 1); 5371 2 * 1024 * 1024, data,
5372 CHUNK_ALLOC_LIMITED);
5286 allowed_chunk_alloc = 0; 5373 allowed_chunk_alloc = 0;
5287 done_chunk_alloc = 1; 5374 done_chunk_alloc = 1;
5288 } else if (!done_chunk_alloc) { 5375 } else if (!done_chunk_alloc &&
5289 space_info->force_alloc = 1; 5376 space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) {
5377 space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5290 } 5378 }
5291 5379
5292 if (loop < LOOP_NO_EMPTY_SIZE) { 5380 if (loop < LOOP_NO_EMPTY_SIZE) {
@@ -5372,7 +5460,8 @@ again:
5372 */ 5460 */
5373 if (empty_size || root->ref_cows) 5461 if (empty_size || root->ref_cows)
5374 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 5462 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5375 num_bytes + 2 * 1024 * 1024, data, 0); 5463 num_bytes + 2 * 1024 * 1024, data,
5464 CHUNK_ALLOC_NO_FORCE);
5376 5465
5377 WARN_ON(num_bytes < root->sectorsize); 5466 WARN_ON(num_bytes < root->sectorsize);
5378 ret = find_free_extent(trans, root, num_bytes, empty_size, 5467 ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -5384,7 +5473,7 @@ again:
5384 num_bytes = num_bytes & ~(root->sectorsize - 1); 5473 num_bytes = num_bytes & ~(root->sectorsize - 1);
5385 num_bytes = max(num_bytes, min_alloc_size); 5474 num_bytes = max(num_bytes, min_alloc_size);
5386 do_chunk_alloc(trans, root->fs_info->extent_root, 5475 do_chunk_alloc(trans, root->fs_info->extent_root,
5387 num_bytes, data, 1); 5476 num_bytes, data, CHUNK_ALLOC_FORCE);
5388 goto again; 5477 goto again;
5389 } 5478 }
5390 if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { 5479 if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
@@ -5397,6 +5486,8 @@ again:
5397 dump_space_info(sinfo, num_bytes, 1); 5486 dump_space_info(sinfo, num_bytes, 1);
5398 } 5487 }
5399 5488
5489 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5490
5400 return ret; 5491 return ret;
5401} 5492}
5402 5493
@@ -5412,12 +5503,15 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5412 return -ENOSPC; 5503 return -ENOSPC;
5413 } 5504 }
5414 5505
5415 ret = btrfs_discard_extent(root, start, len); 5506 if (btrfs_test_opt(root, DISCARD))
5507 ret = btrfs_discard_extent(root, start, len, NULL);
5416 5508
5417 btrfs_add_free_space(cache, start, len); 5509 btrfs_add_free_space(cache, start, len);
5418 update_reserved_bytes(cache, len, 0, 1); 5510 btrfs_update_reserved_bytes(cache, len, 0, 1);
5419 btrfs_put_block_group(cache); 5511 btrfs_put_block_group(cache);
5420 5512
5513 trace_btrfs_reserved_extent_free(root, start, len);
5514
5421 return ret; 5515 return ret;
5422} 5516}
5423 5517
@@ -5444,7 +5538,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5444 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 5538 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5445 5539
5446 path = btrfs_alloc_path(); 5540 path = btrfs_alloc_path();
5447 BUG_ON(!path); 5541 if (!path)
5542 return -ENOMEM;
5448 5543
5449 path->leave_spinning = 1; 5544 path->leave_spinning = 1;
5450 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 5545 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
@@ -5614,7 +5709,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5614 put_caching_control(caching_ctl); 5709 put_caching_control(caching_ctl);
5615 } 5710 }
5616 5711
5617 ret = update_reserved_bytes(block_group, ins->offset, 1, 1); 5712 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1);
5618 BUG_ON(ret); 5713 BUG_ON(ret);
5619 btrfs_put_block_group(block_group); 5714 btrfs_put_block_group(block_group);
5620 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5715 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -6047,6 +6142,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6047 if (reada && level == 1) 6142 if (reada && level == 1)
6048 reada_walk_down(trans, root, wc, path); 6143 reada_walk_down(trans, root, wc, path);
6049 next = read_tree_block(root, bytenr, blocksize, generation); 6144 next = read_tree_block(root, bytenr, blocksize, generation);
6145 if (!next)
6146 return -EIO;
6050 btrfs_tree_lock(next); 6147 btrfs_tree_lock(next);
6051 btrfs_set_lock_blocking(next); 6148 btrfs_set_lock_blocking(next);
6052 } 6149 }
@@ -6438,10 +6535,14 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6438 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 6535 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6439 6536
6440 path = btrfs_alloc_path(); 6537 path = btrfs_alloc_path();
6441 BUG_ON(!path); 6538 if (!path)
6539 return -ENOMEM;
6442 6540
6443 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6541 wc = kzalloc(sizeof(*wc), GFP_NOFS);
6444 BUG_ON(!wc); 6542 if (!wc) {
6543 btrfs_free_path(path);
6544 return -ENOMEM;
6545 }
6445 6546
6446 btrfs_assert_tree_locked(parent); 6547 btrfs_assert_tree_locked(parent);
6447 parent_level = btrfs_header_level(parent); 6548 parent_level = btrfs_header_level(parent);
@@ -6899,7 +7000,11 @@ static noinline int get_new_locations(struct inode *reloc_inode,
6899 } 7000 }
6900 7001
6901 path = btrfs_alloc_path(); 7002 path = btrfs_alloc_path();
6902 BUG_ON(!path); 7003 if (!path) {
7004 if (exts != *extents)
7005 kfree(exts);
7006 return -ENOMEM;
7007 }
6903 7008
6904 cur_pos = extent_key->objectid - offset; 7009 cur_pos = extent_key->objectid - offset;
6905 last_byte = extent_key->objectid + extent_key->offset; 7010 last_byte = extent_key->objectid + extent_key->offset;
@@ -6941,6 +7046,10 @@ static noinline int get_new_locations(struct inode *reloc_inode,
6941 struct disk_extent *old = exts; 7046 struct disk_extent *old = exts;
6942 max *= 2; 7047 max *= 2;
6943 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); 7048 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
7049 if (!exts) {
7050 ret = -ENOMEM;
7051 goto out;
7052 }
6944 memcpy(exts, old, sizeof(*exts) * nr); 7053 memcpy(exts, old, sizeof(*exts) * nr);
6945 if (old != *extents) 7054 if (old != *extents)
6946 kfree(old); 7055 kfree(old);
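
The doubling realloc in get_new_locations() finally handles allocation failure
instead of oopsing on a NULL memcpy. The grow-and-copy pattern in plain C; note
the kernel additionally skips the kfree when old is the caller-supplied initial
array (the old != *extents test above), which this sketch omits:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct disk_extent { unsigned long long disk_bytenr, num_bytes; };

static struct disk_extent *grow_extents(struct disk_extent *exts,
                                        int nr, int *max)
{
        struct disk_extent *old = exts;

        *max *= 2;
        exts = calloc(*max, sizeof(*exts));
        if (!exts) {                    /* the new -ENOMEM path */
                *max /= 2;
                return NULL;
        }
        memcpy(exts, old, sizeof(*exts) * nr);
        free(old);
        return exts;
}

int main(void)
{
        int max = 4;
        struct disk_extent *exts = calloc(max, sizeof(*exts));

        if (!exts)
                return 1;
        exts = grow_extents(exts, 4, &max);
        if (!exts)
                return 1;               /* -ENOMEM in the kernel */
        printf("grown to %d slots\n", max);
        free(exts);
        return 0;
}
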
@@ -7423,7 +7532,8 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
7423 int ret; 7532 int ret;
7424 7533
7425 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); 7534 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
7426 BUG_ON(!new_extent); 7535 if (!new_extent)
7536 return -ENOMEM;
7427 7537
7428 ref = btrfs_lookup_leaf_ref(root, leaf->start); 7538 ref = btrfs_lookup_leaf_ref(root, leaf->start);
7429 BUG_ON(!ref); 7539 BUG_ON(!ref);
@@ -7609,7 +7719,8 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7609 7719
7610 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 7720 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
7611 BUG_ON(!reloc_root); 7721 BUG_ON(!reloc_root);
7612 btrfs_orphan_cleanup(reloc_root); 7722 ret = btrfs_orphan_cleanup(reloc_root);
7723 BUG_ON(ret);
7613 return 0; 7724 return 0;
7614} 7725}
7615 7726
@@ -7627,7 +7738,8 @@ static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7627 return 0; 7738 return 0;
7628 7739
7629 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 7740 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
7630 BUG_ON(!root_item); 7741 if (!root_item)
7742 return -ENOMEM;
7631 7743
7632 ret = btrfs_copy_root(trans, root, root->commit_root, 7744 ret = btrfs_copy_root(trans, root, root->commit_root,
7633 &eb, BTRFS_TREE_RELOC_OBJECTID); 7745 &eb, BTRFS_TREE_RELOC_OBJECTID);
@@ -7653,7 +7765,7 @@ static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7653 7765
7654 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 7766 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
7655 &root_key); 7767 &root_key);
7656 BUG_ON(!reloc_root); 7768 BUG_ON(IS_ERR(reloc_root));
7657 reloc_root->last_trans = trans->transid; 7769 reloc_root->last_trans = trans->transid;
7658 reloc_root->commit_root = NULL; 7770 reloc_root->commit_root = NULL;
7659 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; 7771 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
@@ -7906,6 +8018,10 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7906 8018
7907 eb = read_tree_block(found_root, block_start, 8019 eb = read_tree_block(found_root, block_start,
7908 block_size, 0); 8020 block_size, 0);
8021 if (!eb) {
8022 ret = -EIO;
8023 goto out;
8024 }
7909 btrfs_tree_lock(eb); 8025 btrfs_tree_lock(eb);
7910 BUG_ON(level != btrfs_header_level(eb)); 8026 BUG_ON(level != btrfs_header_level(eb));
7911 8027
@@ -8061,13 +8177,15 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
8061 8177
8062 alloc_flags = update_block_group_flags(root, cache->flags); 8178 alloc_flags = update_block_group_flags(root, cache->flags);
8063 if (alloc_flags != cache->flags) 8179 if (alloc_flags != cache->flags)
8064 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 8180 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
8181 CHUNK_ALLOC_FORCE);
8065 8182
8066 ret = set_block_group_ro(cache); 8183 ret = set_block_group_ro(cache);
8067 if (!ret) 8184 if (!ret)
8068 goto out; 8185 goto out;
8069 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 8186 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
8070 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 8187 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
8188 CHUNK_ALLOC_FORCE);
8071 if (ret < 0) 8189 if (ret < 0)
8072 goto out; 8190 goto out;
8073 ret = set_block_group_ro(cache); 8191 ret = set_block_group_ro(cache);
@@ -8080,7 +8198,8 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8080 struct btrfs_root *root, u64 type) 8198 struct btrfs_root *root, u64 type)
8081{ 8199{
8082 u64 alloc_flags = get_alloc_profile(root, type); 8200 u64 alloc_flags = get_alloc_profile(root, type);
8083 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 8201 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
8202 CHUNK_ALLOC_FORCE);
8084} 8203}
8085 8204
8086/* 8205/*
@@ -8621,6 +8740,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8621 BUG_ON(!block_group); 8740 BUG_ON(!block_group);
8622 BUG_ON(!block_group->ro); 8741 BUG_ON(!block_group->ro);
8623 8742
8743 /*
8744 * Free the reserved super bytes from this block group before
 8745 * removing it.
8746 */
8747 free_excluded_extents(root, block_group);
8748
8624 memcpy(&key, &block_group->key, sizeof(key)); 8749 memcpy(&key, &block_group->key, sizeof(key));
8625 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 8750 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8626 BTRFS_BLOCK_GROUP_RAID1 | 8751 BTRFS_BLOCK_GROUP_RAID1 |
@@ -8724,13 +8849,84 @@ out:
8724 return ret; 8849 return ret;
8725} 8850}
8726 8851
8852int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8853{
8854 struct btrfs_space_info *space_info;
8855 int ret;
8856
8857 ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM, 0, 0,
8858 &space_info);
8859 if (ret)
8860 return ret;
8861
8862 ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA, 0, 0,
8863 &space_info);
8864 if (ret)
8865 return ret;
8866
8867 ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA, 0, 0,
8868 &space_info);
8869 if (ret)
8870 return ret;
8871
8872 return ret;
8873}
8874
8727int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 8875int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8728{ 8876{
8729 return unpin_extent_range(root, start, end); 8877 return unpin_extent_range(root, start, end);
8730} 8878}
8731 8879
8732int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 8880int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8733 u64 num_bytes) 8881 u64 num_bytes, u64 *actual_bytes)
8882{
8883 return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8884}
8885
8886int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8734{ 8887{
8735 return btrfs_discard_extent(root, bytenr, num_bytes); 8888 struct btrfs_fs_info *fs_info = root->fs_info;
8889 struct btrfs_block_group_cache *cache = NULL;
8890 u64 group_trimmed;
8891 u64 start;
8892 u64 end;
8893 u64 trimmed = 0;
8894 int ret = 0;
8895
8896 cache = btrfs_lookup_block_group(fs_info, range->start);
8897
8898 while (cache) {
8899 if (cache->key.objectid >= (range->start + range->len)) {
8900 btrfs_put_block_group(cache);
8901 break;
8902 }
8903
8904 start = max(range->start, cache->key.objectid);
8905 end = min(range->start + range->len,
8906 cache->key.objectid + cache->key.offset);
8907
8908 if (end - start >= range->minlen) {
8909 if (!block_group_cache_done(cache)) {
8910 ret = cache_block_group(cache, NULL, root, 0);
8911 if (!ret)
8912 wait_block_group_cache_done(cache);
8913 }
8914 ret = btrfs_trim_block_group(cache,
8915 &group_trimmed,
8916 start,
8917 end,
8918 range->minlen);
8919
8920 trimmed += group_trimmed;
8921 if (ret) {
8922 btrfs_put_block_group(cache);
8923 break;
8924 }
8925 }
8926
8927 cache = next_block_group(fs_info->tree_root, cache);
8928 }
8929
8930 range->len = trimmed;
8931 return ret;
8736} 8932}
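
The new btrfs_trim_fs() walks every block group that intersects the fstrim
range, clamps the trim to the overlap, skips pieces shorter than minlen, and
reports the total back through range->len. The clamping arithmetic in a
standalone model; the block-group layout and the range values here are made up:

#include <stdio.h>

struct group { unsigned long long start, len; };

int main(void)
{
        struct group groups[] = { {0, 1024}, {1024, 4096}, {5120, 2048} };
        unsigned long long rstart = 512, rlen = 6000, minlen = 256;
        unsigned long long trimmed = 0;

        for (int i = 0; i < 3; i++) {
                unsigned long long start = groups[i].start;
                unsigned long long end = start + groups[i].len;

                if (start >= rstart + rlen)
                        break;                          /* past the range */
                if (start < rstart)
                        start = rstart;                 /* clamp both ends */
                if (end > rstart + rlen)
                        end = rstart + rlen;
                if (end - start >= minlen)
                        trimmed += end - start;         /* "discard" it */
        }
        printf("trimmed %llu bytes\n", trimmed);
        return 0;
}
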
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 714adc4ac4c2..315138605088 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -690,6 +690,15 @@ static void cache_state(struct extent_state *state,
690 } 690 }
691} 691}
692 692
693static void uncache_state(struct extent_state **cached_ptr)
694{
695 if (cached_ptr && (*cached_ptr)) {
696 struct extent_state *state = *cached_ptr;
697 *cached_ptr = NULL;
698 free_extent_state(state);
699 }
700}
701
693/* 702/*
694 * set some bits on a range in the tree. This may require allocations or 703 * set some bits on a range in the tree. This may require allocations or
695 * sleeping, so the gfp mask is used to indicate what is allowed. 704 * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -940,10 +949,10 @@ static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
940} 949}
941 950
942int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 951int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
943 gfp_t mask) 952 struct extent_state **cached_state, gfp_t mask)
944{ 953{
945 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 954 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
946 NULL, mask); 955 NULL, cached_state, mask);
947} 956}
948 957
949static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 958static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -1012,8 +1021,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1012 mask); 1021 mask);
1013} 1022}
1014 1023
1015int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1024int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1016 gfp_t mask)
1017{ 1025{
1018 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1026 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1019 mask); 1027 mask);
@@ -1735,6 +1743,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1735 1743
1736 do { 1744 do {
1737 struct page *page = bvec->bv_page; 1745 struct page *page = bvec->bv_page;
1746 struct extent_state *cached = NULL;
1747 struct extent_state *state;
1748
1738 tree = &BTRFS_I(page->mapping->host)->io_tree; 1749 tree = &BTRFS_I(page->mapping->host)->io_tree;
1739 1750
1740 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1751 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1749,9 +1760,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1749 if (++bvec <= bvec_end) 1760 if (++bvec <= bvec_end)
1750 prefetchw(&bvec->bv_page->flags); 1761 prefetchw(&bvec->bv_page->flags);
1751 1762
1763 spin_lock(&tree->lock);
1764 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
1765 if (state && state->start == start) {
1766 /*
1767 * take a reference on the state, unlock will drop
1768 * the ref
1769 */
1770 cache_state(state, &cached);
1771 }
1772 spin_unlock(&tree->lock);
1773
1752 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1774 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1753 ret = tree->ops->readpage_end_io_hook(page, start, end, 1775 ret = tree->ops->readpage_end_io_hook(page, start, end,
1754 NULL); 1776 state);
1755 if (ret) 1777 if (ret)
1756 uptodate = 0; 1778 uptodate = 0;
1757 } 1779 }
@@ -1764,15 +1786,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1764 test_bit(BIO_UPTODATE, &bio->bi_flags); 1786 test_bit(BIO_UPTODATE, &bio->bi_flags);
1765 if (err) 1787 if (err)
1766 uptodate = 0; 1788 uptodate = 0;
1789 uncache_state(&cached);
1767 continue; 1790 continue;
1768 } 1791 }
1769 } 1792 }
1770 1793
1771 if (uptodate) { 1794 if (uptodate) {
1772 set_extent_uptodate(tree, start, end, 1795 set_extent_uptodate(tree, start, end, &cached,
1773 GFP_ATOMIC); 1796 GFP_ATOMIC);
1774 } 1797 }
1775 unlock_extent(tree, start, end, GFP_ATOMIC); 1798 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1776 1799
1777 if (whole_page) { 1800 if (whole_page) {
1778 if (uptodate) { 1801 if (uptodate) {
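
Both the readpage end_io hook and the final unlock used to search the extent
tree for the same state. The new code pins the locked state once under
tree->lock and threads it through both calls; uncache_state() drops the
reference when an error path bails early. A toy refcount model of that handoff,
assuming nothing beyond what the hunk shows (free_extent_state() is what
actually drops the ref in the kernel):

#include <stdio.h>

struct state { int refs; };

static void cache_state(struct state *s, struct state **cached)
{
        if (!*cached) {
                s->refs++;              /* unlock will drop this ref */
                *cached = s;
        }
}

static void uncache_state(struct state **cached)
{
        if (cached && *cached) {
                struct state *s = *cached;
                *cached = NULL;
                s->refs--;              /* free_extent_state() in the kernel */
        }
}

int main(void)
{
        struct state s = { .refs = 1 };
        struct state *cached = NULL;

        cache_state(&s, &cached);       /* under tree->lock in the kernel */
        printf("refs while cached: %d\n", s.refs);
        uncache_state(&cached);         /* or unlock_extent_cached() */
        printf("refs after uncache: %d\n", s.refs);
        return 0;
}
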
@@ -1811,6 +1834,7 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
1811 1834
1812 do { 1835 do {
1813 struct page *page = bvec->bv_page; 1836 struct page *page = bvec->bv_page;
1837 struct extent_state *cached = NULL;
1814 tree = &BTRFS_I(page->mapping->host)->io_tree; 1838 tree = &BTRFS_I(page->mapping->host)->io_tree;
1815 1839
1816 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1840 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1821,13 +1845,14 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
1821 prefetchw(&bvec->bv_page->flags); 1845 prefetchw(&bvec->bv_page->flags);
1822 1846
1823 if (uptodate) { 1847 if (uptodate) {
1824 set_extent_uptodate(tree, start, end, GFP_ATOMIC); 1848 set_extent_uptodate(tree, start, end, &cached,
1849 GFP_ATOMIC);
1825 } else { 1850 } else {
1826 ClearPageUptodate(page); 1851 ClearPageUptodate(page);
1827 SetPageError(page); 1852 SetPageError(page);
1828 } 1853 }
1829 1854
1830 unlock_extent(tree, start, end, GFP_ATOMIC); 1855 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1831 1856
1832 } while (bvec >= bio->bi_io_vec); 1857 } while (bvec >= bio->bi_io_vec);
1833 1858
@@ -2016,14 +2041,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2016 while (cur <= end) { 2041 while (cur <= end) {
2017 if (cur >= last_byte) { 2042 if (cur >= last_byte) {
2018 char *userpage; 2043 char *userpage;
2044 struct extent_state *cached = NULL;
2045
2019 iosize = PAGE_CACHE_SIZE - page_offset; 2046 iosize = PAGE_CACHE_SIZE - page_offset;
2020 userpage = kmap_atomic(page, KM_USER0); 2047 userpage = kmap_atomic(page, KM_USER0);
2021 memset(userpage + page_offset, 0, iosize); 2048 memset(userpage + page_offset, 0, iosize);
2022 flush_dcache_page(page); 2049 flush_dcache_page(page);
2023 kunmap_atomic(userpage, KM_USER0); 2050 kunmap_atomic(userpage, KM_USER0);
2024 set_extent_uptodate(tree, cur, cur + iosize - 1, 2051 set_extent_uptodate(tree, cur, cur + iosize - 1,
2025 GFP_NOFS); 2052 &cached, GFP_NOFS);
2026 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2053 unlock_extent_cached(tree, cur, cur + iosize - 1,
2054 &cached, GFP_NOFS);
2027 break; 2055 break;
2028 } 2056 }
2029 em = get_extent(inode, page, page_offset, cur, 2057 em = get_extent(inode, page, page_offset, cur,
@@ -2063,14 +2091,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2063 /* we've found a hole, just zero and go on */ 2091 /* we've found a hole, just zero and go on */
2064 if (block_start == EXTENT_MAP_HOLE) { 2092 if (block_start == EXTENT_MAP_HOLE) {
2065 char *userpage; 2093 char *userpage;
2094 struct extent_state *cached = NULL;
2095
2066 userpage = kmap_atomic(page, KM_USER0); 2096 userpage = kmap_atomic(page, KM_USER0);
2067 memset(userpage + page_offset, 0, iosize); 2097 memset(userpage + page_offset, 0, iosize);
2068 flush_dcache_page(page); 2098 flush_dcache_page(page);
2069 kunmap_atomic(userpage, KM_USER0); 2099 kunmap_atomic(userpage, KM_USER0);
2070 2100
2071 set_extent_uptodate(tree, cur, cur + iosize - 1, 2101 set_extent_uptodate(tree, cur, cur + iosize - 1,
2072 GFP_NOFS); 2102 &cached, GFP_NOFS);
2073 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2103 unlock_extent_cached(tree, cur, cur + iosize - 1,
2104 &cached, GFP_NOFS);
2074 cur = cur + iosize; 2105 cur = cur + iosize;
2075 page_offset += iosize; 2106 page_offset += iosize;
2076 continue; 2107 continue;
@@ -2188,10 +2219,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2188 unsigned long nr_written = 0; 2219 unsigned long nr_written = 0;
2189 2220
2190 if (wbc->sync_mode == WB_SYNC_ALL) 2221 if (wbc->sync_mode == WB_SYNC_ALL)
2191 write_flags = WRITE_SYNC_PLUG; 2222 write_flags = WRITE_SYNC;
2192 else 2223 else
2193 write_flags = WRITE; 2224 write_flags = WRITE;
2194 2225
2226 trace___extent_writepage(page, inode, wbc);
2227
2195 WARN_ON(!PageLocked(page)); 2228 WARN_ON(!PageLocked(page));
2196 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2229 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2197 if (page->index > end_index || 2230 if (page->index > end_index ||
@@ -2787,9 +2820,12 @@ int extent_prepare_write(struct extent_io_tree *tree,
2787 iocount++; 2820 iocount++;
2788 block_start = block_start + iosize; 2821 block_start = block_start + iosize;
2789 } else { 2822 } else {
2790 set_extent_uptodate(tree, block_start, cur_end, 2823 struct extent_state *cached = NULL;
2824
2825 set_extent_uptodate(tree, block_start, cur_end, &cached,
2791 GFP_NOFS); 2826 GFP_NOFS);
2792 unlock_extent(tree, block_start, cur_end, GFP_NOFS); 2827 unlock_extent_cached(tree, block_start, cur_end,
2828 &cached, GFP_NOFS);
2793 block_start = cur_end + 1; 2829 block_start = cur_end + 1;
2794 } 2830 }
2795 page_offset = block_start & (PAGE_CACHE_SIZE - 1); 2831 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
@@ -3455,7 +3491,7 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3455 num_pages = num_extent_pages(eb->start, eb->len); 3491 num_pages = num_extent_pages(eb->start, eb->len);
3456 3492
3457 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3493 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3458 GFP_NOFS); 3494 NULL, GFP_NOFS);
3459 for (i = 0; i < num_pages; i++) { 3495 for (i = 0; i < num_pages; i++) {
3460 page = extent_buffer_page(eb, i); 3496 page = extent_buffer_page(eb, i);
3461 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3497 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3690,6 +3726,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3690 "wanted %lu %lu\n", (unsigned long long)eb->start, 3726 "wanted %lu %lu\n", (unsigned long long)eb->start,
3691 eb->len, start, min_len); 3727 eb->len, start, min_len);
3692 WARN_ON(1); 3728 WARN_ON(1);
3729 return -EINVAL;
3693 } 3730 }
3694 3731
3695 p = extent_buffer_page(eb, i); 3732 p = extent_buffer_page(eb, i);
@@ -3882,6 +3919,12 @@ static void move_pages(struct page *dst_page, struct page *src_page,
3882 kunmap_atomic(dst_kaddr, KM_USER0); 3919 kunmap_atomic(dst_kaddr, KM_USER0);
3883} 3920}
3884 3921
3922static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
3923{
3924 unsigned long distance = (src > dst) ? src - dst : dst - src;
3925 return distance < len;
3926}
3927
3885static void copy_pages(struct page *dst_page, struct page *src_page, 3928static void copy_pages(struct page *dst_page, struct page *src_page,
3886 unsigned long dst_off, unsigned long src_off, 3929 unsigned long dst_off, unsigned long src_off,
3887 unsigned long len) 3930 unsigned long len)
@@ -3889,10 +3932,12 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
3889 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3932 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3890 char *src_kaddr; 3933 char *src_kaddr;
3891 3934
3892 if (dst_page != src_page) 3935 if (dst_page != src_page) {
3893 src_kaddr = kmap_atomic(src_page, KM_USER1); 3936 src_kaddr = kmap_atomic(src_page, KM_USER1);
3894 else 3937 } else {
3895 src_kaddr = dst_kaddr; 3938 src_kaddr = dst_kaddr;
3939 BUG_ON(areas_overlap(src_off, dst_off, len));
3940 }
3896 3941
3897 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3942 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3898 kunmap_atomic(dst_kaddr, KM_USER0); 3943 kunmap_atomic(dst_kaddr, KM_USER0);
@@ -3967,7 +4012,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3967 "len %lu len %lu\n", dst_offset, len, dst->len); 4012 "len %lu len %lu\n", dst_offset, len, dst->len);
3968 BUG_ON(1); 4013 BUG_ON(1);
3969 } 4014 }
3970 if (dst_offset < src_offset) { 4015 if (!areas_overlap(src_offset, dst_offset, len)) {
3971 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 4016 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3972 return; 4017 return;
3973 } 4018 }
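
memmove_extent_buffer() used to pick the backwards copy whenever
dst_offset > src_offset, even for disjoint ranges, and copy_pages() had no
guard at all; areas_overlap() makes the disjointness test exact in both places.
The predicate exercised standalone:

#include <stdbool.h>
#include <stdio.h>

static bool areas_overlap(unsigned long src, unsigned long dst,
                          unsigned long len)
{
        unsigned long distance = (src > dst) ? src - dst : dst - src;
        return distance < len;
}

int main(void)
{
        /* disjoint: a forward memcpy is safe even though dst > src */
        printf("%d\n", areas_overlap(0, 100, 50));      /* prints 0 */
        /* overlapping: must take the careful memmove path */
        printf("%d\n", areas_overlap(0, 20, 50));       /* prints 1 */
        return 0;
}
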
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9318dfefd59c..af2d7179c372 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -31,6 +31,7 @@
31#define EXTENT_BUFFER_UPTODATE 0 31#define EXTENT_BUFFER_UPTODATE 0
32#define EXTENT_BUFFER_BLOCKING 1 32#define EXTENT_BUFFER_BLOCKING 1
33#define EXTENT_BUFFER_DIRTY 2 33#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3
34 35
35/* these are flags for extent_clear_unlock_delalloc */ 36/* these are flags for extent_clear_unlock_delalloc */
36#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -207,7 +208,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
207 int bits, int exclusive_bits, u64 *failed_start, 208 int bits, int exclusive_bits, u64 *failed_start,
208 struct extent_state **cached_state, gfp_t mask); 209 struct extent_state **cached_state, gfp_t mask);
209int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 210int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
210 gfp_t mask); 211 struct extent_state **cached_state, gfp_t mask);
211int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 212int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
212 gfp_t mask); 213 gfp_t mask);
213int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 214int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2b6c12e983b3..a24a3f2fa13e 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -243,7 +243,7 @@ out:
243 * Insert @em into @tree or perform a simple forward/backward merge with 243 * Insert @em into @tree or perform a simple forward/backward merge with
244 * existing mappings. The extent_map struct passed in will be inserted 244 * existing mappings. The extent_map struct passed in will be inserted
245 * into the tree directly, with an additional reference taken, or a 245 * into the tree directly, with an additional reference taken, or a
246 * reference dropped if the merge attempt was successfull. 246 * reference dropped if the merge attempt was successful.
247 */ 247 */
248int add_extent_mapping(struct extent_map_tree *tree, 248int add_extent_mapping(struct extent_map_tree *tree,
249 struct extent_map *em) 249 struct extent_map *em)
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 4f19a3e1bf32..a6a9d4e8b491 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -48,7 +48,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
48 struct extent_buffer *leaf; 48 struct extent_buffer *leaf;
49 49
50 path = btrfs_alloc_path(); 50 path = btrfs_alloc_path();
51 BUG_ON(!path); 51 if (!path)
52 return -ENOMEM;
52 file_key.objectid = objectid; 53 file_key.objectid = objectid;
53 file_key.offset = pos; 54 file_key.offset = pos;
54 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 55 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
@@ -169,6 +170,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
169 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 170 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
170 171
171 path = btrfs_alloc_path(); 172 path = btrfs_alloc_path();
173 if (!path)
174 return -ENOMEM;
172 if (bio->bi_size > PAGE_CACHE_SIZE * 8) 175 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
173 path->reada = 2; 176 path->reada = 2;
174 177
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f447b783bb84..75899a01dded 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -45,14 +45,14 @@
45 * and be replaced with calls into generic code. 45 * and be replaced with calls into generic code.
46 */ 46 */
47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 int write_bytes, 48 size_t write_bytes,
49 struct page **prepared_pages, 49 struct page **prepared_pages,
50 struct iov_iter *i) 50 struct iov_iter *i)
51{ 51{
52 size_t copied = 0; 52 size_t copied = 0;
53 size_t total_copied = 0;
53 int pg = 0; 54 int pg = 0;
54 int offset = pos & (PAGE_CACHE_SIZE - 1); 55 int offset = pos & (PAGE_CACHE_SIZE - 1);
55 int total_copied = 0;
56 56
57 while (write_bytes > 0) { 57 while (write_bytes > 0) {
58 size_t count = min_t(size_t, 58 size_t count = min_t(size_t,
@@ -88,9 +88,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
88 total_copied += copied; 88 total_copied += copied;
89 89
90 /* Return to btrfs_file_aio_write to fault page */ 90 /* Return to btrfs_file_aio_write to fault page */
91 if (unlikely(copied == 0)) { 91 if (unlikely(copied == 0))
92 break; 92 break;
93 }
94 93
95 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 94 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
96 offset += copied; 95 offset += copied;
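
btrfs_copy_from_user() fills the prepared pages one at a time, clamping each
copy to the page boundary and bailing out on a zero-byte copy so the caller can
fault the page in and retry. The loop mechanics in a standalone harness; memcpy
stands in for the faultable iov_iter copy, so the short-copy exit never fires
here, and all sizes are invented:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

int main(void)
{
        char src[10000];
        static char pages[3][PAGE_SIZE];
        size_t write_bytes = sizeof(src), total = 0, offset = 512;
        int pg = 0;

        memset(src, 'x', sizeof(src));
        while (write_bytes > 0) {
                size_t count = write_bytes < PAGE_SIZE - offset ?
                               write_bytes : PAGE_SIZE - offset;

                memcpy(pages[pg] + offset, src + total, count);
                total += count;
                write_bytes -= count;
                if (count == PAGE_SIZE - offset) {      /* filled the page */
                        pg++;
                        offset = 0;
                } else {
                        offset += count;
                }
        }
        printf("copied %zu bytes into %d pages\n", total, pg + 1);
        return 0;
}
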
@@ -105,12 +104,10 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
105/* 104/*
106 * unlocks pages after btrfs_file_write is done with them 105 * unlocks pages after btrfs_file_write is done with them
107 */ 106 */
108static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) 107void btrfs_drop_pages(struct page **pages, size_t num_pages)
109{ 108{
110 size_t i; 109 size_t i;
111 for (i = 0; i < num_pages; i++) { 110 for (i = 0; i < num_pages; i++) {
112 if (!pages[i])
113 break;
114 /* page checked is some magic around finding pages that 111 /* page checked is some magic around finding pages that
115 * have been modified without going through btrfs_set_page_dirty 112 * have been modified without going through btrfs_set_page_dirty
116 * clear it here 113 * clear it here
@@ -130,17 +127,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
130 * this also makes the decision about creating an inline extent vs 127 * this also makes the decision about creating an inline extent vs
131 * doing real data extents, marking pages dirty and delalloc as required. 128 * doing real data extents, marking pages dirty and delalloc as required.
132 */ 129 */
133static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, 130int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
134 struct btrfs_root *root, 131 struct page **pages, size_t num_pages,
135 struct file *file, 132 loff_t pos, size_t write_bytes,
136 struct page **pages, 133 struct extent_state **cached)
137 size_t num_pages,
138 loff_t pos,
139 size_t write_bytes)
140{ 134{
141 int err = 0; 135 int err = 0;
142 int i; 136 int i;
143 struct inode *inode = fdentry(file)->d_inode;
144 u64 num_bytes; 137 u64 num_bytes;
145 u64 start_pos; 138 u64 start_pos;
146 u64 end_of_last_block; 139 u64 end_of_last_block;
@@ -153,8 +146,9 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
153 146
154 end_of_last_block = start_pos + num_bytes - 1; 147 end_of_last_block = start_pos + num_bytes - 1;
155 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 148 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
156 NULL); 149 cached);
157 BUG_ON(err); 150 if (err)
151 return err;
158 152
159 for (i = 0; i < num_pages; i++) { 153 for (i = 0; i < num_pages; i++) {
160 struct page *p = pages[i]; 154 struct page *p = pages[i];
@@ -162,13 +156,14 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
162 ClearPageChecked(p); 156 ClearPageChecked(p);
163 set_page_dirty(p); 157 set_page_dirty(p);
164 } 158 }
165 if (end_pos > isize) { 159
160 /*
161 * we've only changed i_size in ram, and we haven't updated
162 * the disk i_size. There is no need to log the inode
163 * at this time.
164 */
165 if (end_pos > isize)
166 i_size_write(inode, end_pos); 166 i_size_write(inode, end_pos);
167 /* we've only changed i_size in ram, and we haven't updated
168 * the disk i_size. There is no need to log the inode
169 * at this time.
170 */
171 }
172 return 0; 167 return 0;
173} 168}
174 169
@@ -610,6 +605,8 @@ again:
610 key.offset = split; 605 key.offset = split;
611 606
612 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 607 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
608 if (ret < 0)
609 goto out;
613 if (ret > 0 && path->slots[0] > 0) 610 if (ret > 0 && path->slots[0] > 0)
614 path->slots[0]--; 611 path->slots[0]--;
615 612
@@ -819,12 +816,11 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
819 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; 816 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
820 817
821 if (start_pos > inode->i_size) { 818 if (start_pos > inode->i_size) {
822 err = btrfs_cont_expand(inode, start_pos); 819 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
823 if (err) 820 if (err)
824 return err; 821 return err;
825 } 822 }
826 823
827 memset(pages, 0, num_pages * sizeof(struct page *));
828again: 824again:
829 for (i = 0; i < num_pages; i++) { 825 for (i = 0; i < num_pages; i++) {
830 pages[i] = grab_cache_page(inode->i_mapping, index + i); 826 pages[i] = grab_cache_page(inode->i_mapping, index + i);
@@ -896,156 +892,71 @@ fail:
896 892
897} 893}
898 894
899static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 895static noinline ssize_t __btrfs_buffered_write(struct file *file,
900 const struct iovec *iov, 896 struct iov_iter *i,
901 unsigned long nr_segs, loff_t pos) 897 loff_t pos)
902{ 898{
903 struct file *file = iocb->ki_filp;
904 struct inode *inode = fdentry(file)->d_inode; 899 struct inode *inode = fdentry(file)->d_inode;
905 struct btrfs_root *root = BTRFS_I(inode)->root; 900 struct btrfs_root *root = BTRFS_I(inode)->root;
906 struct page **pages = NULL; 901 struct page **pages = NULL;
907 struct iov_iter i;
908 loff_t *ppos = &iocb->ki_pos;
909 loff_t start_pos;
910 ssize_t num_written = 0;
911 ssize_t err = 0;
912 size_t count;
913 size_t ocount;
914 int ret = 0;
915 int nrptrs;
916 unsigned long first_index; 902 unsigned long first_index;
917 unsigned long last_index; 903 unsigned long last_index;
918 int will_write; 904 size_t num_written = 0;
919 int buffered = 0; 905 int nrptrs;
920 int copied = 0; 906 int ret = 0;
921 int dirty_pages = 0;
922
923 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
924 (file->f_flags & O_DIRECT));
925
926 start_pos = pos;
927
928 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
929
930 mutex_lock(&inode->i_mutex);
931
932 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
933 if (err)
934 goto out;
935 count = ocount;
936
937 current->backing_dev_info = inode->i_mapping->backing_dev_info;
938 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
939 if (err)
940 goto out;
941
942 if (count == 0)
943 goto out;
944
945 err = file_remove_suid(file);
946 if (err)
947 goto out;
948
949 /*
950 * If BTRFS flips readonly due to some impossible error
951 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
952 * although we have opened a file as writable, we have
953 * to stop this write operation to ensure FS consistency.
954 */
955 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
956 err = -EROFS;
957 goto out;
958 }
959
960 file_update_time(file);
961 BTRFS_I(inode)->sequence++;
962
963 if (unlikely(file->f_flags & O_DIRECT)) {
964 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
965 pos, ppos, count,
966 ocount);
967 /*
968 * the generic O_DIRECT will update in-memory i_size after the
969 * DIOs are done. But our endio handlers that update the on
970 * disk i_size never update past the in memory i_size. So we
971 * need one more update here to catch any additions to the
972 * file
973 */
974 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
975 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
976 mark_inode_dirty(inode);
977 }
978
979 if (num_written < 0) {
980 ret = num_written;
981 num_written = 0;
982 goto out;
983 } else if (num_written == count) {
984 /* pick up pos changes done by the generic code */
985 pos = *ppos;
986 goto out;
987 }
988 /*
989 * We are going to do buffered for the rest of the range, so we
990 * need to make sure to invalidate the buffered pages when we're
991 * done.
992 */
993 buffered = 1;
994 pos += num_written;
995 }
996 907
997 iov_iter_init(&i, iov, nr_segs, count, num_written); 908 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
998 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
999 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 909 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
1000 (sizeof(struct page *))); 910 (sizeof(struct page *)));
1001 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 911 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1002 if (!pages) { 912 if (!pages)
1003 ret = -ENOMEM; 913 return -ENOMEM;
1004 goto out;
1005 }
1006
1007 /* generic_write_checks can change our pos */
1008 start_pos = pos;
1009 914
1010 first_index = pos >> PAGE_CACHE_SHIFT; 915 first_index = pos >> PAGE_CACHE_SHIFT;
1011 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; 916 last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
1012 917
1013 while (iov_iter_count(&i) > 0) { 918 while (iov_iter_count(i) > 0) {
1014 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 919 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1015 size_t write_bytes = min(iov_iter_count(&i), 920 size_t write_bytes = min(iov_iter_count(i),
1016 nrptrs * (size_t)PAGE_CACHE_SIZE - 921 nrptrs * (size_t)PAGE_CACHE_SIZE -
1017 offset); 922 offset);
1018 size_t num_pages = (write_bytes + offset + 923 size_t num_pages = (write_bytes + offset +
1019 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 924 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
925 size_t dirty_pages;
926 size_t copied;
1020 927
1021 WARN_ON(num_pages > nrptrs); 928 WARN_ON(num_pages > nrptrs);
1022 memset(pages, 0, sizeof(struct page *) * nrptrs);
1023 929
1024 /* 930 /*
1025 * Fault pages before locking them in prepare_pages 931 * Fault pages before locking them in prepare_pages
1026 * to avoid recursive lock 932 * to avoid recursive lock
1027 */ 933 */
1028 if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) { 934 if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1029 ret = -EFAULT; 935 ret = -EFAULT;
1030 goto out; 936 break;
1031 } 937 }
1032 938
1033 ret = btrfs_delalloc_reserve_space(inode, 939 ret = btrfs_delalloc_reserve_space(inode,
1034 num_pages << PAGE_CACHE_SHIFT); 940 num_pages << PAGE_CACHE_SHIFT);
1035 if (ret) 941 if (ret)
1036 goto out; 942 break;
1037 943
944 /*
 945 * This is going to set up the pages array with the number of
946 * pages we want, so we don't really need to worry about the
947 * contents of pages from loop to loop
948 */
1038 ret = prepare_pages(root, file, pages, num_pages, 949 ret = prepare_pages(root, file, pages, num_pages,
1039 pos, first_index, last_index, 950 pos, first_index, last_index,
1040 write_bytes); 951 write_bytes);
1041 if (ret) { 952 if (ret) {
1042 btrfs_delalloc_release_space(inode, 953 btrfs_delalloc_release_space(inode,
1043 num_pages << PAGE_CACHE_SHIFT); 954 num_pages << PAGE_CACHE_SHIFT);
1044 goto out; 955 break;
1045 } 956 }
1046 957
1047 copied = btrfs_copy_from_user(pos, num_pages, 958 copied = btrfs_copy_from_user(pos, num_pages,
1048 write_bytes, pages, &i); 959 write_bytes, pages, i);
1049 960
1050 /* 961 /*
1051 * if we have trouble faulting in the pages, fall 962 * if we have trouble faulting in the pages, fall
@@ -1061,6 +972,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1061 PAGE_CACHE_SIZE - 1) >> 972 PAGE_CACHE_SIZE - 1) >>
1062 PAGE_CACHE_SHIFT; 973 PAGE_CACHE_SHIFT;
1063 974
975 /*
 976 * If we had a short copy we need to release the excess delalloc
977 * bytes we reserved. We need to increment outstanding_extents
978 * because btrfs_delalloc_release_space will decrement it, but
979 * we still have an outstanding extent for the chunk we actually
980 * managed to copy.
981 */
1064 if (num_pages > dirty_pages) { 982 if (num_pages > dirty_pages) {
1065 if (copied > 0) 983 if (copied > 0)
1066 atomic_inc( 984 atomic_inc(
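
On a short copy the write path keeps the reservation only for the pages
actually dirtied and hands the rest back; outstanding_extents is bumped first
because the release helper decrements it while the partially written chunk
still owns an extent. The page arithmetic, using the same rounding as the
dirty_pages computation above, with made-up sizes:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        unsigned long offset = 512;             /* pos & (PAGE_SIZE - 1) */
        unsigned long write_bytes = 3 * PAGE_SIZE;
        unsigned long copied = PAGE_SIZE;       /* short copy */

        unsigned long num_pages =
                (write_bytes + offset + PAGE_SIZE - 1) / PAGE_SIZE;
        unsigned long dirty_pages = copied ?
                (copied + offset + PAGE_SIZE - 1) / PAGE_SIZE : 0;

        /* the difference is what btrfs_delalloc_release_space gets back */
        printf("reserved %lu pages, dirtied %lu, release %lu\n",
               num_pages, dirty_pages, num_pages - dirty_pages);
        return 0;
}
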
@@ -1071,39 +989,157 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1071 } 989 }
1072 990
1073 if (copied > 0) { 991 if (copied > 0) {
1074 dirty_and_release_pages(NULL, root, file, pages, 992 ret = btrfs_dirty_pages(root, inode, pages,
1075 dirty_pages, pos, copied); 993 dirty_pages, pos, copied,
994 NULL);
995 if (ret) {
996 btrfs_delalloc_release_space(inode,
997 dirty_pages << PAGE_CACHE_SHIFT);
998 btrfs_drop_pages(pages, num_pages);
999 break;
1000 }
1076 } 1001 }
1077 1002
1078 btrfs_drop_pages(pages, num_pages); 1003 btrfs_drop_pages(pages, num_pages);
1079 1004
1080 if (copied > 0) { 1005 cond_resched();
1081 if (will_write) { 1006
1082 filemap_fdatawrite_range(inode->i_mapping, pos, 1007 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1083 pos + copied - 1); 1008 dirty_pages);
1084 } else { 1009 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1085 balance_dirty_pages_ratelimited_nr( 1010 btrfs_btree_balance_dirty(root, 1);
1086 inode->i_mapping, 1011 btrfs_throttle(root);
1087 dirty_pages);
1088 if (dirty_pages <
1089 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1090 btrfs_btree_balance_dirty(root, 1);
1091 btrfs_throttle(root);
1092 }
1093 }
1094 1012
1095 pos += copied; 1013 pos += copied;
1096 num_written += copied; 1014 num_written += copied;
1015 }
1097 1016
1098 cond_resched(); 1017 kfree(pages);
1018
1019 return num_written ? num_written : ret;
1020}
1021
1022static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1023 const struct iovec *iov,
1024 unsigned long nr_segs, loff_t pos,
1025 loff_t *ppos, size_t count, size_t ocount)
1026{
1027 struct file *file = iocb->ki_filp;
1028 struct inode *inode = fdentry(file)->d_inode;
1029 struct iov_iter i;
1030 ssize_t written;
1031 ssize_t written_buffered;
1032 loff_t endbyte;
1033 int err;
1034
1035 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
1036 count, ocount);
1037
1038 /*
1039 * the generic O_DIRECT will update in-memory i_size after the
1040 * DIOs are done. But our endio handlers that update the on
1041 * disk i_size never update past the in memory i_size. So we
1042 * need one more update here to catch any additions to the
1043 * file
1044 */
1045 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
1046 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
1047 mark_inode_dirty(inode);
1099 } 1048 }
1049
1050 if (written < 0 || written == count)
1051 return written;
1052
1053 pos += written;
1054 count -= written;
1055 iov_iter_init(&i, iov, nr_segs, count, written);
1056 written_buffered = __btrfs_buffered_write(file, &i, pos);
1057 if (written_buffered < 0) {
1058 err = written_buffered;
1059 goto out;
1060 }
1061 endbyte = pos + written_buffered - 1;
1062 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
1063 if (err)
1064 goto out;
1065 written += written_buffered;
1066 *ppos = pos + written_buffered;
1067 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1068 endbyte >> PAGE_CACHE_SHIFT);
1100out: 1069out:
1101 mutex_unlock(&inode->i_mutex); 1070 return written ? written : err;
1102 if (ret) 1071}
1103 err = ret;
1104 1072
1105 kfree(pages); 1073static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1106 *ppos = pos; 1074 const struct iovec *iov,
1075 unsigned long nr_segs, loff_t pos)
1076{
1077 struct file *file = iocb->ki_filp;
1078 struct inode *inode = fdentry(file)->d_inode;
1079 struct btrfs_root *root = BTRFS_I(inode)->root;
1080 loff_t *ppos = &iocb->ki_pos;
1081 ssize_t num_written = 0;
1082 ssize_t err = 0;
1083 size_t count, ocount;
1084
1085 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1086
1087 mutex_lock(&inode->i_mutex);
1088
1089 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
1090 if (err) {
1091 mutex_unlock(&inode->i_mutex);
1092 goto out;
1093 }
1094 count = ocount;
1095
1096 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1097 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1098 if (err) {
1099 mutex_unlock(&inode->i_mutex);
1100 goto out;
1101 }
1102
1103 if (count == 0) {
1104 mutex_unlock(&inode->i_mutex);
1105 goto out;
1106 }
1107
1108 err = file_remove_suid(file);
1109 if (err) {
1110 mutex_unlock(&inode->i_mutex);
1111 goto out;
1112 }
1113
1114 /*
1115 * If BTRFS flips readonly due to some impossible error
1116 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
1117 * although we have opened a file as writable, we have
1118 * to stop this write operation to ensure FS consistency.
1119 */
1120 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
1121 mutex_unlock(&inode->i_mutex);
1122 err = -EROFS;
1123 goto out;
1124 }
1125
1126 file_update_time(file);
1127 BTRFS_I(inode)->sequence++;
1128
1129 if (unlikely(file->f_flags & O_DIRECT)) {
1130 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1131 pos, ppos, count, ocount);
1132 } else {
1133 struct iov_iter i;
1134
1135 iov_iter_init(&i, iov, nr_segs, count, num_written);
1136
1137 num_written = __btrfs_buffered_write(file, &i, pos);
1138 if (num_written > 0)
1139 *ppos = pos + num_written;
1140 }
1141
1142 mutex_unlock(&inode->i_mutex);
1107 1143
1108 /* 1144 /*
1109 * we want to make sure fsync finds this change 1145 * we want to make sure fsync finds this change
@@ -1118,43 +1154,12 @@ out:
1118 * one running right now. 1154 * one running right now.
1119 */ 1155 */
1120 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1156 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1121 1157 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1122 if (num_written > 0 && will_write) { 1158 err = generic_write_sync(file, pos, num_written);
1123 struct btrfs_trans_handle *trans; 1159 if (err < 0 && num_written > 0)
1124
1125 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1126 if (err)
1127 num_written = err; 1160 num_written = err;
1128
1129 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1130 trans = btrfs_start_transaction(root, 0);
1131 if (IS_ERR(trans)) {
1132 num_written = PTR_ERR(trans);
1133 goto done;
1134 }
1135 mutex_lock(&inode->i_mutex);
1136 ret = btrfs_log_dentry_safe(trans, root,
1137 file->f_dentry);
1138 mutex_unlock(&inode->i_mutex);
1139 if (ret == 0) {
1140 ret = btrfs_sync_log(trans, root);
1141 if (ret == 0)
1142 btrfs_end_transaction(trans, root);
1143 else
1144 btrfs_commit_transaction(trans, root);
1145 } else if (ret != BTRFS_NO_LOG_SYNC) {
1146 btrfs_commit_transaction(trans, root);
1147 } else {
1148 btrfs_end_transaction(trans, root);
1149 }
1150 }
1151 if (file->f_flags & O_DIRECT && buffered) {
1152 invalidate_mapping_pages(inode->i_mapping,
1153 start_pos >> PAGE_CACHE_SHIFT,
1154 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1155 }
1156 } 1161 }
1157done: 1162out:
1158 current->backing_dev_info = NULL; 1163 current->backing_dev_info = NULL;
1159 return num_written ? num_written : err; 1164 return num_written ? num_written : err;
1160} 1165}
@@ -1197,6 +1202,7 @@ int btrfs_sync_file(struct file *file, int datasync)
1197 int ret = 0; 1202 int ret = 0;
1198 struct btrfs_trans_handle *trans; 1203 struct btrfs_trans_handle *trans;
1199 1204
1205 trace_btrfs_sync_file(file, datasync);
1200 1206
1201 /* we wait first, since the writeback may change the inode */ 1207 /* we wait first, since the writeback may change the inode */
1202 root->log_batch++; 1208 root->log_batch++;
@@ -1324,7 +1330,8 @@ static long btrfs_fallocate(struct file *file, int mode,
1324 goto out; 1330 goto out;
1325 1331
1326 if (alloc_start > inode->i_size) { 1332 if (alloc_start > inode->i_size) {
1327 ret = btrfs_cont_expand(inode, alloc_start); 1333 ret = btrfs_cont_expand(inode, i_size_read(inode),
1334 alloc_start);
1328 if (ret) 1335 if (ret)
1329 goto out; 1336 goto out;
1330 } 1337 }
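
The monolithic aio_write is now three functions: __btrfs_buffered_write() owns
the page loop, __btrfs_direct_write() wraps O_DIRECT and retries any unwritten
tail through the buffered path (then flushes and invalidates exactly that
range), and btrfs_file_aio_write() keeps the checks and dispatch. The fallback
control flow as a stub harness; both write functions below are fakes, and the
error handling of the real helper is elided:

#include <stdio.h>

static long direct_write(long count)   { return count / 2; }   /* short DIO */
static long buffered_write(long count) { return count; }       /* always ok */

static long write_with_fallback(long count)
{
        long written = direct_write(count);

        if (written < 0 || written == count)
                return written;         /* error, or O_DIRECT did it all */

        /* buffered retry of the tail; the kernel then runs
         * filemap_write_and_wait_range() + invalidate_mapping_pages()
         * over exactly this range so later DIO stays coherent */
        written += buffered_write(count - written);
        return written;
}

int main(void)
{
        printf("wrote %ld bytes\n", write_with_fallback(8192));
        return 0;
}
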
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a0390657451b..11d2e9cea09e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -24,6 +24,7 @@
24#include "free-space-cache.h" 24#include "free-space-cache.h"
25#include "transaction.h" 25#include "transaction.h"
26#include "disk-io.h" 26#include "disk-io.h"
27#include "extent_io.h"
27 28
28#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 29#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
29#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 30#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
@@ -81,6 +82,8 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
81 return ERR_PTR(-ENOENT); 82 return ERR_PTR(-ENOENT);
82 } 83 }
83 84
85 inode->i_mapping->flags &= ~__GFP_FS;
86
84 spin_lock(&block_group->lock); 87 spin_lock(&block_group->lock);
85 if (!root->fs_info->closing) { 88 if (!root->fs_info->closing) {
86 block_group->inode = igrab(inode); 89 block_group->inode = igrab(inode);
@@ -222,6 +225,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
222 u64 num_entries; 225 u64 num_entries;
223 u64 num_bitmaps; 226 u64 num_bitmaps;
224 u64 generation; 227 u64 generation;
228 u64 used = btrfs_block_group_used(&block_group->item);
225 u32 cur_crc = ~(u32)0; 229 u32 cur_crc = ~(u32)0;
226 pgoff_t index = 0; 230 pgoff_t index = 0;
227 unsigned long first_page_offset; 231 unsigned long first_page_offset;
@@ -393,7 +397,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
393 break; 397 break;
394 398
395 need_loop = 1; 399 need_loop = 1;
396 e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 400 e = kmem_cache_zalloc(btrfs_free_space_cachep,
401 GFP_NOFS);
397 if (!e) { 402 if (!e) {
398 kunmap(page); 403 kunmap(page);
399 unlock_page(page); 404 unlock_page(page);
@@ -405,7 +410,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
405 e->bytes = le64_to_cpu(entry->bytes); 410 e->bytes = le64_to_cpu(entry->bytes);
406 if (!e->bytes) { 411 if (!e->bytes) {
407 kunmap(page); 412 kunmap(page);
408 kfree(e); 413 kmem_cache_free(btrfs_free_space_cachep, e);
409 unlock_page(page); 414 unlock_page(page);
410 page_cache_release(page); 415 page_cache_release(page);
411 goto free_cache; 416 goto free_cache;
@@ -420,7 +425,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
420 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 425 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
421 if (!e->bitmap) { 426 if (!e->bitmap) {
422 kunmap(page); 427 kunmap(page);
423 kfree(e); 428 kmem_cache_free(
429 btrfs_free_space_cachep, e);
424 unlock_page(page); 430 unlock_page(page);
425 page_cache_release(page); 431 page_cache_release(page);
426 goto free_cache; 432 goto free_cache;
@@ -465,6 +471,17 @@ next:
465 index++; 471 index++;
466 } 472 }
467 473
474 spin_lock(&block_group->tree_lock);
475 if (block_group->free_space != (block_group->key.offset - used -
476 block_group->bytes_super)) {
477 spin_unlock(&block_group->tree_lock);
 478 printk(KERN_ERR "block group %llu has a wrong amount of free "
479 "space\n", block_group->key.objectid);
480 ret = 0;
481 goto free_cache;
482 }
483 spin_unlock(&block_group->tree_lock);
484
468 ret = 1; 485 ret = 1;
469out: 486out:
470 kfree(checksums); 487 kfree(checksums);
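The consistency check added above encodes a simple invariant: the cached free space must equal the block group's size minus its allocated bytes and super-mirror bytes, or the cache is stale and gets rebuilt. A minimal standalone sketch of that invariant (the parameter names are local assumptions mirroring the fields used above):

    #include <stdint.h>
    #include <stdio.h>

    /* Returns 1 when a loaded cache is self-consistent, 0 when it must
     * be discarded, mirroring: free != key.offset - used - bytes_super. */
    static int cache_is_consistent(uint64_t bg_size, uint64_t used,
                                   uint64_t bytes_super, uint64_t cached_free)
    {
            return cached_free == bg_size - used - bytes_super;
    }

    int main(void)
    {
            /* hypothetical 1GiB group, 256MiB allocated, 8MiB super mirror */
            uint64_t bg = 1ULL << 30, used = 256ULL << 20, sup = 8ULL << 20;

            printf("%d %d\n",
                   cache_is_consistent(bg, used, sup, bg - used - sup),
                   cache_is_consistent(bg, used, sup, bg - used - sup + 4096));
            return 0;
    }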
@@ -491,18 +508,23 @@ int btrfs_write_out_cache(struct btrfs_root *root,
491 struct inode *inode; 508 struct inode *inode;
492 struct rb_node *node; 509 struct rb_node *node;
493 struct list_head *pos, *n; 510 struct list_head *pos, *n;
511 struct page **pages;
494 struct page *page; 512 struct page *page;
495 struct extent_state *cached_state = NULL; 513 struct extent_state *cached_state = NULL;
514 struct btrfs_free_cluster *cluster = NULL;
515 struct extent_io_tree *unpin = NULL;
496 struct list_head bitmap_list; 516 struct list_head bitmap_list;
497 struct btrfs_key key; 517 struct btrfs_key key;
518 u64 start, end, len;
498 u64 bytes = 0; 519 u64 bytes = 0;
499 u32 *crc, *checksums; 520 u32 *crc, *checksums;
500 pgoff_t index = 0, last_index = 0;
501 unsigned long first_page_offset; 521 unsigned long first_page_offset;
502 int num_checksums; 522 int index = 0, num_pages = 0;
503 int entries = 0; 523 int entries = 0;
504 int bitmaps = 0; 524 int bitmaps = 0;
505 int ret = 0; 525 int ret = 0;
526 bool next_page = false;
527 bool out_of_space = false;
506 528
507 root = root->fs_info->tree_root; 529 root = root->fs_info->tree_root;
508 530
@@ -530,24 +552,43 @@ int btrfs_write_out_cache(struct btrfs_root *root,
530 return 0; 552 return 0;
531 } 553 }
532 554
533 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 555 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
556 PAGE_CACHE_SHIFT;
534 filemap_write_and_wait(inode->i_mapping); 557 filemap_write_and_wait(inode->i_mapping);
535 btrfs_wait_ordered_range(inode, inode->i_size & 558 btrfs_wait_ordered_range(inode, inode->i_size &
536 ~(root->sectorsize - 1), (u64)-1); 559 ~(root->sectorsize - 1), (u64)-1);
537 560
538 /* We need a checksum per page. */ 561 /* We need a checksum per page. */
539 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; 562 crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
540 crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
541 if (!crc) { 563 if (!crc) {
542 iput(inode); 564 iput(inode);
543 return 0; 565 return 0;
544 } 566 }
545 567
568 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
569 if (!pages) {
570 kfree(crc);
571 iput(inode);
572 return 0;
573 }
574
546 /* Since the first page has all of our checksums and our generation we 575 /* Since the first page has all of our checksums and our generation we
547 * need to calculate the offset into the page that we can start writing 576 * need to calculate the offset into the page that we can start writing
548 * our entries. 577 * our entries.
549 */ 578 */
550 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); 579 first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
580
581 /* Get the cluster for this block_group if it exists */
582 if (!list_empty(&block_group->cluster_list))
583 cluster = list_entry(block_group->cluster_list.next,
584 struct btrfs_free_cluster,
585 block_group_list);
586
587 /*
588 * We shouldn't have switched the pinned extents yet so this is the
589 * right one
590 */
591 unpin = root->fs_info->pinned_extents;
551 592
552 /* 593 /*
553 * Lock all pages first so we can lock the extent safely. 594 * Lock all pages first so we can lock the extent safely.
@@ -557,20 +598,18 @@ int btrfs_write_out_cache(struct btrfs_root *root,
557 * after find_get_page at this point. Just putting this here so people 598 * after find_get_page at this point. Just putting this here so people
558 * know and don't freak out. 599 * know and don't freak out.
559 */ 600 */
560 while (index <= last_index) { 601 while (index < num_pages) {
561 page = grab_cache_page(inode->i_mapping, index); 602 page = grab_cache_page(inode->i_mapping, index);
562 if (!page) { 603 if (!page) {
563 pgoff_t i = 0; 604 int i;
564 605
565 while (i < index) { 606 for (i = 0; i < num_pages; i++) {
566 page = find_get_page(inode->i_mapping, i); 607 unlock_page(pages[i]);
567 unlock_page(page); 608 page_cache_release(pages[i]);
568 page_cache_release(page);
569 page_cache_release(page);
570 i++;
571 } 609 }
572 goto out_free; 610 goto out_free;
573 } 611 }
612 pages[index] = page;
574 index++; 613 index++;
575 } 614 }
576 615
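Two hunks up, the truncating num_checksums division is replaced by a round-up page count, and first_page_offset is derived from the cache file's header layout: page 0 starts with one u32 CRC per page followed by a u64 generation. A small sketch of that arithmetic, assuming a 4096-byte PAGE_CACHE_SIZE:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_CACHE_SIZE 4096ULL

    int main(void)
    {
            uint64_t i_size = 9000; /* hypothetical cache file size */

            /* round up, so a partial tail page still gets a CRC slot */
            uint64_t num_pages =
                    (i_size + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE;

            /* header on page 0: per-page u32 CRCs, then the u64 generation */
            uint64_t first_page_offset =
                    sizeof(uint32_t) * num_pages + sizeof(uint64_t);

            printf("pages=%llu first_entry_offset=%llu\n",
                   (unsigned long long)num_pages,
                   (unsigned long long)first_page_offset);
            return 0;
    }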
@@ -578,6 +617,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
578 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 617 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
579 0, &cached_state, GFP_NOFS); 618 0, &cached_state, GFP_NOFS);
580 619
620 /*
621 * When searching for pinned extents, we need to start at our start
622 * offset.
623 */
624 start = block_group->key.objectid;
625
581 /* Write out the extent entries */ 626 /* Write out the extent entries */
582 do { 627 do {
583 struct btrfs_free_space_entry *entry; 628 struct btrfs_free_space_entry *entry;
@@ -585,18 +630,25 @@ int btrfs_write_out_cache(struct btrfs_root *root,
585 unsigned long offset = 0; 630 unsigned long offset = 0;
586 unsigned long start_offset = 0; 631 unsigned long start_offset = 0;
587 632
633 next_page = false;
634
588 if (index == 0) { 635 if (index == 0) {
589 start_offset = first_page_offset; 636 start_offset = first_page_offset;
590 offset = start_offset; 637 offset = start_offset;
591 } 638 }
592 639
593 page = find_get_page(inode->i_mapping, index); 640 if (index >= num_pages) {
641 out_of_space = true;
642 break;
643 }
644
645 page = pages[index];
594 646
595 addr = kmap(page); 647 addr = kmap(page);
596 entry = addr + start_offset; 648 entry = addr + start_offset;
597 649
598 memset(addr, 0, PAGE_CACHE_SIZE); 650 memset(addr, 0, PAGE_CACHE_SIZE);
599 while (1) { 651 while (node && !next_page) {
600 struct btrfs_free_space *e; 652 struct btrfs_free_space *e;
601 653
602 e = rb_entry(node, struct btrfs_free_space, offset_index); 654 e = rb_entry(node, struct btrfs_free_space, offset_index);
@@ -612,12 +664,49 @@ int btrfs_write_out_cache(struct btrfs_root *root,
612 entry->type = BTRFS_FREE_SPACE_EXTENT; 664 entry->type = BTRFS_FREE_SPACE_EXTENT;
613 } 665 }
614 node = rb_next(node); 666 node = rb_next(node);
615 if (!node) 667 if (!node && cluster) {
616 break; 668 node = rb_first(&cluster->root);
669 cluster = NULL;
670 }
617 offset += sizeof(struct btrfs_free_space_entry); 671 offset += sizeof(struct btrfs_free_space_entry);
618 if (offset + sizeof(struct btrfs_free_space_entry) >= 672 if (offset + sizeof(struct btrfs_free_space_entry) >=
619 PAGE_CACHE_SIZE) 673 PAGE_CACHE_SIZE)
674 next_page = true;
675 entry++;
676 }
677
678 /*
679 * We want to add any pinned extents to our free space cache
680 * so we don't leak the space
681 */
682 while (!next_page && (start < block_group->key.objectid +
683 block_group->key.offset)) {
684 ret = find_first_extent_bit(unpin, start, &start, &end,
685 EXTENT_DIRTY);
686 if (ret) {
687 ret = 0;
688 break;
689 }
690
691 /* This pinned extent is out of our range */
692 if (start >= block_group->key.objectid +
693 block_group->key.offset)
620 break; 694 break;
695
696 len = block_group->key.objectid +
697 block_group->key.offset - start;
698 len = min(len, end + 1 - start);
699
700 entries++;
701 entry->offset = cpu_to_le64(start);
702 entry->bytes = cpu_to_le64(len);
703 entry->type = BTRFS_FREE_SPACE_EXTENT;
704
705 start = end + 1;
706 offset += sizeof(struct btrfs_free_space_entry);
707 if (offset + sizeof(struct btrfs_free_space_entry) >=
708 PAGE_CACHE_SIZE)
709 next_page = true;
621 entry++; 710 entry++;
622 } 711 }
623 *crc = ~(u32)0; 712 *crc = ~(u32)0;
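The pinned-extent loop above clamps every dirty range it finds to the block group before emitting a cache entry, so ranges straddling the group's end are truncated rather than recorded past the group. A hedged sketch of just that clamping step:

    #include <stdint.h>
    #include <stdio.h>

    /* Clamp a found extent [start, end] to the group [bg_start,
     * bg_start + bg_len); returns 0 when the extent lies fully outside. */
    static uint64_t clamp_to_block_group(uint64_t bg_start, uint64_t bg_len,
                                         uint64_t start, uint64_t end)
    {
            uint64_t bg_end = bg_start + bg_len;
            uint64_t len;

            if (start >= bg_end)
                    return 0;
            len = bg_end - start;
            if (len > end + 1 - start)
                    len = end + 1 - start;
            return len;
    }

    int main(void)
    {
            /* hypothetical 1MiB group at 1MiB; extent runs past its end */
            uint64_t len = clamp_to_block_group(1 << 20, 1 << 20,
                                                (1 << 20) + 4096,
                                                (3 << 20) - 1);
            printf("%llu\n", (unsigned long long)len); /* 1044480 */
            return 0;
    }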
@@ -630,25 +719,8 @@ int btrfs_write_out_cache(struct btrfs_root *root,
630 719
631 bytes += PAGE_CACHE_SIZE; 720 bytes += PAGE_CACHE_SIZE;
632 721
633 ClearPageChecked(page);
634 set_page_extent_mapped(page);
635 SetPageUptodate(page);
636 set_page_dirty(page);
637
638 /*
639 * We need to release our reference we got for grab_cache_page,
640 * except for the first page which will hold our checksums, we
641 * do that below.
642 */
643 if (index != 0) {
644 unlock_page(page);
645 page_cache_release(page);
646 }
647
648 page_cache_release(page);
649
650 index++; 722 index++;
651 } while (node); 723 } while (node || next_page);
652 724
653 /* Write out the bitmaps */ 725 /* Write out the bitmaps */
654 list_for_each_safe(pos, n, &bitmap_list) { 726 list_for_each_safe(pos, n, &bitmap_list) {
@@ -656,7 +728,11 @@ int btrfs_write_out_cache(struct btrfs_root *root,
656 struct btrfs_free_space *entry = 728 struct btrfs_free_space *entry =
657 list_entry(pos, struct btrfs_free_space, list); 729 list_entry(pos, struct btrfs_free_space, list);
658 730
659 page = find_get_page(inode->i_mapping, index); 731 if (index >= num_pages) {
732 out_of_space = true;
733 break;
734 }
735 page = pages[index];
660 736
661 addr = kmap(page); 737 addr = kmap(page);
662 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); 738 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
@@ -667,64 +743,58 @@ int btrfs_write_out_cache(struct btrfs_root *root,
667 crc++; 743 crc++;
668 bytes += PAGE_CACHE_SIZE; 744 bytes += PAGE_CACHE_SIZE;
669 745
670 ClearPageChecked(page);
671 set_page_extent_mapped(page);
672 SetPageUptodate(page);
673 set_page_dirty(page);
674 unlock_page(page);
675 page_cache_release(page);
676 page_cache_release(page);
677 list_del_init(&entry->list); 746 list_del_init(&entry->list);
678 index++; 747 index++;
679 } 748 }
680 749
750 if (out_of_space) {
751 btrfs_drop_pages(pages, num_pages);
752 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
753 i_size_read(inode) - 1, &cached_state,
754 GFP_NOFS);
755 ret = 0;
756 goto out_free;
757 }
758
681 /* Zero out the rest of the pages just to make sure */ 759 /* Zero out the rest of the pages just to make sure */
682 while (index <= last_index) { 760 while (index < num_pages) {
683 void *addr; 761 void *addr;
684 762
685 page = find_get_page(inode->i_mapping, index); 763 page = pages[index];
686
687 addr = kmap(page); 764 addr = kmap(page);
688 memset(addr, 0, PAGE_CACHE_SIZE); 765 memset(addr, 0, PAGE_CACHE_SIZE);
689 kunmap(page); 766 kunmap(page);
690 ClearPageChecked(page);
691 set_page_extent_mapped(page);
692 SetPageUptodate(page);
693 set_page_dirty(page);
694 unlock_page(page);
695 page_cache_release(page);
696 page_cache_release(page);
697 bytes += PAGE_CACHE_SIZE; 767 bytes += PAGE_CACHE_SIZE;
698 index++; 768 index++;
699 } 769 }
700 770
701 btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
702
703 /* Write the checksums and trans id to the first page */ 771 /* Write the checksums and trans id to the first page */
704 { 772 {
705 void *addr; 773 void *addr;
706 u64 *gen; 774 u64 *gen;
707 775
708 page = find_get_page(inode->i_mapping, 0); 776 page = pages[0];
709 777
710 addr = kmap(page); 778 addr = kmap(page);
711 memcpy(addr, checksums, sizeof(u32) * num_checksums); 779 memcpy(addr, checksums, sizeof(u32) * num_pages);
712 gen = addr + (sizeof(u32) * num_checksums); 780 gen = addr + (sizeof(u32) * num_pages);
713 *gen = trans->transid; 781 *gen = trans->transid;
714 kunmap(page); 782 kunmap(page);
715 ClearPageChecked(page);
716 set_page_extent_mapped(page);
717 SetPageUptodate(page);
718 set_page_dirty(page);
719 unlock_page(page);
720 page_cache_release(page);
721 page_cache_release(page);
722 } 783 }
723 BTRFS_I(inode)->generation = trans->transid;
724 784
785 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
786 bytes, &cached_state);
787 btrfs_drop_pages(pages, num_pages);
725 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 788 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
726 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 789 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
727 790
791 if (ret) {
792 ret = 0;
793 goto out_free;
794 }
795
796 BTRFS_I(inode)->generation = trans->transid;
797
728 filemap_write_and_wait(inode->i_mapping); 798 filemap_write_and_wait(inode->i_mapping);
729 799
730 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 800 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -775,6 +845,7 @@ out_free:
775 BTRFS_I(inode)->generation = 0; 845 BTRFS_I(inode)->generation = 0;
776 } 846 }
777 kfree(checksums); 847 kfree(checksums);
848 kfree(pages);
778 btrfs_update_inode(trans, root, inode); 849 btrfs_update_inode(trans, root, inode);
779 iput(inode); 850 iput(inode);
780 return ret; 851 return ret;
@@ -1187,7 +1258,7 @@ static void free_bitmap(struct btrfs_block_group_cache *block_group,
1187{ 1258{
1188 unlink_free_space(block_group, bitmap_info); 1259 unlink_free_space(block_group, bitmap_info);
1189 kfree(bitmap_info->bitmap); 1260 kfree(bitmap_info->bitmap);
1190 kfree(bitmap_info); 1261 kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1191 block_group->total_bitmaps--; 1262 block_group->total_bitmaps--;
1192 recalculate_thresholds(block_group); 1263 recalculate_thresholds(block_group);
1193} 1264}
@@ -1285,9 +1356,22 @@ static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
1285 * If we are below the extents threshold then we can add this as an 1356 * If we are below the extents threshold then we can add this as an
1286 * extent, and don't have to deal with the bitmap 1357 * extent, and don't have to deal with the bitmap
1287 */ 1358 */
1288 if (block_group->free_extents < block_group->extents_thresh && 1359 if (block_group->free_extents < block_group->extents_thresh) {
1289 info->bytes > block_group->sectorsize * 4) 1360 /*
1290 return 0; 1361 * If this block group has some small extents we don't want to
1362 * use up all of our free slots in the cache with them, we want
1363 * to reserve them to larger extents, however if we have plent
1364 * of cache left then go ahead an dadd them, no sense in adding
1365 * the overhead of a bitmap if we don't have to.
1366 */
1367 if (info->bytes <= block_group->sectorsize * 4) {
1368 if (block_group->free_extents * 2 <=
1369 block_group->extents_thresh)
1370 return 0;
1371 } else {
1372 return 0;
1373 }
1374 }
1291 1375
1292 /* 1376 /*
1293 * some block groups are so tiny they can't be enveloped by a bitmap, so 1377 * some block groups are so tiny they can't be enveloped by a bitmap, so
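The reworked threshold above keeps handing out extent slots to small free ranges (at most four sectors) only while the cache is under half its threshold; larger ranges may use slots up to the full threshold before spilling into bitmaps. A standalone predicate expressing that policy (returning 0 means "store as a plain extent entry"):

    #include <stdint.h>
    #include <stdio.h>

    static int use_bitmap(uint64_t bytes, uint32_t sectorsize,
                          int free_extents, int extents_thresh)
    {
            if (free_extents < extents_thresh) {
                    if (bytes <= (uint64_t)sectorsize * 4) {
                            /* small ranges get slots only while the
                             * cache is less than half full */
                            if (free_extents * 2 <= extents_thresh)
                                    return 0;
                    } else {
                            return 0;
                    }
            }
            return 1;
    }

    int main(void)
    {
            /* hypothetical 4K sectors, threshold of 100 cached extents */
            printf("%d %d %d\n",
                   use_bitmap(8192, 4096, 10, 100),     /* small, near-empty: 0 */
                   use_bitmap(8192, 4096, 60, 100),     /* small, half full:  1 */
                   use_bitmap(1 << 20, 4096, 60, 100)); /* large, under:      0 */
            return 0;
    }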
@@ -1342,8 +1426,8 @@ new_bitmap:
1342 1426
1343 /* no pre-allocated info, allocate a new one */ 1427 /* no pre-allocated info, allocate a new one */
1344 if (!info) { 1428 if (!info) {
1345 info = kzalloc(sizeof(struct btrfs_free_space), 1429 info = kmem_cache_zalloc(btrfs_free_space_cachep,
1346 GFP_NOFS); 1430 GFP_NOFS);
1347 if (!info) { 1431 if (!info) {
1348 spin_lock(&block_group->tree_lock); 1432 spin_lock(&block_group->tree_lock);
1349 ret = -ENOMEM; 1433 ret = -ENOMEM;
@@ -1365,7 +1449,7 @@ out:
1365 if (info) { 1449 if (info) {
1366 if (info->bitmap) 1450 if (info->bitmap)
1367 kfree(info->bitmap); 1451 kfree(info->bitmap);
1368 kfree(info); 1452 kmem_cache_free(btrfs_free_space_cachep, info);
1369 } 1453 }
1370 1454
1371 return ret; 1455 return ret;
@@ -1398,7 +1482,7 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
1398 else 1482 else
1399 __unlink_free_space(block_group, right_info); 1483 __unlink_free_space(block_group, right_info);
1400 info->bytes += right_info->bytes; 1484 info->bytes += right_info->bytes;
1401 kfree(right_info); 1485 kmem_cache_free(btrfs_free_space_cachep, right_info);
1402 merged = true; 1486 merged = true;
1403 } 1487 }
1404 1488
@@ -1410,7 +1494,7 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
1410 __unlink_free_space(block_group, left_info); 1494 __unlink_free_space(block_group, left_info);
1411 info->offset = left_info->offset; 1495 info->offset = left_info->offset;
1412 info->bytes += left_info->bytes; 1496 info->bytes += left_info->bytes;
1413 kfree(left_info); 1497 kmem_cache_free(btrfs_free_space_cachep, left_info);
1414 merged = true; 1498 merged = true;
1415 } 1499 }
1416 1500
@@ -1423,7 +1507,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1423 struct btrfs_free_space *info; 1507 struct btrfs_free_space *info;
1424 int ret = 0; 1508 int ret = 0;
1425 1509
1426 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 1510 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
1427 if (!info) 1511 if (!info)
1428 return -ENOMEM; 1512 return -ENOMEM;
1429 1513
@@ -1450,7 +1534,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1450link: 1534link:
1451 ret = link_free_space(block_group, info); 1535 ret = link_free_space(block_group, info);
1452 if (ret) 1536 if (ret)
1453 kfree(info); 1537 kmem_cache_free(btrfs_free_space_cachep, info);
1454out: 1538out:
1455 spin_unlock(&block_group->tree_lock); 1539 spin_unlock(&block_group->tree_lock);
1456 1540
@@ -1520,7 +1604,7 @@ again:
1520 kfree(info->bitmap); 1604 kfree(info->bitmap);
1521 block_group->total_bitmaps--; 1605 block_group->total_bitmaps--;
1522 } 1606 }
1523 kfree(info); 1607 kmem_cache_free(btrfs_free_space_cachep, info);
1524 goto out_lock; 1608 goto out_lock;
1525 } 1609 }
1526 1610
@@ -1556,7 +1640,7 @@ again:
1556 /* the hole we're creating ends at the end 1640 /* the hole we're creating ends at the end
1557 * of the info struct, just free the info 1641 * of the info struct, just free the info
1558 */ 1642 */
1559 kfree(info); 1643 kmem_cache_free(btrfs_free_space_cachep, info);
1560 } 1644 }
1561 spin_unlock(&block_group->tree_lock); 1645 spin_unlock(&block_group->tree_lock);
1562 1646
@@ -1629,30 +1713,28 @@ __btrfs_return_cluster_to_free_space(
1629{ 1713{
1630 struct btrfs_free_space *entry; 1714 struct btrfs_free_space *entry;
1631 struct rb_node *node; 1715 struct rb_node *node;
1632 bool bitmap;
1633 1716
1634 spin_lock(&cluster->lock); 1717 spin_lock(&cluster->lock);
1635 if (cluster->block_group != block_group) 1718 if (cluster->block_group != block_group)
1636 goto out; 1719 goto out;
1637 1720
1638 bitmap = cluster->points_to_bitmap;
1639 cluster->block_group = NULL; 1721 cluster->block_group = NULL;
1640 cluster->window_start = 0; 1722 cluster->window_start = 0;
1641 list_del_init(&cluster->block_group_list); 1723 list_del_init(&cluster->block_group_list);
1642 cluster->points_to_bitmap = false;
1643
1644 if (bitmap)
1645 goto out;
1646 1724
1647 node = rb_first(&cluster->root); 1725 node = rb_first(&cluster->root);
1648 while (node) { 1726 while (node) {
1727 bool bitmap;
1728
1649 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1729 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1650 node = rb_next(&entry->offset_index); 1730 node = rb_next(&entry->offset_index);
1651 rb_erase(&entry->offset_index, &cluster->root); 1731 rb_erase(&entry->offset_index, &cluster->root);
1652 BUG_ON(entry->bitmap); 1732
1653 try_merge_free_space(block_group, entry, false); 1733 bitmap = (entry->bitmap != NULL);
1734 if (!bitmap)
1735 try_merge_free_space(block_group, entry, false);
1654 tree_insert_offset(&block_group->free_space_offset, 1736 tree_insert_offset(&block_group->free_space_offset,
1655 entry->offset, &entry->offset_index, 0); 1737 entry->offset, &entry->offset_index, bitmap);
1656 } 1738 }
1657 cluster->root = RB_ROOT; 1739 cluster->root = RB_ROOT;
1658 1740
@@ -1689,7 +1771,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
1689 unlink_free_space(block_group, info); 1771 unlink_free_space(block_group, info);
1690 if (info->bitmap) 1772 if (info->bitmap)
1691 kfree(info->bitmap); 1773 kfree(info->bitmap);
1692 kfree(info); 1774 kmem_cache_free(btrfs_free_space_cachep, info);
1693 if (need_resched()) { 1775 if (need_resched()) {
1694 spin_unlock(&block_group->tree_lock); 1776 spin_unlock(&block_group->tree_lock);
1695 cond_resched(); 1777 cond_resched();
@@ -1722,7 +1804,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
1722 entry->offset += bytes; 1804 entry->offset += bytes;
1723 entry->bytes -= bytes; 1805 entry->bytes -= bytes;
1724 if (!entry->bytes) 1806 if (!entry->bytes)
1725 kfree(entry); 1807 kmem_cache_free(btrfs_free_space_cachep, entry);
1726 else 1808 else
1727 link_free_space(block_group, entry); 1809 link_free_space(block_group, entry);
1728 } 1810 }
@@ -1775,50 +1857,24 @@ int btrfs_return_cluster_to_free_space(
1775 1857
1776static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, 1858static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
1777 struct btrfs_free_cluster *cluster, 1859 struct btrfs_free_cluster *cluster,
1860 struct btrfs_free_space *entry,
1778 u64 bytes, u64 min_start) 1861 u64 bytes, u64 min_start)
1779{ 1862{
1780 struct btrfs_free_space *entry;
1781 int err; 1863 int err;
1782 u64 search_start = cluster->window_start; 1864 u64 search_start = cluster->window_start;
1783 u64 search_bytes = bytes; 1865 u64 search_bytes = bytes;
1784 u64 ret = 0; 1866 u64 ret = 0;
1785 1867
1786 spin_lock(&block_group->tree_lock);
1787 spin_lock(&cluster->lock);
1788
1789 if (!cluster->points_to_bitmap)
1790 goto out;
1791
1792 if (cluster->block_group != block_group)
1793 goto out;
1794
1795 /*
1796 * search_start is the beginning of the bitmap, but at some point it may
1797 * be a good idea to point to the actual start of the free area in the
1798 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
1799 * to 1 to make sure we get the bitmap entry
1800 */
1801 entry = tree_search_offset(block_group,
1802 offset_to_bitmap(block_group, search_start),
1803 1, 0);
1804 if (!entry || !entry->bitmap)
1805 goto out;
1806
1807 search_start = min_start; 1868 search_start = min_start;
1808 search_bytes = bytes; 1869 search_bytes = bytes;
1809 1870
1810 err = search_bitmap(block_group, entry, &search_start, 1871 err = search_bitmap(block_group, entry, &search_start,
1811 &search_bytes); 1872 &search_bytes);
1812 if (err) 1873 if (err)
1813 goto out; 1874 return 0;
1814 1875
1815 ret = search_start; 1876 ret = search_start;
1816 bitmap_clear_bits(block_group, entry, ret, bytes); 1877 bitmap_clear_bits(block_group, entry, ret, bytes);
1817 if (entry->bytes == 0)
1818 free_bitmap(block_group, entry);
1819out:
1820 spin_unlock(&cluster->lock);
1821 spin_unlock(&block_group->tree_lock);
1822 1878
1823 return ret; 1879 return ret;
1824} 1880}
@@ -1836,10 +1892,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1836 struct rb_node *node; 1892 struct rb_node *node;
1837 u64 ret = 0; 1893 u64 ret = 0;
1838 1894
1839 if (cluster->points_to_bitmap)
1840 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
1841 min_start);
1842
1843 spin_lock(&cluster->lock); 1895 spin_lock(&cluster->lock);
1844 if (bytes > cluster->max_size) 1896 if (bytes > cluster->max_size)
1845 goto out; 1897 goto out;
@@ -1852,9 +1904,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1852 goto out; 1904 goto out;
1853 1905
1854 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1906 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1855
1856 while(1) { 1907 while(1) {
1857 if (entry->bytes < bytes || entry->offset < min_start) { 1908 if (entry->bytes < bytes ||
1909 (!entry->bitmap && entry->offset < min_start)) {
1858 struct rb_node *node; 1910 struct rb_node *node;
1859 1911
1860 node = rb_next(&entry->offset_index); 1912 node = rb_next(&entry->offset_index);
@@ -1864,10 +1916,27 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1864 offset_index); 1916 offset_index);
1865 continue; 1917 continue;
1866 } 1918 }
1867 ret = entry->offset;
1868 1919
1869 entry->offset += bytes; 1920 if (entry->bitmap) {
1870 entry->bytes -= bytes; 1921 ret = btrfs_alloc_from_bitmap(block_group,
1922 cluster, entry, bytes,
1923 min_start);
1924 if (ret == 0) {
1925 struct rb_node *node;
1926 node = rb_next(&entry->offset_index);
1927 if (!node)
1928 break;
1929 entry = rb_entry(node, struct btrfs_free_space,
1930 offset_index);
1931 continue;
1932 }
1933 } else {
1934
1935 ret = entry->offset;
1936
1937 entry->offset += bytes;
1938 entry->bytes -= bytes;
1939 }
1871 1940
1872 if (entry->bytes == 0) 1941 if (entry->bytes == 0)
1873 rb_erase(&entry->offset_index, &cluster->root); 1942 rb_erase(&entry->offset_index, &cluster->root);
@@ -1884,7 +1953,12 @@ out:
1884 block_group->free_space -= bytes; 1953 block_group->free_space -= bytes;
1885 if (entry->bytes == 0) { 1954 if (entry->bytes == 0) {
1886 block_group->free_extents--; 1955 block_group->free_extents--;
1887 kfree(entry); 1956 if (entry->bitmap) {
1957 kfree(entry->bitmap);
1958 block_group->total_bitmaps--;
1959 recalculate_thresholds(block_group);
1960 }
1961 kmem_cache_free(btrfs_free_space_cachep, entry);
1888 } 1962 }
1889 1963
1890 spin_unlock(&block_group->tree_lock); 1964 spin_unlock(&block_group->tree_lock);
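With clusters now able to hold bitmap entries, the two hunks above split allocation by entry type: bitmap entries are searched bit by bit (and their backing page freed once drained), while plain extents are carved off the front. A simplified sketch of that dispatch, where alloc_from_bitmap() is a local stand-in for the real bitmap search:

    #include <stdint.h>
    #include <stdio.h>

    struct entry {
            uint64_t offset;
            uint64_t bytes;
            int      is_bitmap;
    };

    /* stand-in for the bitmap path; real code scans for a free bit run */
    static uint64_t alloc_from_bitmap(struct entry *e, uint64_t bytes)
    {
            if (e->bytes < bytes)
                    return 0;       /* 0 means "try the next entry" */
            e->bytes -= bytes;
            return e->offset;       /* simplified: hand out the front */
    }

    static uint64_t alloc_from_entry(struct entry *e, uint64_t bytes)
    {
            uint64_t ret;

            if (e->is_bitmap)
                    return alloc_from_bitmap(e, bytes);

            /* plain extent: carve the allocation off the front */
            ret = e->offset;
            e->offset += bytes;
            e->bytes -= bytes;
            return ret;
    }

    int main(void)
    {
            struct entry ext = { 4096, 65536, 0 };
            struct entry bmp = { 1 << 20, 65536, 1 };

            printf("%llu %llu\n",
                   (unsigned long long)alloc_from_entry(&ext, 8192),
                   (unsigned long long)alloc_from_entry(&bmp, 8192));
            return 0;
    }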
@@ -1904,12 +1978,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1904 unsigned long found_bits; 1978 unsigned long found_bits;
1905 unsigned long start = 0; 1979 unsigned long start = 0;
1906 unsigned long total_found = 0; 1980 unsigned long total_found = 0;
1981 int ret;
1907 bool found = false; 1982 bool found = false;
1908 1983
1909 i = offset_to_bit(entry->offset, block_group->sectorsize, 1984 i = offset_to_bit(entry->offset, block_group->sectorsize,
1910 max_t(u64, offset, entry->offset)); 1985 max_t(u64, offset, entry->offset));
1911 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 1986 search_bits = bytes_to_bits(bytes, block_group->sectorsize);
1912 total_bits = bytes_to_bits(bytes, block_group->sectorsize); 1987 total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1913 1988
1914again: 1989again:
1915 found_bits = 0; 1990 found_bits = 0;
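The swapped arguments above were the bug: after the fix, the contiguous-run target (search_bits) comes from bytes and the total-window target (total_bits) from min_bytes. The conversions themselves are plain sector arithmetic; a sketch, assuming 4K sectors and truncating division:

    #include <stdint.h>
    #include <stdio.h>

    /* number of bitmap bits covering @bytes, one bit per sector */
    static uint64_t bytes_to_bits(uint64_t bytes, uint32_t sectorsize)
    {
            return bytes / sectorsize;
    }

    /* bit index of @offset inside a bitmap starting at @bitmap_start */
    static uint64_t offset_to_bit(uint64_t bitmap_start, uint32_t sectorsize,
                                  uint64_t offset)
    {
            return (offset - bitmap_start) / sectorsize;
    }

    int main(void)
    {
            printf("%llu %llu\n",
                   (unsigned long long)bytes_to_bits(1 << 20, 4096),      /* 256 */
                   (unsigned long long)offset_to_bit(0, 4096, 64 << 10)); /* 16 */
            return 0;
    }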
@@ -1926,7 +2001,7 @@ again:
1926 } 2001 }
1927 2002
1928 if (!found_bits) 2003 if (!found_bits)
1929 return -1; 2004 return -ENOSPC;
1930 2005
1931 if (!found) { 2006 if (!found) {
1932 start = i; 2007 start = i;
@@ -1950,189 +2025,208 @@ again:
1950 2025
1951 cluster->window_start = start * block_group->sectorsize + 2026 cluster->window_start = start * block_group->sectorsize +
1952 entry->offset; 2027 entry->offset;
1953 cluster->points_to_bitmap = true; 2028 rb_erase(&entry->offset_index, &block_group->free_space_offset);
2029 ret = tree_insert_offset(&cluster->root, entry->offset,
2030 &entry->offset_index, 1);
2031 BUG_ON(ret);
1954 2032
1955 return 0; 2033 return 0;
1956} 2034}
1957 2035
1958/* 2036/*
1959 * here we try to find a cluster of blocks in a block group. The goal 2037 * This searches the block group for just extents to fill the cluster with.
1960 * is to find at least bytes free and up to empty_size + bytes free.
1961 * We might not find them all in one contiguous area.
1962 *
1963 * returns zero and sets up cluster if things worked out, otherwise
1964 * it returns -enospc
1965 */ 2038 */
1966int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 2039static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
1967 struct btrfs_root *root, 2040 struct btrfs_free_cluster *cluster,
1968 struct btrfs_block_group_cache *block_group, 2041 u64 offset, u64 bytes, u64 min_bytes)
1969 struct btrfs_free_cluster *cluster,
1970 u64 offset, u64 bytes, u64 empty_size)
1971{ 2042{
2043 struct btrfs_free_space *first = NULL;
1972 struct btrfs_free_space *entry = NULL; 2044 struct btrfs_free_space *entry = NULL;
2045 struct btrfs_free_space *prev = NULL;
2046 struct btrfs_free_space *last;
1973 struct rb_node *node; 2047 struct rb_node *node;
1974 struct btrfs_free_space *next;
1975 struct btrfs_free_space *last = NULL;
1976 u64 min_bytes;
1977 u64 window_start; 2048 u64 window_start;
1978 u64 window_free; 2049 u64 window_free;
1979 u64 max_extent = 0; 2050 u64 max_extent;
1980 bool found_bitmap = false; 2051 u64 max_gap = 128 * 1024;
1981 int ret;
1982 2052
1983 /* for metadata, allow allocates with more holes */ 2053 entry = tree_search_offset(block_group, offset, 0, 1);
1984 if (btrfs_test_opt(root, SSD_SPREAD)) { 2054 if (!entry)
1985 min_bytes = bytes + empty_size; 2055 return -ENOSPC;
1986 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1987 /*
1988 * we want to do larger allocations when we are
1989 * flushing out the delayed refs, it helps prevent
1990 * making more work as we go along.
1991 */
1992 if (trans->transaction->delayed_refs.flushing)
1993 min_bytes = max(bytes, (bytes + empty_size) >> 1);
1994 else
1995 min_bytes = max(bytes, (bytes + empty_size) >> 4);
1996 } else
1997 min_bytes = max(bytes, (bytes + empty_size) >> 2);
1998
1999 spin_lock(&block_group->tree_lock);
2000 spin_lock(&cluster->lock);
2001
2002 /* someone already found a cluster, hooray */
2003 if (cluster->block_group) {
2004 ret = 0;
2005 goto out;
2006 }
2007again:
2008 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
2009 if (!entry) {
2010 ret = -ENOSPC;
2011 goto out;
2012 }
2013 2056
2014 /* 2057 /*
2015 * If found_bitmap is true, we exhausted our search for extent entries, 2058 * We don't want bitmaps, so just move along until we find a normal
2016 * and we just want to search all of the bitmaps that we can find, and 2059 * extent entry.
2017 * ignore any extent entries we find.
2018 */ 2060 */
2019 while (entry->bitmap || found_bitmap || 2061 while (entry->bitmap) {
2020 (!entry->bitmap && entry->bytes < min_bytes)) { 2062 node = rb_next(&entry->offset_index);
2021 struct rb_node *node = rb_next(&entry->offset_index); 2063 if (!node)
2022 2064 return -ENOSPC;
2023 if (entry->bitmap && entry->bytes > bytes + empty_size) {
2024 ret = btrfs_bitmap_cluster(block_group, entry, cluster,
2025 offset, bytes + empty_size,
2026 min_bytes);
2027 if (!ret)
2028 goto got_it;
2029 }
2030
2031 if (!node) {
2032 ret = -ENOSPC;
2033 goto out;
2034 }
2035 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2065 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2036 } 2066 }
2037 2067
2038 /*
2039 * We already searched all the extent entries from the passed in offset
2040 * to the end and didn't find enough space for the cluster, and we also
2041 * didn't find any bitmaps that met our criteria, just go ahead and exit
2042 */
2043 if (found_bitmap) {
2044 ret = -ENOSPC;
2045 goto out;
2046 }
2047
2048 cluster->points_to_bitmap = false;
2049 window_start = entry->offset; 2068 window_start = entry->offset;
2050 window_free = entry->bytes; 2069 window_free = entry->bytes;
2051 last = entry;
2052 max_extent = entry->bytes; 2070 max_extent = entry->bytes;
2071 first = entry;
2072 last = entry;
2073 prev = entry;
2053 2074
2054 while (1) { 2075 while (window_free <= min_bytes) {
2055 /* out window is just right, lets fill it */ 2076 node = rb_next(&entry->offset_index);
2056 if (window_free >= bytes + empty_size) 2077 if (!node)
2057 break; 2078 return -ENOSPC;
2058 2079 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2059 node = rb_next(&last->offset_index);
2060 if (!node) {
2061 if (found_bitmap)
2062 goto again;
2063 ret = -ENOSPC;
2064 goto out;
2065 }
2066 next = rb_entry(node, struct btrfs_free_space, offset_index);
2067 2080
2068 /* 2081 if (entry->bitmap)
2069 * we found a bitmap, so if this search doesn't result in a
2070 * cluster, we know to go and search again for the bitmaps and
2071 * start looking for space there
2072 */
2073 if (next->bitmap) {
2074 if (!found_bitmap)
2075 offset = next->offset;
2076 found_bitmap = true;
2077 last = next;
2078 continue; 2082 continue;
2079 }
2080
2081 /* 2083 /*
2082 * we haven't filled the empty size and the window is 2084 * we haven't filled the empty size and the window is
2083 * very large. reset and try again 2085 * very large. reset and try again
2084 */ 2086 */
2085 if (next->offset - (last->offset + last->bytes) > 128 * 1024 || 2087 if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
2086 next->offset - window_start > (bytes + empty_size) * 2) { 2088 entry->offset - window_start > (min_bytes * 2)) {
2087 entry = next; 2089 first = entry;
2088 window_start = entry->offset; 2090 window_start = entry->offset;
2089 window_free = entry->bytes; 2091 window_free = entry->bytes;
2090 last = entry; 2092 last = entry;
2091 max_extent = entry->bytes; 2093 max_extent = entry->bytes;
2092 } else { 2094 } else {
2093 last = next; 2095 last = entry;
2094 window_free += next->bytes; 2096 window_free += entry->bytes;
2095 if (entry->bytes > max_extent) 2097 if (entry->bytes > max_extent)
2096 max_extent = entry->bytes; 2098 max_extent = entry->bytes;
2097 } 2099 }
2100 prev = entry;
2098 } 2101 }
2099 2102
2100 cluster->window_start = entry->offset; 2103 cluster->window_start = first->offset;
2104
2105 node = &first->offset_index;
2101 2106
2102 /* 2107 /*
2103 * now we've found our entries, pull them out of the free space 2108 * now we've found our entries, pull them out of the free space
2104 * cache and put them into the cluster rbtree 2109 * cache and put them into the cluster rbtree
2105 *
2106 * The cluster includes an rbtree, but only uses the offset index
2107 * of each free space cache entry.
2108 */ 2110 */
2109 while (1) { 2111 do {
2112 int ret;
2113
2114 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2110 node = rb_next(&entry->offset_index); 2115 node = rb_next(&entry->offset_index);
2111 if (entry->bitmap && node) { 2116 if (entry->bitmap)
2112 entry = rb_entry(node, struct btrfs_free_space,
2113 offset_index);
2114 continue; 2117 continue;
2115 } else if (entry->bitmap && !node) {
2116 break;
2117 }
2118 2118
2119 rb_erase(&entry->offset_index, &block_group->free_space_offset); 2119 rb_erase(&entry->offset_index, &block_group->free_space_offset);
2120 ret = tree_insert_offset(&cluster->root, entry->offset, 2120 ret = tree_insert_offset(&cluster->root, entry->offset,
2121 &entry->offset_index, 0); 2121 &entry->offset_index, 0);
2122 BUG_ON(ret); 2122 BUG_ON(ret);
2123 } while (node && entry != last);
2123 2124
2124 if (!node || entry == last) 2125 cluster->max_size = max_extent;
2125 break;
2126 2126
2127 return 0;
2128}
2129
2130/*
2131 * This specifically looks for bitmaps that may work in the cluster, we assume
2132 * that we have already failed to find extents that will work.
2133 */
2134static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2135 struct btrfs_free_cluster *cluster,
2136 u64 offset, u64 bytes, u64 min_bytes)
2137{
2138 struct btrfs_free_space *entry;
2139 struct rb_node *node;
2140 int ret = -ENOSPC;
2141
2142 if (block_group->total_bitmaps == 0)
2143 return -ENOSPC;
2144
2145 entry = tree_search_offset(block_group,
2146 offset_to_bitmap(block_group, offset),
2147 0, 1);
2148 if (!entry)
2149 return -ENOSPC;
2150
2151 node = &entry->offset_index;
2152 do {
2127 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2153 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2154 node = rb_next(&entry->offset_index);
2155 if (!entry->bitmap)
2156 continue;
2157 if (entry->bytes < min_bytes)
2158 continue;
2159 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2160 bytes, min_bytes);
2161 } while (ret && node);
2162
2163 return ret;
2164}
2165
2166/*
2167 * here we try to find a cluster of blocks in a block group. The goal
2168 * is to find at least bytes free and up to empty_size + bytes free.
2169 * We might not find them all in one contiguous area.
2170 *
2171 * returns zero and sets up cluster if things worked out, otherwise
2172 * it returns -enospc
2173 */
2174int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2175 struct btrfs_root *root,
2176 struct btrfs_block_group_cache *block_group,
2177 struct btrfs_free_cluster *cluster,
2178 u64 offset, u64 bytes, u64 empty_size)
2179{
2180 u64 min_bytes;
2181 int ret;
2182
2183 /* for metadata, allow allocates with more holes */
2184 if (btrfs_test_opt(root, SSD_SPREAD)) {
2185 min_bytes = bytes + empty_size;
2186 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2187 /*
2188 * we want to do larger allocations when we are
2189 * flushing out the delayed refs, it helps prevent
2190 * making more work as we go along.
2191 */
2192 if (trans->transaction->delayed_refs.flushing)
2193 min_bytes = max(bytes, (bytes + empty_size) >> 1);
2194 else
2195 min_bytes = max(bytes, (bytes + empty_size) >> 4);
2196 } else
2197 min_bytes = max(bytes, (bytes + empty_size) >> 2);
2198
2199 spin_lock(&block_group->tree_lock);
2200
2201 /*
2202 * If we know we don't have enough space to make a cluster don't even
2203 * bother doing all the work to try and find one.
2204 */
2205 if (block_group->free_space < min_bytes) {
2206 spin_unlock(&block_group->tree_lock);
2207 return -ENOSPC;
2128 } 2208 }
2129 2209
2130 cluster->max_size = max_extent; 2210 spin_lock(&cluster->lock);
2131got_it: 2211
2132 ret = 0; 2212 /* someone already found a cluster, hooray */
2133 atomic_inc(&block_group->count); 2213 if (cluster->block_group) {
2134 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 2214 ret = 0;
2135 cluster->block_group = block_group; 2215 goto out;
2216 }
2217
2218 ret = setup_cluster_no_bitmap(block_group, cluster, offset, bytes,
2219 min_bytes);
2220 if (ret)
2221 ret = setup_cluster_bitmap(block_group, cluster, offset,
2222 bytes, min_bytes);
2223
2224 if (!ret) {
2225 atomic_inc(&block_group->count);
2226 list_add_tail(&cluster->block_group_list,
2227 &block_group->cluster_list);
2228 cluster->block_group = block_group;
2229 }
2136out: 2230out:
2137 spin_unlock(&cluster->lock); 2231 spin_unlock(&cluster->lock);
2138 spin_unlock(&block_group->tree_lock); 2232 spin_unlock(&block_group->tree_lock);
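The refactored entry point above now derives min_bytes once, bails out early when the group cannot possibly satisfy it, and only then tries extents followed by bitmaps. The min_bytes policy is plain arithmetic; a standalone sketch, with the mount-option tests reduced to boolean parameters:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

    /* SSD_SPREAD wants the whole request in one window, metadata
     * tolerates more holes (fewer while flushing delayed refs),
     * data sits in between. */
    static uint64_t cluster_min_bytes(uint64_t bytes, uint64_t empty_size,
                                      int ssd_spread, int is_metadata,
                                      int refs_flushing)
    {
            if (ssd_spread)
                    return bytes + empty_size;
            if (is_metadata)
                    return max_u64(bytes, (bytes + empty_size) >>
                                          (refs_flushing ? 1 : 4));
            return max_u64(bytes, (bytes + empty_size) >> 2);
    }

    int main(void)
    {
            uint64_t b = 256 << 10, e = 1 << 20;

            printf("%llu %llu %llu\n",
                   (unsigned long long)cluster_min_bytes(b, e, 1, 0, 0),
                   (unsigned long long)cluster_min_bytes(b, e, 0, 1, 1),
                   (unsigned long long)cluster_min_bytes(b, e, 0, 0, 0));
            return 0;
    }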
@@ -2149,8 +2243,99 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2149 spin_lock_init(&cluster->refill_lock); 2243 spin_lock_init(&cluster->refill_lock);
2150 cluster->root = RB_ROOT; 2244 cluster->root = RB_ROOT;
2151 cluster->max_size = 0; 2245 cluster->max_size = 0;
2152 cluster->points_to_bitmap = false;
2153 INIT_LIST_HEAD(&cluster->block_group_list); 2246 INIT_LIST_HEAD(&cluster->block_group_list);
2154 cluster->block_group = NULL; 2247 cluster->block_group = NULL;
2155} 2248}
2156 2249
2250int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2251 u64 *trimmed, u64 start, u64 end, u64 minlen)
2252{
2253 struct btrfs_free_space *entry = NULL;
2254 struct btrfs_fs_info *fs_info = block_group->fs_info;
2255 u64 bytes = 0;
2256 u64 actually_trimmed;
2257 int ret = 0;
2258
2259 *trimmed = 0;
2260
2261 while (start < end) {
2262 spin_lock(&block_group->tree_lock);
2263
2264 if (block_group->free_space < minlen) {
2265 spin_unlock(&block_group->tree_lock);
2266 break;
2267 }
2268
2269 entry = tree_search_offset(block_group, start, 0, 1);
2270 if (!entry)
2271 entry = tree_search_offset(block_group,
2272 offset_to_bitmap(block_group,
2273 start),
2274 1, 1);
2275
2276 if (!entry || entry->offset >= end) {
2277 spin_unlock(&block_group->tree_lock);
2278 break;
2279 }
2280
2281 if (entry->bitmap) {
2282 ret = search_bitmap(block_group, entry, &start, &bytes);
2283 if (!ret) {
2284 if (start >= end) {
2285 spin_unlock(&block_group->tree_lock);
2286 break;
2287 }
2288 bytes = min(bytes, end - start);
2289 bitmap_clear_bits(block_group, entry,
2290 start, bytes);
2291 if (entry->bytes == 0)
2292 free_bitmap(block_group, entry);
2293 } else {
2294 start = entry->offset + BITS_PER_BITMAP *
2295 block_group->sectorsize;
2296 spin_unlock(&block_group->tree_lock);
2297 ret = 0;
2298 continue;
2299 }
2300 } else {
2301 start = entry->offset;
2302 bytes = min(entry->bytes, end - start);
2303 unlink_free_space(block_group, entry);
2304 kfree(entry);
2305 }
2306
2307 spin_unlock(&block_group->tree_lock);
2308
2309 if (bytes >= minlen) {
2310 int update_ret;
2311 update_ret = btrfs_update_reserved_bytes(block_group,
2312 bytes, 1, 1);
2313
2314 ret = btrfs_error_discard_extent(fs_info->extent_root,
2315 start,
2316 bytes,
2317 &actually_trimmed);
2318
2319 btrfs_add_free_space(block_group,
2320 start, bytes);
2321 if (!update_ret)
2322 btrfs_update_reserved_bytes(block_group,
2323 bytes, 0, 1);
2324
2325 if (ret)
2326 break;
2327 *trimmed += actually_trimmed;
2328 }
2329 start += bytes;
2330 bytes = 0;
2331
2332 if (fatal_signal_pending(current)) {
2333 ret = -ERESTARTSYS;
2334 break;
2335 }
2336
2337 cond_resched();
2338 }
2339
2340 return ret;
2341}
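btrfs_trim_block_group() above walks the free space entries in offset order, discards every range of at least minlen, and re-adds the space once the discard completes. A condensed sketch of the walk, using a sorted array in place of the rbtree and a discard_range() placeholder for the actual discard path:

    #include <stdint.h>
    #include <stdio.h>

    struct range { uint64_t offset, bytes; };

    /* placeholder for the real discard; just reports what it would trim */
    static void discard_range(uint64_t start, uint64_t len)
    {
            printf("discard %llu+%llu\n", (unsigned long long)start,
                   (unsigned long long)len);
    }

    static void trim_ranges(const struct range *r, int nr,
                            uint64_t start, uint64_t end, uint64_t minlen)
    {
            for (int i = 0; i < nr && start < end; i++) {
                    uint64_t s, e, len;

                    if (r[i].offset + r[i].bytes <= start)
                            continue;       /* already behind the cursor */
                    if (r[i].offset >= end)
                            break;          /* past the trim window */

                    s = r[i].offset > start ? r[i].offset : start;
                    e = r[i].offset + r[i].bytes;
                    len = (e < end ? e : end) - s;

                    if (len >= minlen)
                            discard_range(s, len);
                    start = s + len;
            }
    }

    int main(void)
    {
            struct range free_space[] = { { 0, 4096 }, { 16384, 65536 } };

            trim_ranges(free_space, 2, 0, 1 << 20, 8192); /* skips the 4K run */
            return 0;
    }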
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index e49ca5c321b5..65c3b935289f 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -68,4 +68,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
68int btrfs_return_cluster_to_free_space( 68int btrfs_return_cluster_to_free_space(
69 struct btrfs_block_group_cache *block_group, 69 struct btrfs_block_group_cache *block_group,
70 struct btrfs_free_cluster *cluster); 70 struct btrfs_free_cluster *cluster);
71int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
72 u64 *trimmed, u64 start, u64 end, u64 minlen);
71#endif 73#endif
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index c56eb5909172..c05a08f4c411 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -30,7 +30,8 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
30 int slot; 30 int slot;
31 31
32 path = btrfs_alloc_path(); 32 path = btrfs_alloc_path();
33 BUG_ON(!path); 33 if (!path)
34 return -ENOMEM;
34 35
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID; 36 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1; 37 search_key.type = -1;
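The BUG_ON(!path) removal above follows a pattern applied throughout this series: allocation failures are propagated as -ENOMEM instead of crashing the machine. In miniature, with a hypothetical alloc_path() standing in for btrfs_alloc_path():

    #include <errno.h>
    #include <stdlib.h>

    struct path { int slots[8]; };

    /* hypothetical stand-in for btrfs_alloc_path() */
    static struct path *alloc_path(void)
    {
            return calloc(1, sizeof(struct path));
    }

    /* before: BUG_ON(!path); after: hand -ENOMEM back to the caller */
    static int find_highest_inode(void)
    {
            struct path *path = alloc_path();

            if (!path)
                    return -ENOMEM;

            /* ... the tree search would go here ... */

            free(path);
            return 0;
    }

    int main(void)
    {
            return find_highest_inode() ? 1 : 0;
    }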
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 512c3d1da083..fcd66b6a8086 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -50,6 +50,7 @@
50#include "tree-log.h" 50#include "tree-log.h"
51#include "compression.h" 51#include "compression.h"
52#include "locking.h" 52#include "locking.h"
53#include "free-space-cache.h"
53 54
54struct btrfs_iget_args { 55struct btrfs_iget_args {
55 u64 ino; 56 u64 ino;
@@ -70,6 +71,7 @@ static struct kmem_cache *btrfs_inode_cachep;
70struct kmem_cache *btrfs_trans_handle_cachep; 71struct kmem_cache *btrfs_trans_handle_cachep;
71struct kmem_cache *btrfs_transaction_cachep; 72struct kmem_cache *btrfs_transaction_cachep;
72struct kmem_cache *btrfs_path_cachep; 73struct kmem_cache *btrfs_path_cachep;
74struct kmem_cache *btrfs_free_space_cachep;
73 75
74#define S_SHIFT 12 76#define S_SHIFT 12
75static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { 77static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -82,7 +84,8 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
82 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, 84 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
83}; 85};
84 86
85static void btrfs_truncate(struct inode *inode); 87static int btrfs_setsize(struct inode *inode, loff_t newsize);
88static int btrfs_truncate(struct inode *inode);
86static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); 89static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
87static noinline int cow_file_range(struct inode *inode, 90static noinline int cow_file_range(struct inode *inode,
88 struct page *locked_page, 91 struct page *locked_page,
@@ -109,6 +112,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
109static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, 112static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root, struct inode *inode, 113 struct btrfs_root *root, struct inode *inode,
111 u64 start, size_t size, size_t compressed_size, 114 u64 start, size_t size, size_t compressed_size,
115 int compress_type,
112 struct page **compressed_pages) 116 struct page **compressed_pages)
113{ 117{
114 struct btrfs_key key; 118 struct btrfs_key key;
@@ -123,12 +127,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
123 size_t cur_size = size; 127 size_t cur_size = size;
124 size_t datasize; 128 size_t datasize;
125 unsigned long offset; 129 unsigned long offset;
126 int compress_type = BTRFS_COMPRESS_NONE;
127 130
128 if (compressed_size && compressed_pages) { 131 if (compressed_size && compressed_pages)
129 compress_type = root->fs_info->compress_type;
130 cur_size = compressed_size; 132 cur_size = compressed_size;
131 }
132 133
133 path = btrfs_alloc_path(); 134 path = btrfs_alloc_path();
134 if (!path) 135 if (!path)
@@ -218,7 +219,7 @@ fail:
218static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, 219static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
219 struct btrfs_root *root, 220 struct btrfs_root *root,
220 struct inode *inode, u64 start, u64 end, 221 struct inode *inode, u64 start, u64 end,
221 size_t compressed_size, 222 size_t compressed_size, int compress_type,
222 struct page **compressed_pages) 223 struct page **compressed_pages)
223{ 224{
224 u64 isize = i_size_read(inode); 225 u64 isize = i_size_read(inode);
@@ -251,7 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
251 inline_len = min_t(u64, isize, actual_end); 252 inline_len = min_t(u64, isize, actual_end);
252 ret = insert_inline_extent(trans, root, inode, start, 253 ret = insert_inline_extent(trans, root, inode, start,
253 inline_len, compressed_size, 254 inline_len, compressed_size,
254 compressed_pages); 255 compress_type, compressed_pages);
255 BUG_ON(ret); 256 BUG_ON(ret);
256 btrfs_delalloc_release_metadata(inode, end + 1 - start); 257 btrfs_delalloc_release_metadata(inode, end + 1 - start);
257 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 258 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
@@ -288,6 +289,7 @@ static noinline int add_async_extent(struct async_cow *cow,
288 struct async_extent *async_extent; 289 struct async_extent *async_extent;
289 290
290 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); 291 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
292 BUG_ON(!async_extent);
291 async_extent->start = start; 293 async_extent->start = start;
292 async_extent->ram_size = ram_size; 294 async_extent->ram_size = ram_size;
293 async_extent->compressed_size = compressed_size; 295 async_extent->compressed_size = compressed_size;
@@ -382,9 +384,11 @@ again:
382 */ 384 */
383 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 385 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
384 (btrfs_test_opt(root, COMPRESS) || 386 (btrfs_test_opt(root, COMPRESS) ||
385 (BTRFS_I(inode)->force_compress))) { 387 (BTRFS_I(inode)->force_compress) ||
388 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
386 WARN_ON(pages); 389 WARN_ON(pages);
387 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 390 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
391 BUG_ON(!pages);
388 392
389 if (BTRFS_I(inode)->force_compress) 393 if (BTRFS_I(inode)->force_compress)
390 compress_type = BTRFS_I(inode)->force_compress; 394 compress_type = BTRFS_I(inode)->force_compress;
@@ -427,12 +431,13 @@ again:
427 * to make an uncompressed inline extent. 431 * to make an uncompressed inline extent.
428 */ 432 */
429 ret = cow_file_range_inline(trans, root, inode, 433 ret = cow_file_range_inline(trans, root, inode,
430 start, end, 0, NULL); 434 start, end, 0, 0, NULL);
431 } else { 435 } else {
432 /* try making a compressed inline extent */ 436 /* try making a compressed inline extent */
433 ret = cow_file_range_inline(trans, root, inode, 437 ret = cow_file_range_inline(trans, root, inode,
434 start, end, 438 start, end,
435 total_compressed, pages); 439 total_compressed,
440 compress_type, pages);
436 } 441 }
437 if (ret == 0) { 442 if (ret == 0) {
438 /* 443 /*
@@ -786,7 +791,7 @@ static noinline int cow_file_range(struct inode *inode,
786 if (start == 0) { 791 if (start == 0) {
787 /* lets try to make an inline extent */ 792 /* lets try to make an inline extent */
788 ret = cow_file_range_inline(trans, root, inode, 793 ret = cow_file_range_inline(trans, root, inode,
789 start, end, 0, NULL); 794 start, end, 0, 0, NULL);
790 if (ret == 0) { 795 if (ret == 0) {
791 extent_clear_unlock_delalloc(inode, 796 extent_clear_unlock_delalloc(inode,
792 &BTRFS_I(inode)->io_tree, 797 &BTRFS_I(inode)->io_tree,
@@ -1254,7 +1259,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1254 ret = run_delalloc_nocow(inode, locked_page, start, end, 1259 ret = run_delalloc_nocow(inode, locked_page, start, end,
1255 page_started, 0, nr_written); 1260 page_started, 0, nr_written);
1256 else if (!btrfs_test_opt(root, COMPRESS) && 1261 else if (!btrfs_test_opt(root, COMPRESS) &&
1257 !(BTRFS_I(inode)->force_compress)) 1262 !(BTRFS_I(inode)->force_compress) &&
1263 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
1258 ret = cow_file_range(inode, locked_page, start, end, 1264 ret = cow_file_range(inode, locked_page, start, end,
1259 page_started, nr_written, 1); 1265 page_started, nr_written, 1);
1260 else 1266 else
@@ -1461,8 +1467,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1461 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1467 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1462 return btrfs_submit_compressed_read(inode, bio, 1468 return btrfs_submit_compressed_read(inode, bio,
1463 mirror_num, bio_flags); 1469 mirror_num, bio_flags);
1464 } else if (!skip_sum) 1470 } else if (!skip_sum) {
1465 btrfs_lookup_bio_sums(root, inode, bio, NULL); 1471 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1472 if (ret)
1473 return ret;
1474 }
1466 goto mapit; 1475 goto mapit;
1467 } else if (!skip_sum) { 1476 } else if (!skip_sum) {
1468 /* csum items have already been cloned */ 1477 /* csum items have already been cloned */
@@ -1761,9 +1770,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1761 add_pending_csums(trans, inode, ordered_extent->file_offset, 1770 add_pending_csums(trans, inode, ordered_extent->file_offset,
1762 &ordered_extent->list); 1771 &ordered_extent->list);
1763 1772
1764 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1773 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1765 ret = btrfs_update_inode(trans, root, inode); 1774 if (!ret) {
1766 BUG_ON(ret); 1775 ret = btrfs_update_inode(trans, root, inode);
1776 BUG_ON(ret);
1777 }
1778 ret = 0;
1767out: 1779out:
1768 if (nolock) { 1780 if (nolock) {
1769 if (trans) 1781 if (trans)
@@ -1785,6 +1797,8 @@ out:
1785static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1797static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1786 struct extent_state *state, int uptodate) 1798 struct extent_state *state, int uptodate)
1787{ 1799{
1800 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
1801
1788 ClearPagePrivate2(page); 1802 ClearPagePrivate2(page);
1789 return btrfs_finish_ordered_io(page->mapping->host, start, end); 1803 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1790} 1804}
@@ -1895,10 +1909,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1895 else 1909 else
1896 rw = READ; 1910 rw = READ;
1897 1911
1898 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1912 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1899 failrec->last_mirror, 1913 failrec->last_mirror,
1900 failrec->bio_flags, 0); 1914 failrec->bio_flags, 0);
1901 return 0; 1915 return ret;
1902} 1916}
1903 1917
1904/* 1918/*
@@ -2210,8 +2224,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2210 insert = 1; 2224 insert = 1;
2211#endif 2225#endif
2212 insert = 1; 2226 insert = 1;
2213 } else {
2214 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2215 } 2227 }
2216 2228
2217 if (!BTRFS_I(inode)->orphan_meta_reserved) { 2229 if (!BTRFS_I(inode)->orphan_meta_reserved) {
@@ -2282,7 +2294,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2282 * this cleans up any orphans that may be left on the list from the last use 2294 * this cleans up any orphans that may be left on the list from the last use
2283 * of this root. 2295 * of this root.
2284 */ 2296 */
2285void btrfs_orphan_cleanup(struct btrfs_root *root) 2297int btrfs_orphan_cleanup(struct btrfs_root *root)
2286{ 2298{
2287 struct btrfs_path *path; 2299 struct btrfs_path *path;
2288 struct extent_buffer *leaf; 2300 struct extent_buffer *leaf;
@@ -2292,10 +2304,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2292 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2304 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2293 2305
2294 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2306 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2295 return; 2307 return 0;
2296 2308
2297 path = btrfs_alloc_path(); 2309 path = btrfs_alloc_path();
2298 BUG_ON(!path); 2310 if (!path) {
2311 ret = -ENOMEM;
2312 goto out;
2313 }
2299 path->reada = -1; 2314 path->reada = -1;
2300 2315
2301 key.objectid = BTRFS_ORPHAN_OBJECTID; 2316 key.objectid = BTRFS_ORPHAN_OBJECTID;
@@ -2304,18 +2319,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2304 2319
2305 while (1) { 2320 while (1) {
2306 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2321 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2307 if (ret < 0) { 2322 if (ret < 0)
2308 printk(KERN_ERR "Error searching slot for orphan: %d" 2323 goto out;
2309 "\n", ret);
2310 break;
2311 }
2312 2324
2313 /* 2325 /*
2314 * if ret == 0 means we found what we were searching for, which 2326 * if ret == 0 means we found what we were searching for, which
2315 * is weird, but possible, so only screw with path if we didnt 2327 * is weird, but possible, so only screw with path if we didn't
2316 * find the key and see if we have stuff that matches 2328 * find the key and see if we have stuff that matches
2317 */ 2329 */
2318 if (ret > 0) { 2330 if (ret > 0) {
2331 ret = 0;
2319 if (path->slots[0] == 0) 2332 if (path->slots[0] == 0)
2320 break; 2333 break;
2321 path->slots[0]--; 2334 path->slots[0]--;
@@ -2343,7 +2356,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2343 found_key.type = BTRFS_INODE_ITEM_KEY; 2356 found_key.type = BTRFS_INODE_ITEM_KEY;
2344 found_key.offset = 0; 2357 found_key.offset = 0;
2345 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2358 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2346 BUG_ON(IS_ERR(inode)); 2359 if (IS_ERR(inode)) {
2360 ret = PTR_ERR(inode);
2361 goto out;
2362 }
2347 2363
2348 /* 2364 /*
2349 * add this inode to the orphan list so btrfs_orphan_del does 2365 * add this inode to the orphan list so btrfs_orphan_del does
@@ -2361,7 +2377,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2361 */ 2377 */
2362 if (is_bad_inode(inode)) { 2378 if (is_bad_inode(inode)) {
2363 trans = btrfs_start_transaction(root, 0); 2379 trans = btrfs_start_transaction(root, 0);
2364 BUG_ON(IS_ERR(trans)); 2380 if (IS_ERR(trans)) {
2381 ret = PTR_ERR(trans);
2382 goto out;
2383 }
2365 btrfs_orphan_del(trans, inode); 2384 btrfs_orphan_del(trans, inode);
2366 btrfs_end_transaction(trans, root); 2385 btrfs_end_transaction(trans, root);
2367 iput(inode); 2386 iput(inode);
@@ -2370,17 +2389,22 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2370 2389
2371 /* if we have links, this was a truncate, lets do that */ 2390 /* if we have links, this was a truncate, lets do that */
2372 if (inode->i_nlink) { 2391 if (inode->i_nlink) {
2392 if (!S_ISREG(inode->i_mode)) {
2393 WARN_ON(1);
2394 iput(inode);
2395 continue;
2396 }
2373 nr_truncate++; 2397 nr_truncate++;
2374 btrfs_truncate(inode); 2398 ret = btrfs_truncate(inode);
2375 } else { 2399 } else {
2376 nr_unlink++; 2400 nr_unlink++;
2377 } 2401 }
2378 2402
2379 /* this will do delete_inode and everything for us */ 2403 /* this will do delete_inode and everything for us */
2380 iput(inode); 2404 iput(inode);
2405 if (ret)
2406 goto out;
2381 } 2407 }
2382 btrfs_free_path(path);
2383
2384 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2408 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2385 2409
2386 if (root->orphan_block_rsv) 2410 if (root->orphan_block_rsv)
@@ -2389,14 +2413,20 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2389 2413
2390 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2414 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2391 trans = btrfs_join_transaction(root, 1); 2415 trans = btrfs_join_transaction(root, 1);
2392 BUG_ON(IS_ERR(trans)); 2416 if (!IS_ERR(trans))
2393 btrfs_end_transaction(trans, root); 2417 btrfs_end_transaction(trans, root);
2394 } 2418 }
2395 2419
2396 if (nr_unlink) 2420 if (nr_unlink)
2397 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2421 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2398 if (nr_truncate) 2422 if (nr_truncate)
2399 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2423 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2424
2425out:
2426 if (ret)
2427 printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
2428 btrfs_free_path(path);
2429 return ret;
2400} 2430}
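
The hunk above turns btrfs_orphan_cleanup() from a void function full of BUG_ON() calls into one that propagates errors through a single out: label, freeing the path and logging exactly once. A minimal userspace sketch of that single-exit pattern (all names and error values below are illustrative, not the btrfs API):

	#include <stdio.h>
	#include <stdlib.h>

	/* Illustrative stand-ins for the kernel helpers, not the btrfs API. */
	static int search_slot(void) { return 0; }
	static int load_inode(void)  { return -5; /* pretend -EIO */ }

	static int orphan_cleanup(void)
	{
		void *path = malloc(64);   /* plays the role of btrfs_alloc_path() */
		int ret;

		if (!path)
			return -12;        /* -ENOMEM */

		ret = search_slot();
		if (ret < 0)
			goto out;          /* no printk at every failure site... */

		ret = load_inode();
	out:
		if (ret)
			fprintf(stderr, "could not do orphan cleanup %d\n", ret);
		free(path);                /* ...and exactly one cleanup path */
		return ret;
	}

	int main(void) { return orphan_cleanup() ? 1 : 0; }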
2401 2431
2402/* 2432/*
@@ -2563,6 +2593,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2563 struct btrfs_inode_item *item, 2593 struct btrfs_inode_item *item,
2564 struct inode *inode) 2594 struct inode *inode)
2565{ 2595{
2596 if (!leaf->map_token)
2597 map_private_extent_buffer(leaf, (unsigned long)item,
2598 sizeof(struct btrfs_inode_item),
2599 &leaf->map_token, &leaf->kaddr,
2600 &leaf->map_start, &leaf->map_len,
2601 KM_USER1);
2602
2566 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2603 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2567 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2604 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2568 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2605 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2591,6 +2628,11 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2591 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2628 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2592 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2629 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2593 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); 2630 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
2631
2632 if (leaf->map_token) {
2633 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2634 leaf->map_token = NULL;
2635 }
2594} 2636}
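
fill_inode_item() now maps the whole btrfs_inode_item once via map_private_extent_buffer() before the run of btrfs_set_inode_*() setters and unmaps it at the end, instead of paying a map/unmap per field. A rough standalone model of the cache-the-mapping idea (map_once/unmap are stand-ins, not kernel functions):

	#include <stdint.h>

	struct item { uint32_t uid, gid; uint64_t size; };

	/* 'tok' plays the role of leaf->map_token: map on first use only. */
	static void *map_once(void *page, void **tok)
	{
		if (!*tok)
			*tok = page;       /* kernel: kmap + remember the token */
		return *tok;
	}

	static void unmap(void **tok)
	{
		*tok = NULL;               /* kernel: unmap_extent_buffer() */
	}

	static void fill_item(void *page, const struct item *src)
	{
		void *tok = NULL;
		struct item *it = map_once(page, &tok);

		it->uid = src->uid;        /* every store reuses the one mapping */
		it->gid = src->gid;
		it->size = src->size;

		unmap(&tok);
	}

	int main(void)
	{
		struct item backing, src = { 1000, 1000, 4096 };
		fill_item(&backing, &src);
		return backing.size == 4096 ? 0 : 1;
	}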
2595 2637
2596/* 2638/*
@@ -2635,10 +2677,10 @@ failed:
2635 * recovery code. It removes a link in a directory with a given name, and 2677 * recovery code. It removes a link in a directory with a given name, and
2636 * also drops the back refs in the inode to the directory 2678 * also drops the back refs in the inode to the directory
2637 */ 2679 */
2638int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2680static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2639 struct btrfs_root *root, 2681 struct btrfs_root *root,
2640 struct inode *dir, struct inode *inode, 2682 struct inode *dir, struct inode *inode,
2641 const char *name, int name_len) 2683 const char *name, int name_len)
2642{ 2684{
2643 struct btrfs_path *path; 2685 struct btrfs_path *path;
2644 int ret = 0; 2686 int ret = 0;
@@ -2710,12 +2752,25 @@ err:
2710 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2752 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2711 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2753 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2712 btrfs_update_inode(trans, root, dir); 2754 btrfs_update_inode(trans, root, dir);
2713 btrfs_drop_nlink(inode);
2714 ret = btrfs_update_inode(trans, root, inode);
2715out: 2755out:
2716 return ret; 2756 return ret;
2717} 2757}
2718 2758
2759int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2760 struct btrfs_root *root,
2761 struct inode *dir, struct inode *inode,
2762 const char *name, int name_len)
2763{
2764 int ret;
2765 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
2766 if (!ret) {
2767 btrfs_drop_nlink(inode);
2768 ret = btrfs_update_inode(trans, root, inode);
2769 }
2770 return ret;
2771}
2772
2773
2719/* helper to check if there is any shared block in the path */ 2774/* helper to check if there is any shared block in the path */
2720static int check_path_shared(struct btrfs_root *root, 2775static int check_path_shared(struct btrfs_root *root,
2721 struct btrfs_path *path) 2776 struct btrfs_path *path)
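
Splitting out __btrfs_unlink_inode() lets rename remove the old directory entry without touching the link count, since a rename moves a name rather than destroying a link; only the btrfs_unlink_inode() wrapper decrements nlink. A sketch of that wrapper-vs-core split (signatures simplified and hypothetical):

	/* Removes dir entry + back refs; deliberately no nlink side effects. */
	static int unlink_core(const char *name)
	{
		(void)name;
		return 0;
	}

	int unlink_entry(const char *name, int *nlink)
	{
		int ret = unlink_core(name);
		if (!ret)
			(*nlink)--;        /* only a real unlink drops a link */
		return ret;
	}

	int rename_unlink_old_name(const char *name)
	{
		/* rename re-adds the name elsewhere, so nlink stays put */
		return unlink_core(name);
	}

	int main(void)
	{
		int nlink = 2;
		unlink_entry("old", &nlink);            /* nlink -> 1 */
		rename_unlink_old_name("moved");        /* nlink unchanged */
		return nlink == 1 ? 0 : 1;
	}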
@@ -3537,7 +3592,13 @@ out:
3537 return ret; 3592 return ret;
3538} 3593}
3539 3594
3540int btrfs_cont_expand(struct inode *inode, loff_t size) 3595/*
3596 * This function puts in dummy file extents for the area we're creating a hole
3597 * for. So if we are truncating this file to a larger size, we need to insert
3598 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
3599 * the range between oldsize and size.
3600 */
3601int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3541{ 3602{
3542 struct btrfs_trans_handle *trans; 3603 struct btrfs_trans_handle *trans;
3543 struct btrfs_root *root = BTRFS_I(inode)->root; 3604 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3545,7 +3606,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3545 struct extent_map *em = NULL; 3606 struct extent_map *em = NULL;
3546 struct extent_state *cached_state = NULL; 3607 struct extent_state *cached_state = NULL;
3547 u64 mask = root->sectorsize - 1; 3608 u64 mask = root->sectorsize - 1;
3548 u64 hole_start = (inode->i_size + mask) & ~mask; 3609 u64 hole_start = (oldsize + mask) & ~mask;
3549 u64 block_end = (size + mask) & ~mask; 3610 u64 block_end = (size + mask) & ~mask;
3550 u64 last_byte; 3611 u64 last_byte;
3551 u64 cur_offset; 3612 u64 cur_offset;
@@ -3590,13 +3651,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3590 err = btrfs_drop_extents(trans, inode, cur_offset, 3651 err = btrfs_drop_extents(trans, inode, cur_offset,
3591 cur_offset + hole_size, 3652 cur_offset + hole_size,
3592 &hint_byte, 1); 3653 &hint_byte, 1);
3593 BUG_ON(err); 3654 if (err)
3655 break;
3594 3656
3595 err = btrfs_insert_file_extent(trans, root, 3657 err = btrfs_insert_file_extent(trans, root,
3596 inode->i_ino, cur_offset, 0, 3658 inode->i_ino, cur_offset, 0,
3597 0, hole_size, 0, hole_size, 3659 0, hole_size, 0, hole_size,
3598 0, 0, 0); 3660 0, 0, 0);
3599 BUG_ON(err); 3661 if (err)
3662 break;
3600 3663
3601 btrfs_drop_extent_cache(inode, hole_start, 3664 btrfs_drop_extent_cache(inode, hole_start,
3602 last_byte - 1, 0); 3665 last_byte - 1, 0);
@@ -3616,81 +3679,41 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3616 return err; 3679 return err;
3617} 3680}
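
btrfs_cont_expand() now rounds the old size, not the current i_size, up to a sector boundary using (oldsize + mask) & ~mask with mask = sectorsize - 1. A quick standalone check of that round-up arithmetic (sector size assumed 4096 for the example):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t sectorsize = 4096, mask = sectorsize - 1;
		uint64_t oldsize = 5000, size = 20000;

		uint64_t hole_start = (oldsize + mask) & ~mask; /* round up */
		uint64_t block_end  = (size + mask) & ~mask;

		assert(hole_start == 8192);   /* 5000 rounds up to 2 sectors */
		assert(block_end == 20480);   /* 20000 rounds up to 5 sectors */
		return 0;
	}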
3618 3681
3619static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) 3682static int btrfs_setsize(struct inode *inode, loff_t newsize)
3620{ 3683{
3621 struct btrfs_root *root = BTRFS_I(inode)->root; 3684 loff_t oldsize = i_size_read(inode);
3622 struct btrfs_trans_handle *trans;
3623 unsigned long nr;
3624 int ret; 3685 int ret;
3625 3686
3626 if (attr->ia_size == inode->i_size) 3687 if (newsize == oldsize)
3627 return 0; 3688 return 0;
3628 3689
3629 if (attr->ia_size > inode->i_size) { 3690 if (newsize > oldsize) {
3630 unsigned long limit; 3691 i_size_write(inode, newsize);
3631 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 3692 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3632 if (attr->ia_size > inode->i_sb->s_maxbytes) 3693 truncate_pagecache(inode, oldsize, newsize);
3633 return -EFBIG; 3694 ret = btrfs_cont_expand(inode, oldsize, newsize);
3634 if (limit != RLIM_INFINITY && attr->ia_size > limit) {
3635 send_sig(SIGXFSZ, current, 0);
3636 return -EFBIG;
3637 }
3638 }
3639
3640 trans = btrfs_start_transaction(root, 5);
3641 if (IS_ERR(trans))
3642 return PTR_ERR(trans);
3643
3644 btrfs_set_trans_block_group(trans, inode);
3645
3646 ret = btrfs_orphan_add(trans, inode);
3647 BUG_ON(ret);
3648
3649 nr = trans->blocks_used;
3650 btrfs_end_transaction(trans, root);
3651 btrfs_btree_balance_dirty(root, nr);
3652
3653 if (attr->ia_size > inode->i_size) {
3654 ret = btrfs_cont_expand(inode, attr->ia_size);
3655 if (ret) { 3695 if (ret) {
3656 btrfs_truncate(inode); 3696 btrfs_setsize(inode, oldsize);
3657 return ret; 3697 return ret;
3658 } 3698 }
3659 3699
3660 i_size_write(inode, attr->ia_size); 3700 mark_inode_dirty(inode);
3661 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3701 } else {
3662 3702
3663 trans = btrfs_start_transaction(root, 0); 3703 /*
3664 BUG_ON(IS_ERR(trans)); 3704 * We're truncating a file that used to have good data down to
3665 btrfs_set_trans_block_group(trans, inode); 3705 * zero. Make sure it gets into the ordered flush list so that
3666 trans->block_rsv = root->orphan_block_rsv; 3706 * any new writes get down to disk quickly.
3667 BUG_ON(!trans->block_rsv); 3707 */
3708 if (newsize == 0)
3709 BTRFS_I(inode)->ordered_data_close = 1;
3668 3710
3669 ret = btrfs_update_inode(trans, root, inode); 3711 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3670 BUG_ON(ret); 3712 truncate_setsize(inode, newsize);
3671 if (inode->i_nlink > 0) { 3713 ret = btrfs_truncate(inode);
3672 ret = btrfs_orphan_del(trans, inode);
3673 BUG_ON(ret);
3674 }
3675 nr = trans->blocks_used;
3676 btrfs_end_transaction(trans, root);
3677 btrfs_btree_balance_dirty(root, nr);
3678 return 0;
3679 } 3714 }
3680 3715
3681 /* 3716 return ret;
3682 * We're truncating a file that used to have good data down to
3683 * zero. Make sure it gets into the ordered flush list so that
3684 * any new writes get down to disk quickly.
3685 */
3686 if (attr->ia_size == 0)
3687 BTRFS_I(inode)->ordered_data_close = 1;
3688
3689 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3690 ret = vmtruncate(inode, attr->ia_size);
3691 BUG_ON(ret);
3692
3693 return 0;
3694} 3717}
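
The new btrfs_setsize() splits the two directions: growing writes the new i_size first and then fills the hole, rolling i_size back if the expand fails; shrinking sets the size and runs the filesystem truncate. A compact sketch of that control flow (the helpers are placeholders, not the VFS calls):

	#include <assert.h>

	static int cont_expand(long oldsize, long newsize)
	{
		(void)oldsize; (void)newsize;  /* pretend the hole fill worked */
		return 0;
	}

	static int do_truncate(long newsize)
	{
		(void)newsize;                 /* pretend the truncate worked */
		return 0;
	}

	static int setsize(long *i_size, long newsize)
	{
		long oldsize = *i_size;
		int ret;

		if (newsize == oldsize)
			return 0;

		if (newsize > oldsize) {
			*i_size = newsize;             /* grow first... */
			ret = cont_expand(oldsize, newsize);
			if (ret)
				*i_size = oldsize;     /* ...roll back on failure */
			return ret;
		}

		*i_size = newsize;                     /* truncate_setsize() analogue */
		return do_truncate(newsize);
	}

	int main(void)
	{
		long i_size = 100;
		assert(setsize(&i_size, 200) == 0 && i_size == 200);  /* grow */
		assert(setsize(&i_size, 50) == 0 && i_size == 50);    /* shrink */
		return 0;
	}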
3695 3718
3696static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3719static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -3707,7 +3730,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3707 return err; 3730 return err;
3708 3731
3709 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3732 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3710 err = btrfs_setattr_size(inode, attr); 3733 err = btrfs_setsize(inode, attr->ia_size);
3711 if (err) 3734 if (err)
3712 return err; 3735 return err;
3713 } 3736 }
@@ -3730,6 +3753,8 @@ void btrfs_evict_inode(struct inode *inode)
3730 unsigned long nr; 3753 unsigned long nr;
3731 int ret; 3754 int ret;
3732 3755
3756 trace_btrfs_inode_evict(inode);
3757
3733 truncate_inode_pages(&inode->i_data, 0); 3758 truncate_inode_pages(&inode->i_data, 0);
3734 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3759 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3735 root == root->fs_info->tree_root)) 3760 root == root->fs_info->tree_root))
@@ -4072,7 +4097,6 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
4072 BTRFS_I(inode)->root = root; 4097 BTRFS_I(inode)->root = root;
4073 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 4098 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
4074 btrfs_read_locked_inode(inode); 4099 btrfs_read_locked_inode(inode);
4075
4076 inode_tree_add(inode); 4100 inode_tree_add(inode);
4077 unlock_new_inode(inode); 4101 unlock_new_inode(inode);
4078 if (new) 4102 if (new)
@@ -4147,8 +4171,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4147 if (!IS_ERR(inode) && root != sub_root) { 4171 if (!IS_ERR(inode) && root != sub_root) {
4148 down_read(&root->fs_info->cleanup_work_sem); 4172 down_read(&root->fs_info->cleanup_work_sem);
4149 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4173 if (!(inode->i_sb->s_flags & MS_RDONLY))
4150 btrfs_orphan_cleanup(sub_root); 4174 ret = btrfs_orphan_cleanup(sub_root);
4151 up_read(&root->fs_info->cleanup_work_sem); 4175 up_read(&root->fs_info->cleanup_work_sem);
4176 if (ret)
4177 inode = ERR_PTR(ret);
4152 } 4178 }
4153 4179
4154 return inode; 4180 return inode;
@@ -4196,10 +4222,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4196 struct btrfs_key found_key; 4222 struct btrfs_key found_key;
4197 struct btrfs_path *path; 4223 struct btrfs_path *path;
4198 int ret; 4224 int ret;
4199 u32 nritems;
4200 struct extent_buffer *leaf; 4225 struct extent_buffer *leaf;
4201 int slot; 4226 int slot;
4202 int advance;
4203 unsigned char d_type; 4227 unsigned char d_type;
4204 int over = 0; 4228 int over = 0;
4205 u32 di_cur; 4229 u32 di_cur;
@@ -4242,27 +4266,19 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4242 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4266 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4243 if (ret < 0) 4267 if (ret < 0)
4244 goto err; 4268 goto err;
4245 advance = 0;
4246 4269
4247 while (1) { 4270 while (1) {
4248 leaf = path->nodes[0]; 4271 leaf = path->nodes[0];
4249 nritems = btrfs_header_nritems(leaf);
4250 slot = path->slots[0]; 4272 slot = path->slots[0];
4251 if (advance || slot >= nritems) { 4273 if (slot >= btrfs_header_nritems(leaf)) {
4252 if (slot >= nritems - 1) { 4274 ret = btrfs_next_leaf(root, path);
4253 ret = btrfs_next_leaf(root, path); 4275 if (ret < 0)
4254 if (ret) 4276 goto err;
4255 break; 4277 else if (ret > 0)
4256 leaf = path->nodes[0]; 4278 break;
4257 nritems = btrfs_header_nritems(leaf); 4279 continue;
4258 slot = path->slots[0];
4259 } else {
4260 slot++;
4261 path->slots[0]++;
4262 }
4263 } 4280 }
4264 4281
4265 advance = 1;
4266 item = btrfs_item_nr(leaf, slot); 4282 item = btrfs_item_nr(leaf, slot);
4267 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4283 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4268 4284
@@ -4271,7 +4287,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4271 if (btrfs_key_type(&found_key) != key_type) 4287 if (btrfs_key_type(&found_key) != key_type)
4272 break; 4288 break;
4273 if (found_key.offset < filp->f_pos) 4289 if (found_key.offset < filp->f_pos)
4274 continue; 4290 goto next;
4275 4291
4276 filp->f_pos = found_key.offset; 4292 filp->f_pos = found_key.offset;
4277 4293
@@ -4282,6 +4298,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4282 while (di_cur < di_total) { 4298 while (di_cur < di_total) {
4283 struct btrfs_key location; 4299 struct btrfs_key location;
4284 4300
4301 if (verify_dir_item(root, leaf, di))
4302 break;
4303
4285 name_len = btrfs_dir_name_len(leaf, di); 4304 name_len = btrfs_dir_name_len(leaf, di);
4286 if (name_len <= sizeof(tmp_name)) { 4305 if (name_len <= sizeof(tmp_name)) {
4287 name_ptr = tmp_name; 4306 name_ptr = tmp_name;
@@ -4321,6 +4340,8 @@ skip:
4321 di_cur += di_len; 4340 di_cur += di_len;
4322 di = (struct btrfs_dir_item *)((char *)di + di_len); 4341 di = (struct btrfs_dir_item *)((char *)di + di_len);
4323 } 4342 }
4343next:
4344 path->slots[0]++;
4324 } 4345 }
4325 4346
4326 /* Reached end of directory/root. Bump pos past the last item. */ 4347 /* Reached end of directory/root. Bump pos past the last item. */
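
The readdir rework drops the advance flag in favor of the usual btree-walk shape: advance path->slots[0] in exactly one place at the bottom of the loop, and whenever the slot runs off the leaf, fetch the next leaf (negative means error, positive means no more leaves). A toy model of that iteration shape, with a 2-D array standing in for leaves:

	#include <stdio.h>

	#define LEAVES 3
	#define NRITEMS 4
	static int items[LEAVES][NRITEMS] = {{1,2,3,4},{5,6,7,8},{9,10,11,12}};

	int main(void)
	{
		int leaf = 0, slot = 0;

		while (1) {
			if (slot >= NRITEMS) {          /* ran off this leaf */
				if (++leaf >= LEAVES)   /* "next_leaf" > 0: done */
					break;
				slot = 0;
				continue;
			}
			printf("%d ", items[leaf][slot]);
			slot++;                         /* single advance point */
		}
		putchar('\n');
		return 0;
	}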
@@ -4513,12 +4534,17 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4513 BUG_ON(!path); 4534 BUG_ON(!path);
4514 4535
4515 inode = new_inode(root->fs_info->sb); 4536 inode = new_inode(root->fs_info->sb);
4516 if (!inode) 4537 if (!inode) {
4538 btrfs_free_path(path);
4517 return ERR_PTR(-ENOMEM); 4539 return ERR_PTR(-ENOMEM);
4540 }
4518 4541
4519 if (dir) { 4542 if (dir) {
4543 trace_btrfs_inode_request(dir);
4544
4520 ret = btrfs_set_inode_index(dir, index); 4545 ret = btrfs_set_inode_index(dir, index);
4521 if (ret) { 4546 if (ret) {
4547 btrfs_free_path(path);
4522 iput(inode); 4548 iput(inode);
4523 return ERR_PTR(ret); 4549 return ERR_PTR(ret);
4524 } 4550 }
@@ -4585,12 +4611,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4585 if ((mode & S_IFREG)) { 4611 if ((mode & S_IFREG)) {
4586 if (btrfs_test_opt(root, NODATASUM)) 4612 if (btrfs_test_opt(root, NODATASUM))
4587 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4613 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4588 if (btrfs_test_opt(root, NODATACOW)) 4614 if (btrfs_test_opt(root, NODATACOW) ||
4615 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
4589 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4616 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4590 } 4617 }
4591 4618
4592 insert_inode_hash(inode); 4619 insert_inode_hash(inode);
4593 inode_tree_add(inode); 4620 inode_tree_add(inode);
4621
4622 trace_btrfs_inode_new(inode);
4623
4594 return inode; 4624 return inode;
4595fail: 4625fail:
4596 if (dir) 4626 if (dir)
@@ -4809,10 +4839,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4809 4839
4810 /* do not allow sys_link's with other subvols of the same device */ 4840 /* do not allow sys_link's with other subvols of the same device */
4811 if (root->objectid != BTRFS_I(inode)->root->objectid) 4841 if (root->objectid != BTRFS_I(inode)->root->objectid)
4812 return -EPERM; 4842 return -EXDEV;
4813 4843
4814 btrfs_inc_nlink(inode); 4844 if (inode->i_nlink == ~0U)
4815 inode->i_ctime = CURRENT_TIME; 4845 return -EMLINK;
4816 4846
4817 err = btrfs_set_inode_index(dir, &index); 4847 err = btrfs_set_inode_index(dir, &index);
4818 if (err) 4848 if (err)
@@ -4829,6 +4859,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4829 goto fail; 4859 goto fail;
4830 } 4860 }
4831 4861
4862 btrfs_inc_nlink(inode);
4863 inode->i_ctime = CURRENT_TIME;
4864
4832 btrfs_set_trans_block_group(trans, dir); 4865 btrfs_set_trans_block_group(trans, dir);
4833 ihold(inode); 4866 ihold(inode);
4834 4867
@@ -5198,7 +5231,7 @@ again:
5198 btrfs_mark_buffer_dirty(leaf); 5231 btrfs_mark_buffer_dirty(leaf);
5199 } 5232 }
5200 set_extent_uptodate(io_tree, em->start, 5233 set_extent_uptodate(io_tree, em->start,
5201 extent_map_end(em) - 1, GFP_NOFS); 5234 extent_map_end(em) - 1, NULL, GFP_NOFS);
5202 goto insert; 5235 goto insert;
5203 } else { 5236 } else {
5204 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5237 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
@@ -5265,6 +5298,9 @@ insert:
5265 } 5298 }
5266 write_unlock(&em_tree->lock); 5299 write_unlock(&em_tree->lock);
5267out: 5300out:
5301
5302 trace_btrfs_get_extent(root, em);
5303
5268 if (path) 5304 if (path)
5269 btrfs_free_path(path); 5305 btrfs_free_path(path);
5270 if (trans) { 5306 if (trans) {
@@ -5402,17 +5438,30 @@ out:
5402} 5438}
5403 5439
5404static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5440static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5441 struct extent_map *em,
5405 u64 start, u64 len) 5442 u64 start, u64 len)
5406{ 5443{
5407 struct btrfs_root *root = BTRFS_I(inode)->root; 5444 struct btrfs_root *root = BTRFS_I(inode)->root;
5408 struct btrfs_trans_handle *trans; 5445 struct btrfs_trans_handle *trans;
5409 struct extent_map *em;
5410 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5446 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5411 struct btrfs_key ins; 5447 struct btrfs_key ins;
5412 u64 alloc_hint; 5448 u64 alloc_hint;
5413 int ret; 5449 int ret;
5450 bool insert = false;
5414 5451
5415 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5452 /*
5453 * OK: if the extent map we looked up is a hole and covers the exact
5454 * range we want, there is no reason to allocate a new one. However,
5455 * if it is not right, we need to free this one and drop the cache
5456 * for our range.
5457 */
5458 if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5459 em->len != len) {
5460 free_extent_map(em);
5461 em = NULL;
5462 insert = true;
5463 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5464 }
5416 5465
5417 trans = btrfs_join_transaction(root, 0); 5466 trans = btrfs_join_transaction(root, 0);
5418 if (IS_ERR(trans)) 5467 if (IS_ERR(trans))
@@ -5428,10 +5477,12 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5428 goto out; 5477 goto out;
5429 } 5478 }
5430 5479
5431 em = alloc_extent_map(GFP_NOFS);
5432 if (!em) { 5480 if (!em) {
5433 em = ERR_PTR(-ENOMEM); 5481 em = alloc_extent_map(GFP_NOFS);
5434 goto out; 5482 if (!em) {
5483 em = ERR_PTR(-ENOMEM);
5484 goto out;
5485 }
5435 } 5486 }
5436 5487
5437 em->start = start; 5488 em->start = start;
@@ -5441,9 +5492,15 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5441 em->block_start = ins.objectid; 5492 em->block_start = ins.objectid;
5442 em->block_len = ins.offset; 5493 em->block_len = ins.offset;
5443 em->bdev = root->fs_info->fs_devices->latest_bdev; 5494 em->bdev = root->fs_info->fs_devices->latest_bdev;
5495
5496 /*
5497 * We need to do this because if we're using the original em we searched
5498 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5499 */
5500 em->flags = 0;
5444 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5501 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5445 5502
5446 while (1) { 5503 while (insert) {
5447 write_lock(&em_tree->lock); 5504 write_lock(&em_tree->lock);
5448 ret = add_extent_mapping(em_tree, em); 5505 ret = add_extent_mapping(em_tree, em);
5449 write_unlock(&em_tree->lock); 5506 write_unlock(&em_tree->lock);
@@ -5661,8 +5718,7 @@ must_cow:
5661 * it above 5718 * it above
5662 */ 5719 */
5663 len = bh_result->b_size; 5720 len = bh_result->b_size;
5664 free_extent_map(em); 5721 em = btrfs_new_extent_direct(inode, em, start, len);
5665 em = btrfs_new_extent_direct(inode, start, len);
5666 if (IS_ERR(em)) 5722 if (IS_ERR(em))
5667 return PTR_ERR(em); 5723 return PTR_ERR(em);
5668 len = min(len, em->len - (start - em->start)); 5724 len = min(len, em->len - (start - em->start));
@@ -5748,6 +5804,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
5748 5804
5749 kfree(dip->csums); 5805 kfree(dip->csums);
5750 kfree(dip); 5806 kfree(dip);
5807
5808 /* If we had a csum failure make sure to clear the uptodate flag */
5809 if (err)
5810 clear_bit(BIO_UPTODATE, &bio->bi_flags);
5751 dio_end_io(bio, err); 5811 dio_end_io(bio, err);
5752} 5812}
5753 5813
@@ -5821,8 +5881,10 @@ again:
5821 } 5881 }
5822 5882
5823 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5883 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5824 btrfs_ordered_update_i_size(inode, 0, ordered); 5884 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5825 btrfs_update_inode(trans, root, inode); 5885 if (!ret)
5886 btrfs_update_inode(trans, root, inode);
5887 ret = 0;
5826out_unlock: 5888out_unlock:
5827 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5889 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5828 ordered->file_offset + ordered->len - 1, 5890 ordered->file_offset + ordered->len - 1,
@@ -5849,6 +5911,10 @@ out_done:
5849 5911
5850 kfree(dip->csums); 5912 kfree(dip->csums);
5851 kfree(dip); 5913 kfree(dip);
5914
5915 /* If we had an error make sure to clear the uptodate flag */
5916 if (err)
5917 clear_bit(BIO_UPTODATE, &bio->bi_flags);
5852 dio_end_io(bio, err); 5918 dio_end_io(bio, err);
5853} 5919}
5854 5920
@@ -5904,7 +5970,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
5904 5970
5905static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 5971static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5906 int rw, u64 file_offset, int skip_sum, 5972 int rw, u64 file_offset, int skip_sum,
5907 u32 *csums) 5973 u32 *csums, int async_submit)
5908{ 5974{
5909 int write = rw & REQ_WRITE; 5975 int write = rw & REQ_WRITE;
5910 struct btrfs_root *root = BTRFS_I(inode)->root; 5976 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -5915,18 +5981,33 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5915 if (ret) 5981 if (ret)
5916 goto err; 5982 goto err;
5917 5983
5918 if (write && !skip_sum) { 5984 if (skip_sum)
5985 goto map;
5986
5987 if (write && async_submit) {
5919 ret = btrfs_wq_submit_bio(root->fs_info, 5988 ret = btrfs_wq_submit_bio(root->fs_info,
5920 inode, rw, bio, 0, 0, 5989 inode, rw, bio, 0, 0,
5921 file_offset, 5990 file_offset,
5922 __btrfs_submit_bio_start_direct_io, 5991 __btrfs_submit_bio_start_direct_io,
5923 __btrfs_submit_bio_done); 5992 __btrfs_submit_bio_done);
5924 goto err; 5993 goto err;
5925 } else if (!skip_sum) 5994 } else if (write) {
5926 btrfs_lookup_bio_sums_dio(root, inode, bio, 5995 /*
5996 * If we aren't doing async submit, calculate the csum of the
5997 * bio now.
5998 */
5999 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
6000 if (ret)
6001 goto err;
6002 } else if (!skip_sum) {
6003 ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
5927 file_offset, csums); 6004 file_offset, csums);
6005 if (ret)
6006 goto err;
6007 }
5928 6008
5929 ret = btrfs_map_bio(root, rw, bio, 0, 1); 6009map:
6010 ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
5930err: 6011err:
5931 bio_put(bio); 6012 bio_put(bio);
5932 return ret; 6013 return ret;
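
__btrfs_submit_dio_bio() now picks a checksum strategy up front: skip straight to mapping when csums are disabled, hand async writes to the worker queue, csum synchronous writes inline, and look up stored csums for reads. A sketch of that four-way dispatch (the enum and helpers are invented for illustration):

	enum sum_mode { SKIP_SUM, ASYNC_WRITE, SYNC_WRITE, READ_LOOKUP };

	static int queue_async(void)  { return 0; }  /* worker csums and submits */
	static int csum_now(void)     { return 0; }  /* csum inline, then map    */
	static int lookup_csums(void) { return 0; }  /* fetch stored read csums  */
	static int map_bio(void)      { return 0; }  /* btrfs_map_bio() analogue */

	static int submit_dio(enum sum_mode mode)
	{
		int ret = 0;

		switch (mode) {
		case SKIP_SUM:
			break;                      /* the 'goto map' case */
		case ASYNC_WRITE:
			return queue_async();       /* submitted later by worker */
		case SYNC_WRITE:
			ret = csum_now();
			break;
		case READ_LOOKUP:
			ret = lookup_csums();
			break;
		}
		return ret ? ret : map_bio();
	}

	int main(void) { return submit_dio(READ_LOOKUP); }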
@@ -5948,13 +6029,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5948 int nr_pages = 0; 6029 int nr_pages = 0;
5949 u32 *csums = dip->csums; 6030 u32 *csums = dip->csums;
5950 int ret = 0; 6031 int ret = 0;
5951 6032 int async_submit = 0;
5952 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 6033 int write = rw & REQ_WRITE;
5953 if (!bio)
5954 return -ENOMEM;
5955 bio->bi_private = dip;
5956 bio->bi_end_io = btrfs_end_dio_bio;
5957 atomic_inc(&dip->pending_bios);
5958 6034
5959 map_length = orig_bio->bi_size; 6035 map_length = orig_bio->bi_size;
5960 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6036 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
@@ -5964,6 +6040,19 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5964 return -EIO; 6040 return -EIO;
5965 } 6041 }
5966 6042
6043 if (map_length >= orig_bio->bi_size) {
6044 bio = orig_bio;
6045 goto submit;
6046 }
6047
6048 async_submit = 1;
6049 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6050 if (!bio)
6051 return -ENOMEM;
6052 bio->bi_private = dip;
6053 bio->bi_end_io = btrfs_end_dio_bio;
6054 atomic_inc(&dip->pending_bios);
6055
5967 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 6056 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
5968 if (unlikely(map_length < submit_len + bvec->bv_len || 6057 if (unlikely(map_length < submit_len + bvec->bv_len ||
5969 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 6058 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
@@ -5977,14 +6066,15 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5977 atomic_inc(&dip->pending_bios); 6066 atomic_inc(&dip->pending_bios);
5978 ret = __btrfs_submit_dio_bio(bio, inode, rw, 6067 ret = __btrfs_submit_dio_bio(bio, inode, rw,
5979 file_offset, skip_sum, 6068 file_offset, skip_sum,
5980 csums); 6069 csums, async_submit);
5981 if (ret) { 6070 if (ret) {
5982 bio_put(bio); 6071 bio_put(bio);
5983 atomic_dec(&dip->pending_bios); 6072 atomic_dec(&dip->pending_bios);
5984 goto out_err; 6073 goto out_err;
5985 } 6074 }
5986 6075
5987 if (!skip_sum) 6076 /* Writes use the ordered csums */
6077 if (!write && !skip_sum)
5988 csums = csums + nr_pages; 6078 csums = csums + nr_pages;
5989 start_sector += submit_len >> 9; 6079 start_sector += submit_len >> 9;
5990 file_offset += submit_len; 6080 file_offset += submit_len;
@@ -6013,8 +6103,9 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6013 } 6103 }
6014 } 6104 }
6015 6105
6106submit:
6016 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 6107 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
6017 csums); 6108 csums, async_submit);
6018 if (!ret) 6109 if (!ret)
6019 return 0; 6110 return 0;
6020 6111
@@ -6052,7 +6143,8 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
6052 } 6143 }
6053 dip->csums = NULL; 6144 dip->csums = NULL;
6054 6145
6055 if (!skip_sum) { 6146 /* Writes use the ordered csum stuff, so we don't need dip->csums */
6147 if (!write && !skip_sum) {
6056 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); 6148 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
6057 if (!dip->csums) { 6149 if (!dip->csums) {
6058 kfree(dip); 6150 kfree(dip);
@@ -6108,6 +6200,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
6108 unsigned long nr_segs) 6200 unsigned long nr_segs)
6109{ 6201{
6110 int seg; 6202 int seg;
6203 int i;
6111 size_t size; 6204 size_t size;
6112 unsigned long addr; 6205 unsigned long addr;
6113 unsigned blocksize_mask = root->sectorsize - 1; 6206 unsigned blocksize_mask = root->sectorsize - 1;
@@ -6122,8 +6215,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
6122 addr = (unsigned long)iov[seg].iov_base; 6215 addr = (unsigned long)iov[seg].iov_base;
6123 size = iov[seg].iov_len; 6216 size = iov[seg].iov_len;
6124 end += size; 6217 end += size;
6125 if ((addr & blocksize_mask) || (size & blocksize_mask)) 6218 if ((addr & blocksize_mask) || (size & blocksize_mask))
6126 goto out; 6219 goto out;
6220
6221 /* If this is a write, we don't need any further checks */
6222 if (rw & WRITE)
6223 continue;
6224
6225 /*
6226 * Check to make sure we don't have duplicate iov_base's in this
6227 * iovec; if we do, return -EINVAL, since duplicates would otherwise
6228 * show up as csum errors when reading back.
6229 */
6230 for (i = seg + 1; i < nr_segs; i++) {
6231 if (iov[seg].iov_base == iov[i].iov_base)
6232 goto out;
6233 }
6127 } 6234 }
6128 retval = 0; 6235 retval = 0;
6129out: 6236out:
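
For reads, check_direct_IO() now rejects iovecs where two segments share an iov_base, since reading into aliased buffers would later look like csum failures; the quadratic scan is acceptable because nr_segs is small. The same check as a standalone function (struct name shortened for the sketch):

	#include <stddef.h>

	struct iovec_lite { void *iov_base; size_t iov_len; };

	/* Returns nonzero if any two segments alias the same base address. */
	static int has_duplicate_base(const struct iovec_lite *iov,
				      unsigned long nr_segs)
	{
		unsigned long seg, i;

		for (seg = 0; seg < nr_segs; seg++)
			for (i = seg + 1; i < nr_segs; i++)
				if (iov[seg].iov_base == iov[i].iov_base)
					return 1;
		return 0;
	}

	int main(void)
	{
		char a, b;
		struct iovec_lite iov[3] = { { &a, 1 }, { &b, 1 }, { &a, 1 } };
		return has_duplicate_base(iov, 3) ? 0 : 1;  /* aliased: caught */
	}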
@@ -6474,28 +6581,42 @@ out:
6474 return ret; 6581 return ret;
6475} 6582}
6476 6583
6477static void btrfs_truncate(struct inode *inode) 6584static int btrfs_truncate(struct inode *inode)
6478{ 6585{
6479 struct btrfs_root *root = BTRFS_I(inode)->root; 6586 struct btrfs_root *root = BTRFS_I(inode)->root;
6480 int ret; 6587 int ret;
6588 int err = 0;
6481 struct btrfs_trans_handle *trans; 6589 struct btrfs_trans_handle *trans;
6482 unsigned long nr; 6590 unsigned long nr;
6483 u64 mask = root->sectorsize - 1; 6591 u64 mask = root->sectorsize - 1;
6484 6592
6485 if (!S_ISREG(inode->i_mode)) {
6486 WARN_ON(1);
6487 return;
6488 }
6489
6490 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6593 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6491 if (ret) 6594 if (ret)
6492 return; 6595 return ret;
6493 6596
6494 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6597 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
6495 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6598 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
6496 6599
6600 trans = btrfs_start_transaction(root, 5);
6601 if (IS_ERR(trans))
6602 return PTR_ERR(trans);
6603
6604 btrfs_set_trans_block_group(trans, inode);
6605
6606 ret = btrfs_orphan_add(trans, inode);
6607 if (ret) {
6608 btrfs_end_transaction(trans, root);
6609 return ret;
6610 }
6611
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /* Now start a transaction for the truncate */
6497 trans = btrfs_start_transaction(root, 0); 6617 trans = btrfs_start_transaction(root, 0);
6498 BUG_ON(IS_ERR(trans)); 6618 if (IS_ERR(trans))
6619 return PTR_ERR(trans);
6499 btrfs_set_trans_block_group(trans, inode); 6620 btrfs_set_trans_block_group(trans, inode);
6500 trans->block_rsv = root->orphan_block_rsv; 6621 trans->block_rsv = root->orphan_block_rsv;
6501 6622
@@ -6522,29 +6643,38 @@ static void btrfs_truncate(struct inode *inode)
6522 while (1) { 6643 while (1) {
6523 if (!trans) { 6644 if (!trans) {
6524 trans = btrfs_start_transaction(root, 0); 6645 trans = btrfs_start_transaction(root, 0);
6525 BUG_ON(IS_ERR(trans)); 6646 if (IS_ERR(trans))
6647 return PTR_ERR(trans);
6526 btrfs_set_trans_block_group(trans, inode); 6648 btrfs_set_trans_block_group(trans, inode);
6527 trans->block_rsv = root->orphan_block_rsv; 6649 trans->block_rsv = root->orphan_block_rsv;
6528 } 6650 }
6529 6651
6530 ret = btrfs_block_rsv_check(trans, root, 6652 ret = btrfs_block_rsv_check(trans, root,
6531 root->orphan_block_rsv, 0, 5); 6653 root->orphan_block_rsv, 0, 5);
6532 if (ret) { 6654 if (ret == -EAGAIN) {
6533 BUG_ON(ret != -EAGAIN);
6534 ret = btrfs_commit_transaction(trans, root); 6655 ret = btrfs_commit_transaction(trans, root);
6535 BUG_ON(ret); 6656 if (ret)
6657 return ret;
6536 trans = NULL; 6658 trans = NULL;
6537 continue; 6659 continue;
6660 } else if (ret) {
6661 err = ret;
6662 break;
6538 } 6663 }
6539 6664
6540 ret = btrfs_truncate_inode_items(trans, root, inode, 6665 ret = btrfs_truncate_inode_items(trans, root, inode,
6541 inode->i_size, 6666 inode->i_size,
6542 BTRFS_EXTENT_DATA_KEY); 6667 BTRFS_EXTENT_DATA_KEY);
6543 if (ret != -EAGAIN) 6668 if (ret != -EAGAIN) {
6669 err = ret;
6544 break; 6670 break;
6671 }
6545 6672
6546 ret = btrfs_update_inode(trans, root, inode); 6673 ret = btrfs_update_inode(trans, root, inode);
6547 BUG_ON(ret); 6674 if (ret) {
6675 err = ret;
6676 break;
6677 }
6548 6678
6549 nr = trans->blocks_used; 6679 nr = trans->blocks_used;
6550 btrfs_end_transaction(trans, root); 6680 btrfs_end_transaction(trans, root);
@@ -6554,16 +6684,27 @@ static void btrfs_truncate(struct inode *inode)
6554 6684
6555 if (ret == 0 && inode->i_nlink > 0) { 6685 if (ret == 0 && inode->i_nlink > 0) {
6556 ret = btrfs_orphan_del(trans, inode); 6686 ret = btrfs_orphan_del(trans, inode);
6557 BUG_ON(ret); 6687 if (ret)
6688 err = ret;
6689 } else if (ret && inode->i_nlink > 0) {
6690 /*
6691 * Failed to do the truncate, remove us from the in memory
6692 * orphan list.
6693 */
6694 ret = btrfs_orphan_del(NULL, inode);
6558 } 6695 }
6559 6696
6560 ret = btrfs_update_inode(trans, root, inode); 6697 ret = btrfs_update_inode(trans, root, inode);
6561 BUG_ON(ret); 6698 if (ret && !err)
6699 err = ret;
6562 6700
6563 nr = trans->blocks_used; 6701 nr = trans->blocks_used;
6564 ret = btrfs_end_transaction_throttle(trans, root); 6702 ret = btrfs_end_transaction_throttle(trans, root);
6565 BUG_ON(ret); 6703 if (ret && !err)
6704 err = ret;
6566 btrfs_btree_balance_dirty(root, nr); 6705 btrfs_btree_balance_dirty(root, nr);
6706
6707 return err;
6567} 6708}
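
The truncate loop above treats -EAGAIN from the reservation check as "commit the transaction to flush space, then retry", while any other failure records the first error in err and breaks out. A self-contained model of that retry/first-error-wins pattern (the helpers simulate, not call, the btrfs ones):

	#include <errno.h>

	static int attempts;
	static int rsv_check(void)   { return ++attempts < 3 ? -EAGAIN : 0; }
	static int commit(void)      { return 0; }   /* flushes reserved space */
	static int trunc_items(void) { return 0; }   /* 0 == finished here */

	static int truncate_loop(void)
	{
		int ret, err = 0;

		while (1) {
			ret = rsv_check();
			if (ret == -EAGAIN) {
				ret = commit();      /* make space, then retry */
				if (ret)
					return ret;
				continue;
			} else if (ret) {
				err = ret;           /* first error wins */
				break;
			}

			ret = trunc_items();
			if (ret != -EAGAIN) {        /* done, or a real error */
				err = ret;
				break;
			}
		}
		return err;
	}

	int main(void) { return truncate_loop() ? 1 : 0; }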
6568 6709
6569/* 6710/*
@@ -6630,9 +6771,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6630 ei->index_cnt = (u64)-1; 6771 ei->index_cnt = (u64)-1;
6631 ei->last_unlink_trans = 0; 6772 ei->last_unlink_trans = 0;
6632 6773
6633 spin_lock_init(&ei->accounting_lock);
6634 atomic_set(&ei->outstanding_extents, 0); 6774 atomic_set(&ei->outstanding_extents, 0);
6635 ei->reserved_extents = 0; 6775 atomic_set(&ei->reserved_extents, 0);
6636 6776
6637 ei->ordered_data_close = 0; 6777 ei->ordered_data_close = 0;
6638 ei->orphan_meta_reserved = 0; 6778 ei->orphan_meta_reserved = 0;
@@ -6668,7 +6808,7 @@ void btrfs_destroy_inode(struct inode *inode)
6668 WARN_ON(!list_empty(&inode->i_dentry)); 6808 WARN_ON(!list_empty(&inode->i_dentry));
6669 WARN_ON(inode->i_data.nrpages); 6809 WARN_ON(inode->i_data.nrpages);
6670 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); 6810 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6671 WARN_ON(BTRFS_I(inode)->reserved_extents); 6811 WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents));
6672 6812
6673 /* 6813 /*
6674 * This can happen where we create an inode, but somebody else also 6814 * This can happen where we create an inode, but somebody else also
@@ -6760,6 +6900,8 @@ void btrfs_destroy_cachep(void)
6760 kmem_cache_destroy(btrfs_transaction_cachep); 6900 kmem_cache_destroy(btrfs_transaction_cachep);
6761 if (btrfs_path_cachep) 6901 if (btrfs_path_cachep)
6762 kmem_cache_destroy(btrfs_path_cachep); 6902 kmem_cache_destroy(btrfs_path_cachep);
6903 if (btrfs_free_space_cachep)
6904 kmem_cache_destroy(btrfs_free_space_cachep);
6763} 6905}
6764 6906
6765int btrfs_init_cachep(void) 6907int btrfs_init_cachep(void)
@@ -6788,6 +6930,12 @@ int btrfs_init_cachep(void)
6788 if (!btrfs_path_cachep) 6930 if (!btrfs_path_cachep)
6789 goto fail; 6931 goto fail;
6790 6932
6933 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
6934 sizeof(struct btrfs_free_space), 0,
6935 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
6936 if (!btrfs_free_space_cachep)
6937 goto fail;
6938
6791 return 0; 6939 return 0;
6792fail: 6940fail:
6793 btrfs_destroy_cachep(); 6941 btrfs_destroy_cachep();
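
btrfs_free_space_cachep joins the existing caches and follows the usual kmem_cache lifecycle: created in btrfs_init_cachep(), destroyed behind a NULL check in btrfs_destroy_cachep(), with any mid-init failure unwinding through the destructor. A userspace analogue of that all-or-nothing setup (malloc/free stand in for cache create/destroy):

	#include <stdlib.h>

	/* malloc/free stand in for kmem_cache_create()/kmem_cache_destroy(). */
	static void *inode_pool, *path_pool, *free_space_pool;

	static void destroy_pools(void)
	{
		free(inode_pool);      /* free(NULL) is a no-op, mirroring the   */
		free(path_pool);       /* if (cachep) kmem_cache_destroy() tests */
		free(free_space_pool);
	}

	static int init_pools(void)
	{
		if (!(inode_pool = malloc(128)))
			goto fail;
		if (!(path_pool = malloc(64)))
			goto fail;
		if (!(free_space_pool = malloc(32)))
			goto fail;
		return 0;
	fail:
		destroy_pools();       /* unwind whatever was created so far */
		return -12;            /* -ENOMEM */
	}

	int main(void)
	{
		if (init_pools())
			return 1;
		destroy_pools();
		return 0;
	}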
@@ -6806,6 +6954,26 @@ static int btrfs_getattr(struct vfsmount *mnt,
6806 return 0; 6954 return 0;
6807} 6955}
6808 6956
6957/*
6958 * If a file is moved, it will inherit the COW and compression flags of the new
6959 * directory.
6960 */
6961static void fixup_inode_flags(struct inode *dir, struct inode *inode)
6962{
6963 struct btrfs_inode *b_dir = BTRFS_I(dir);
6964 struct btrfs_inode *b_inode = BTRFS_I(inode);
6965
6966 if (b_dir->flags & BTRFS_INODE_NODATACOW)
6967 b_inode->flags |= BTRFS_INODE_NODATACOW;
6968 else
6969 b_inode->flags &= ~BTRFS_INODE_NODATACOW;
6970
6971 if (b_dir->flags & BTRFS_INODE_COMPRESS)
6972 b_inode->flags |= BTRFS_INODE_COMPRESS;
6973 else
6974 b_inode->flags &= ~BTRFS_INODE_COMPRESS;
6975}
6976
6809static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 6977static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6810 struct inode *new_dir, struct dentry *new_dentry) 6978 struct inode *new_dir, struct dentry *new_dentry)
6811{ 6979{
@@ -6854,8 +7022,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6854 * should cover the worst case number of items we'll modify. 7022 * should cover the worst case number of items we'll modify.
6855 */ 7023 */
6856 trans = btrfs_start_transaction(root, 20); 7024 trans = btrfs_start_transaction(root, 20);
6857 if (IS_ERR(trans)) 7025 if (IS_ERR(trans)) {
6858 return PTR_ERR(trans); 7026 ret = PTR_ERR(trans);
7027 goto out_notrans;
7028 }
6859 7029
6860 btrfs_set_trans_block_group(trans, new_dir); 7030 btrfs_set_trans_block_group(trans, new_dir);
6861 7031
@@ -6908,11 +7078,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6908 old_dentry->d_name.name, 7078 old_dentry->d_name.name,
6909 old_dentry->d_name.len); 7079 old_dentry->d_name.len);
6910 } else { 7080 } else {
6911 btrfs_inc_nlink(old_dentry->d_inode); 7081 ret = __btrfs_unlink_inode(trans, root, old_dir,
6912 ret = btrfs_unlink_inode(trans, root, old_dir, 7082 old_dentry->d_inode,
6913 old_dentry->d_inode, 7083 old_dentry->d_name.name,
6914 old_dentry->d_name.name, 7084 old_dentry->d_name.len);
6915 old_dentry->d_name.len); 7085 if (!ret)
7086 ret = btrfs_update_inode(trans, root, old_inode);
6916 } 7087 }
6917 BUG_ON(ret); 7088 BUG_ON(ret);
6918 7089
@@ -6939,6 +7110,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6939 } 7110 }
6940 } 7111 }
6941 7112
7113 fixup_inode_flags(new_dir, old_inode);
7114
6942 ret = btrfs_add_link(trans, new_dir, old_inode, 7115 ret = btrfs_add_link(trans, new_dir, old_inode,
6943 new_dentry->d_name.name, 7116 new_dentry->d_name.name,
6944 new_dentry->d_name.len, 0, index); 7117 new_dentry->d_name.len, 0, index);
@@ -6952,7 +7125,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6952 } 7125 }
6953out_fail: 7126out_fail:
6954 btrfs_end_transaction_throttle(trans, root); 7127 btrfs_end_transaction_throttle(trans, root);
6955 7128out_notrans:
6956 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 7129 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
6957 up_read(&root->fs_info->subvol_sem); 7130 up_read(&root->fs_info->subvol_sem);
6958 7131
@@ -7340,7 +7513,6 @@ static const struct address_space_operations btrfs_aops = {
7340 .writepage = btrfs_writepage, 7513 .writepage = btrfs_writepage,
7341 .writepages = btrfs_writepages, 7514 .writepages = btrfs_writepages,
7342 .readpages = btrfs_readpages, 7515 .readpages = btrfs_readpages,
7343 .sync_page = block_sync_page,
7344 .direct_IO = btrfs_direct_IO, 7516 .direct_IO = btrfs_direct_IO,
7345 .invalidatepage = btrfs_invalidatepage, 7517 .invalidatepage = btrfs_invalidatepage,
7346 .releasepage = btrfs_releasepage, 7518 .releasepage = btrfs_releasepage,
@@ -7356,7 +7528,6 @@ static const struct address_space_operations btrfs_symlink_aops = {
7356}; 7528};
7357 7529
7358static const struct inode_operations btrfs_file_inode_operations = { 7530static const struct inode_operations btrfs_file_inode_operations = {
7359 .truncate = btrfs_truncate,
7360 .getattr = btrfs_getattr, 7531 .getattr = btrfs_getattr,
7361 .setattr = btrfs_setattr, 7532 .setattr = btrfs_setattr,
7362 .setxattr = btrfs_setxattr, 7533 .setxattr = btrfs_setxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5fdb2abc4fa7..ffb48d6c5433 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -40,6 +40,7 @@
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/blkdev.h>
43#include "compat.h" 44#include "compat.h"
44#include "ctree.h" 45#include "ctree.h"
45#include "disk-io.h" 46#include "disk-io.h"
@@ -138,6 +139,24 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
138 return 0; 139 return 0;
139} 140}
140 141
142static int check_flags(unsigned int flags)
143{
144 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
145 FS_NOATIME_FL | FS_NODUMP_FL | \
146 FS_SYNC_FL | FS_DIRSYNC_FL | \
147 FS_NOCOMP_FL | FS_COMPR_FL | \
148 FS_NOCOW_FL | FS_COW_FL))
149 return -EOPNOTSUPP;
150
151 if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
152 return -EINVAL;
153
154 if ((flags & FS_NOCOW_FL) && (flags & FS_COW_FL))
155 return -EINVAL;
156
157 return 0;
158}
159
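
check_flags() validates in two steps: mask against the supported set, then reject the two mutually exclusive pairs (compress vs. nocompress, cow vs. nocow). The same shape as a standalone checker; the bit values below are invented for the sketch:

	#include <assert.h>

	#define FL_COMPR	0x01
	#define FL_NOCOMP	0x02
	#define FL_COW		0x04
	#define FL_NOCOW	0x08
	#define FL_SUPPORTED	(FL_COMPR | FL_NOCOMP | FL_COW | FL_NOCOW)

	static int check_flags_lite(unsigned int flags)
	{
		if (flags & ~FL_SUPPORTED)
			return -95;    /* -EOPNOTSUPP: unknown bit set */
		if ((flags & FL_NOCOMP) && (flags & FL_COMPR))
			return -22;    /* -EINVAL: contradictory pair */
		if ((flags & FL_NOCOW) && (flags & FL_COW))
			return -22;
		return 0;
	}

	int main(void)
	{
		assert(check_flags_lite(FL_COMPR | FL_COW) == 0);
		assert(check_flags_lite(FL_COMPR | FL_NOCOMP) == -22);
		assert(check_flags_lite(0x100) == -95);
		return 0;
	}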
141static int btrfs_ioctl_setflags(struct file *file, void __user *arg) 160static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
142{ 161{
143 struct inode *inode = file->f_path.dentry->d_inode; 162 struct inode *inode = file->f_path.dentry->d_inode;
@@ -153,12 +172,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
153 if (copy_from_user(&flags, arg, sizeof(flags))) 172 if (copy_from_user(&flags, arg, sizeof(flags)))
154 return -EFAULT; 173 return -EFAULT;
155 174
156 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ 175 ret = check_flags(flags);
157 FS_NOATIME_FL | FS_NODUMP_FL | \ 176 if (ret)
158 FS_SYNC_FL | FS_DIRSYNC_FL)) 177 return ret;
159 return -EOPNOTSUPP;
160 178
161 if (!is_owner_or_cap(inode)) 179 if (!inode_owner_or_capable(inode))
162 return -EACCES; 180 return -EACCES;
163 181
164 mutex_lock(&inode->i_mutex); 182 mutex_lock(&inode->i_mutex);
@@ -201,6 +219,22 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
201 else 219 else
202 ip->flags &= ~BTRFS_INODE_DIRSYNC; 220 ip->flags &= ~BTRFS_INODE_DIRSYNC;
203 221
222 /*
223 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
224 * flag may be changed automatically if the compression code won't make
225 * things smaller.
226 */
227 if (flags & FS_NOCOMP_FL) {
228 ip->flags &= ~BTRFS_INODE_COMPRESS;
229 ip->flags |= BTRFS_INODE_NOCOMPRESS;
230 } else if (flags & FS_COMPR_FL) {
231 ip->flags |= BTRFS_INODE_COMPRESS;
232 ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
233 }
234 if (flags & FS_NOCOW_FL)
235 ip->flags |= BTRFS_INODE_NODATACOW;
236 else if (flags & FS_COW_FL)
237 ip->flags &= ~BTRFS_INODE_NODATACOW;
204 238
205 trans = btrfs_join_transaction(root, 1); 239 trans = btrfs_join_transaction(root, 1);
206 BUG_ON(IS_ERR(trans)); 240 BUG_ON(IS_ERR(trans));
@@ -213,9 +247,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
213 btrfs_end_transaction(trans, root); 247 btrfs_end_transaction(trans, root);
214 248
215 mnt_drop_write(file->f_path.mnt); 249 mnt_drop_write(file->f_path.mnt);
250
251 ret = 0;
216 out_unlock: 252 out_unlock:
217 mutex_unlock(&inode->i_mutex); 253 mutex_unlock(&inode->i_mutex);
218 return 0; 254 return ret;
219} 255}
220 256
221static int btrfs_ioctl_getversion(struct file *file, int __user *arg) 257static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
@@ -225,6 +261,49 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
225 return put_user(inode->i_generation, arg); 261 return put_user(inode->i_generation, arg);
226} 262}
227 263
264static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
265{
266 struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
267 struct btrfs_fs_info *fs_info = root->fs_info;
268 struct btrfs_device *device;
269 struct request_queue *q;
270 struct fstrim_range range;
271 u64 minlen = ULLONG_MAX;
272 u64 num_devices = 0;
273 int ret;
274
275 if (!capable(CAP_SYS_ADMIN))
276 return -EPERM;
277
278 mutex_lock(&fs_info->fs_devices->device_list_mutex);
279 list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
280 if (!device->bdev)
281 continue;
282 q = bdev_get_queue(device->bdev);
283 if (blk_queue_discard(q)) {
284 num_devices++;
285 minlen = min((u64)q->limits.discard_granularity,
286 minlen);
287 }
288 }
289 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
290 if (!num_devices)
291 return -EOPNOTSUPP;
292
293 if (copy_from_user(&range, arg, sizeof(range)))
294 return -EFAULT;
295
296 range.minlen = max(range.minlen, minlen);
297 ret = btrfs_trim_fs(root, &range);
298 if (ret < 0)
299 return ret;
300
301 if (copy_to_user(arg, &range, sizeof(range)))
302 return -EFAULT;
303
304 return 0;
305}
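
btrfs_ioctl_fitrim() takes the smallest discard granularity across all discard-capable devices and clamps the user's range.minlen up to at least that value, bailing with -EOPNOTSUPP if no device can discard. A standalone version of that clamp (device list hardcoded for illustration):

	#include <assert.h>
	#include <stdint.h>

	/* Hardcoded device list; 0 means the device cannot discard. */
	static const uint64_t granularity[] = { 0, 4096, 512 };

	static uint64_t clamp_minlen(uint64_t user_minlen)
	{
		uint64_t minlen = UINT64_MAX;
		unsigned int i, num_devices = 0;

		for (i = 0; i < sizeof(granularity) / sizeof(granularity[0]); i++) {
			if (!granularity[i])
				continue;          /* skip non-discard device */
			num_devices++;
			if (granularity[i] < minlen)
				minlen = granularity[i];
		}
		if (!num_devices)
			return 0;                  /* caller: -EOPNOTSUPP */

		return user_minlen > minlen ? user_minlen : minlen;
	}

	int main(void)
	{
		assert(clamp_minlen(0) == 512);     /* raised to the floor */
		assert(clamp_minlen(8192) == 8192); /* already large enough */
		return 0;
	}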
306
228static noinline int create_subvol(struct btrfs_root *root, 307static noinline int create_subvol(struct btrfs_root *root,
229 struct dentry *dentry, 308 struct dentry *dentry,
230 char *name, int namelen, 309 char *name, int namelen,
@@ -294,6 +373,10 @@ static noinline int create_subvol(struct btrfs_root *root,
294 inode_item->nbytes = cpu_to_le64(root->leafsize); 373 inode_item->nbytes = cpu_to_le64(root->leafsize);
295 inode_item->mode = cpu_to_le32(S_IFDIR | 0755); 374 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
296 375
376 root_item.flags = 0;
377 root_item.byte_limit = 0;
378 inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT);
379
297 btrfs_set_root_bytenr(&root_item, leaf->start); 380 btrfs_set_root_bytenr(&root_item, leaf->start);
298 btrfs_set_root_generation(&root_item, trans->transid); 381 btrfs_set_root_generation(&root_item, trans->transid);
299 btrfs_set_root_level(&root_item, 0); 382 btrfs_set_root_level(&root_item, 0);
@@ -409,7 +492,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
409 if (ret) 492 if (ret)
410 goto fail; 493 goto fail;
411 494
412 btrfs_orphan_cleanup(pending_snapshot->snap); 495 ret = btrfs_orphan_cleanup(pending_snapshot->snap);
496 if (ret)
497 goto fail;
413 498
414 parent = dget_parent(dentry); 499 parent = dget_parent(dentry);
415 inode = btrfs_lookup_dentry(parent->d_inode, dentry); 500 inode = btrfs_lookup_dentry(parent->d_inode, dentry);
@@ -1077,7 +1162,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1077 if (flags & ~BTRFS_SUBVOL_RDONLY) 1162 if (flags & ~BTRFS_SUBVOL_RDONLY)
1078 return -EOPNOTSUPP; 1163 return -EOPNOTSUPP;
1079 1164
1080 if (!is_owner_or_cap(inode)) 1165 if (!inode_owner_or_capable(inode))
1081 return -EACCES; 1166 return -EACCES;
1082 1167
1083 down_write(&root->fs_info->subvol_sem); 1168 down_write(&root->fs_info->subvol_sem);
@@ -2202,7 +2287,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2202 struct btrfs_ioctl_space_info space; 2287 struct btrfs_ioctl_space_info space;
2203 struct btrfs_ioctl_space_info *dest; 2288 struct btrfs_ioctl_space_info *dest;
2204 struct btrfs_ioctl_space_info *dest_orig; 2289 struct btrfs_ioctl_space_info *dest_orig;
2205 struct btrfs_ioctl_space_info *user_dest; 2290 struct btrfs_ioctl_space_info __user *user_dest;
2206 struct btrfs_space_info *info; 2291 struct btrfs_space_info *info;
2207 u64 types[] = {BTRFS_BLOCK_GROUP_DATA, 2292 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2208 BTRFS_BLOCK_GROUP_SYSTEM, 2293 BTRFS_BLOCK_GROUP_SYSTEM,
@@ -2348,12 +2433,17 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp
2348 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; 2433 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2349 struct btrfs_trans_handle *trans; 2434 struct btrfs_trans_handle *trans;
2350 u64 transid; 2435 u64 transid;
2436 int ret;
2351 2437
2352 trans = btrfs_start_transaction(root, 0); 2438 trans = btrfs_start_transaction(root, 0);
2353 if (IS_ERR(trans)) 2439 if (IS_ERR(trans))
2354 return PTR_ERR(trans); 2440 return PTR_ERR(trans);
2355 transid = trans->transid; 2441 transid = trans->transid;
2356 btrfs_commit_transaction_async(trans, root, 0); 2442 ret = btrfs_commit_transaction_async(trans, root, 0);
2443 if (ret) {
2444 btrfs_end_transaction(trans, root);
2445 return ret;
2446 }
2357 2447
2358 if (argp) 2448 if (argp)
2359 if (copy_to_user(argp, &transid, sizeof(transid))) 2449 if (copy_to_user(argp, &transid, sizeof(transid)))
@@ -2388,6 +2478,8 @@ long btrfs_ioctl(struct file *file, unsigned int
2388 return btrfs_ioctl_setflags(file, argp); 2478 return btrfs_ioctl_setflags(file, argp);
2389 case FS_IOC_GETVERSION: 2479 case FS_IOC_GETVERSION:
2390 return btrfs_ioctl_getversion(file, argp); 2480 return btrfs_ioctl_getversion(file, argp);
2481 case FITRIM:
2482 return btrfs_ioctl_fitrim(file, argp);
2391 case BTRFS_IOC_SNAP_CREATE: 2483 case BTRFS_IOC_SNAP_CREATE:
2392 return btrfs_ioctl_snap_create(file, argp, 0); 2484 return btrfs_ioctl_snap_create(file, argp, 0);
2393 case BTRFS_IOC_SNAP_CREATE_V2: 2485 case BTRFS_IOC_SNAP_CREATE_V2:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 083a55477375..a1c940425307 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -202,6 +202,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
202 INIT_LIST_HEAD(&entry->list); 202 INIT_LIST_HEAD(&entry->list);
203 INIT_LIST_HEAD(&entry->root_extent_list); 203 INIT_LIST_HEAD(&entry->root_extent_list);
204 204
205 trace_btrfs_ordered_extent_add(inode, entry);
206
205 spin_lock(&tree->lock); 207 spin_lock(&tree->lock);
206 node = tree_insert(&tree->tree, file_offset, 208 node = tree_insert(&tree->tree, file_offset,
207 &entry->rb_node); 209 &entry->rb_node);
@@ -387,6 +389,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
387 struct list_head *cur; 389 struct list_head *cur;
388 struct btrfs_ordered_sum *sum; 390 struct btrfs_ordered_sum *sum;
389 391
392 trace_btrfs_ordered_extent_put(entry->inode, entry);
393
390 if (atomic_dec_and_test(&entry->refs)) { 394 if (atomic_dec_and_test(&entry->refs)) {
391 while (!list_empty(&entry->list)) { 395 while (!list_empty(&entry->list)) {
392 cur = entry->list.next; 396 cur = entry->list.next;
@@ -420,6 +424,8 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
420 spin_lock(&root->fs_info->ordered_extent_lock); 424 spin_lock(&root->fs_info->ordered_extent_lock);
421 list_del_init(&entry->root_extent_list); 425 list_del_init(&entry->root_extent_list);
422 426
427 trace_btrfs_ordered_extent_remove(inode, entry);
428
423 /* 429 /*
424 * we have no more ordered extents for this inode and 430 * we have no more ordered extents for this inode and
425 * no dirty pages. We can safely remove it from the 431 * no dirty pages. We can safely remove it from the
@@ -585,6 +591,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
585 u64 start = entry->file_offset; 591 u64 start = entry->file_offset;
586 u64 end = start + entry->len - 1; 592 u64 end = start + entry->len - 1;
587 593
594 trace_btrfs_ordered_extent_start(inode, entry);
595
588 /* 596 /*
589 * pages in the range can be dirty, clean or writeback. We 597 * pages in the range can be dirty, clean or writeback. We
590 * start IO on any dirty ones so the wait doesn't stall waiting 598 * start IO on any dirty ones so the wait doesn't stall waiting
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 31ade5802ae8..199a80134312 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1724,6 +1724,7 @@ again:
1724 1724
1725 eb = read_tree_block(dest, old_bytenr, blocksize, 1725 eb = read_tree_block(dest, old_bytenr, blocksize,
1726 old_ptr_gen); 1726 old_ptr_gen);
1727 BUG_ON(!eb);
1727 btrfs_tree_lock(eb); 1728 btrfs_tree_lock(eb);
1728 if (cow) { 1729 if (cow) {
1729 ret = btrfs_cow_block(trans, dest, eb, parent, 1730 ret = btrfs_cow_block(trans, dest, eb, parent,
@@ -2345,7 +2346,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
2345 root = next->root; 2346 root = next->root;
2346 BUG_ON(!root); 2347 BUG_ON(!root);
2347 2348
2348 /* no other choice for non-refernce counted tree */ 2349 /* no other choice for non-reference counted tree */
2349 if (!root->ref_cows) 2350 if (!root->ref_cows)
2350 return root; 2351 return root;
2351 2352
@@ -2513,6 +2514,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2513 blocksize = btrfs_level_size(root, node->level); 2514 blocksize = btrfs_level_size(root, node->level);
2514 generation = btrfs_node_ptr_generation(upper->eb, slot); 2515 generation = btrfs_node_ptr_generation(upper->eb, slot);
2515 eb = read_tree_block(root, bytenr, blocksize, generation); 2516 eb = read_tree_block(root, bytenr, blocksize, generation);
2517 if (!eb) {
2518 err = -EIO;
2519 goto next;
2520 }
2516 btrfs_tree_lock(eb); 2521 btrfs_tree_lock(eb);
2517 btrfs_set_lock_blocking(eb); 2522 btrfs_set_lock_blocking(eb);
2518 2523
@@ -2670,6 +2675,7 @@ static int get_tree_block_key(struct reloc_control *rc,
2670 BUG_ON(block->key_ready); 2675 BUG_ON(block->key_ready);
2671 eb = read_tree_block(rc->extent_root, block->bytenr, 2676 eb = read_tree_block(rc->extent_root, block->bytenr,
2672 block->key.objectid, block->key.offset); 2677 block->key.objectid, block->key.offset);
2678 BUG_ON(!eb);
2673 WARN_ON(btrfs_header_level(eb) != block->level); 2679 WARN_ON(btrfs_header_level(eb) != block->level);
2674 if (block->level == 0) 2680 if (block->level == 0)
2675 btrfs_item_key_to_cpu(eb, &block->key, 0); 2681 btrfs_item_key_to_cpu(eb, &block->key, 0);
@@ -4209,7 +4215,7 @@ out:
4209 if (IS_ERR(fs_root)) 4215 if (IS_ERR(fs_root))
4210 err = PTR_ERR(fs_root); 4216 err = PTR_ERR(fs_root);
4211 else 4217 else
4212 btrfs_orphan_cleanup(fs_root); 4218 err = btrfs_orphan_cleanup(fs_root);
4213 } 4219 }
4214 return err; 4220 return err;
4215} 4221}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 6a1086e83ffc..6928bff62daa 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -88,7 +88,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
88 search_key.offset = (u64)-1; 88 search_key.offset = (u64)-1;
89 89
90 path = btrfs_alloc_path(); 90 path = btrfs_alloc_path();
91 BUG_ON(!path); 91 if (!path)
92 return -ENOMEM;
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 93 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0) 94 if (ret < 0)
94 goto out; 95 goto out;
@@ -332,7 +333,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
332 struct extent_buffer *leaf; 333 struct extent_buffer *leaf;
333 334
334 path = btrfs_alloc_path(); 335 path = btrfs_alloc_path();
335 BUG_ON(!path); 336 if (!path)
337 return -ENOMEM;
336 ret = btrfs_search_slot(trans, root, key, path, -1, 1); 338 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
337 if (ret < 0) 339 if (ret < 0)
338 goto out; 340 goto out;
@@ -471,3 +473,21 @@ again:
471 btrfs_free_path(path); 473 btrfs_free_path(path);
472 return 0; 474 return 0;
473} 475}
476
477/*
478 * Old versions of btrfs forget to init root_item->flags and root_item->byte_limit
479 * for subvolumes. To work around this problem, we steal a bit from
480 * root_item->inode_item->flags, and use it to indicate if those fields
481 * have been properly initialized.
482 */
483void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
484{
485 u64 inode_flags = le64_to_cpu(root_item->inode.flags);
486
487 if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
488 inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
489 root_item->inode.flags = cpu_to_le64(inode_flags);
490 root_item->flags = 0;
491 root_item->byte_limit = 0;
492 }
493}
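
btrfs_check_and_init_root_item() spends one bit of the embedded inode item's flags as a "these fields are valid" marker: if the bit is clear, the root item predates the fields, so the bit is set and flags/byte_limit are zeroed. A simplified sketch of the trick (plain integers instead of on-disk le64 fields, and an illustrative bit position):

	#include <stdint.h>

	#define ROOT_ITEM_INIT (1ULL << 31)   /* illustrative bit position */

	struct root_item_lite {
		uint64_t inode_flags;   /* stands in for the le64 inode.flags */
		uint64_t flags;
		uint64_t byte_limit;
	};

	static void check_and_init(struct root_item_lite *ri)
	{
		if (!(ri->inode_flags & ROOT_ITEM_INIT)) {
			ri->inode_flags |= ROOT_ITEM_INIT;  /* mark valid */
			ri->flags = 0;        /* old filesystems left these */
			ri->byte_limit = 0;   /* two fields uninitialized   */
		}
	}

	int main(void)
	{
		struct root_item_lite ri = { 0, 0xdead, 0xbeef };  /* old-fs junk */
		check_and_init(&ri);
		return (ri.flags == 0 && ri.byte_limit == 0) ? 0 : 1;
	}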
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d39a9895d932..0ac712efcdf2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -52,6 +52,9 @@
52#include "export.h" 52#include "export.h"
53#include "compression.h" 53#include "compression.h"
54 54
55#define CREATE_TRACE_POINTS
56#include <trace/events/btrfs.h>
57
55static const struct super_operations btrfs_super_ops; 58static const struct super_operations btrfs_super_ops;
56 59
57static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 60static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
@@ -156,7 +159,7 @@ enum {
156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 159 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 160 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 161 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
159 Opt_enospc_debug, Opt_err, 162 Opt_enospc_debug, Opt_subvolrootid, Opt_err,
160}; 163};
161 164
162static match_table_t tokens = { 165static match_table_t tokens = {
@@ -186,6 +189,7 @@ static match_table_t tokens = {
186 {Opt_clear_cache, "clear_cache"}, 189 {Opt_clear_cache, "clear_cache"},
187 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, 190 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
188 {Opt_enospc_debug, "enospc_debug"}, 191 {Opt_enospc_debug, "enospc_debug"},
192 {Opt_subvolrootid, "subvolrootid=%d"},
189 {Opt_err, NULL}, 193 {Opt_err, NULL},
190}; 194};
191 195
@@ -229,6 +233,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
229 break; 233 break;
230 case Opt_subvol: 234 case Opt_subvol:
231 case Opt_subvolid: 235 case Opt_subvolid:
236 case Opt_subvolrootid:
232 case Opt_device: 237 case Opt_device:
233 /* 238 /*
234 * These are parsed by btrfs_parse_early_options 239 * These are parsed by btrfs_parse_early_options
@@ -385,7 +390,7 @@ out:
385 */ 390 */
386static int btrfs_parse_early_options(const char *options, fmode_t flags, 391static int btrfs_parse_early_options(const char *options, fmode_t flags,
387 void *holder, char **subvol_name, u64 *subvol_objectid, 392 void *holder, char **subvol_name, u64 *subvol_objectid,
388 struct btrfs_fs_devices **fs_devices) 393 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
389{ 394{
390 substring_t args[MAX_OPT_ARGS]; 395 substring_t args[MAX_OPT_ARGS];
391 char *opts, *orig, *p; 396 char *opts, *orig, *p;
@@ -426,6 +431,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
426 *subvol_objectid = intarg; 431 *subvol_objectid = intarg;
427 } 432 }
428 break; 433 break;
434 case Opt_subvolrootid:
435 intarg = 0;
436 error = match_int(&args[0], &intarg);
437 if (!error) {
438 /* we want the original fs_tree */
439 if (!intarg)
440 *subvol_rootid =
441 BTRFS_FS_TREE_OBJECTID;
442 else
443 *subvol_rootid = intarg;
444 }
445 break;
429 case Opt_device: 446 case Opt_device:
430 error = btrfs_scan_one_device(match_strdup(&args[0]), 447 error = btrfs_scan_one_device(match_strdup(&args[0]),
431 flags, holder, fs_devices); 448 flags, holder, fs_devices);
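
Adding subvolrootid follows the usual three hooks for a btrfs mount option:
a token in the enum, a "%d" pattern in the tokens table (which feeds
match_int()), and a case in the early parser. A condensed sketch of how the
value flows through lib/parser.h (parse_subvolrootid() is a hypothetical
wrapper, not a kernel function):

    /* Sketch, assuming the tokens table shown in the hunk above. */
    static int parse_subvolrootid(char *p, u64 *subvol_rootid)
    {
            substring_t args[MAX_OPT_ARGS];
            int intarg = 0;

            if (match_token(p, tokens, args) != Opt_subvolrootid)
                    return -EINVAL;
            if (match_int(&args[0], &intarg))
                    return -EINVAL;
            /* subvolrootid=0 selects the original fs tree */
            *subvol_rootid = intarg ? intarg : BTRFS_FS_TREE_OBJECTID;
            return 0;
    }
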
@@ -620,6 +637,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
620 struct btrfs_root *root = btrfs_sb(sb); 637 struct btrfs_root *root = btrfs_sb(sb);
621 int ret; 638 int ret;
622 639
640 trace_btrfs_sync_fs(wait);
641
623 if (!wait) { 642 if (!wait) {
624 filemap_flush(root->fs_info->btree_inode->i_mapping); 643 filemap_flush(root->fs_info->btree_inode->i_mapping);
625 return 0; 644 return 0;
@@ -639,6 +658,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
639{ 658{
640 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); 659 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
641 struct btrfs_fs_info *info = root->fs_info; 660 struct btrfs_fs_info *info = root->fs_info;
661 char *compress_type;
642 662
643 if (btrfs_test_opt(root, DEGRADED)) 663 if (btrfs_test_opt(root, DEGRADED))
644 seq_puts(seq, ",degraded"); 664 seq_puts(seq, ",degraded");
@@ -657,8 +677,16 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
657 if (info->thread_pool_size != min_t(unsigned long, 677 if (info->thread_pool_size != min_t(unsigned long,
658 num_online_cpus() + 2, 8)) 678 num_online_cpus() + 2, 8))
659 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); 679 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
660 if (btrfs_test_opt(root, COMPRESS)) 680 if (btrfs_test_opt(root, COMPRESS)) {
661 seq_puts(seq, ",compress"); 681 if (info->compress_type == BTRFS_COMPRESS_ZLIB)
682 compress_type = "zlib";
683 else
684 compress_type = "lzo";
685 if (btrfs_test_opt(root, FORCE_COMPRESS))
686 seq_printf(seq, ",compress-force=%s", compress_type);
687 else
688 seq_printf(seq, ",compress=%s", compress_type);
689 }
662 if (btrfs_test_opt(root, NOSSD)) 690 if (btrfs_test_opt(root, NOSSD))
663 seq_puts(seq, ",nossd"); 691 seq_puts(seq, ",nossd");
664 if (btrfs_test_opt(root, SSD_SPREAD)) 692 if (btrfs_test_opt(root, SSD_SPREAD))
@@ -673,6 +701,12 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
673 seq_puts(seq, ",discard"); 701 seq_puts(seq, ",discard");
674 if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) 702 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
675 seq_puts(seq, ",noacl"); 703 seq_puts(seq, ",noacl");
704 if (btrfs_test_opt(root, SPACE_CACHE))
705 seq_puts(seq, ",space_cache");
706 if (btrfs_test_opt(root, CLEAR_CACHE))
707 seq_puts(seq, ",clear_cache");
708 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
709 seq_puts(seq, ",user_subvol_rm_allowed");
676 return 0; 710 return 0;
677} 711}
678 712
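
With these additions, every option btrfs honors is reflected back through
/proc/mounts, so a mount line round-trips on remount. Illustrative output
(device and mount point made up):

    /dev/sdb on /mnt type btrfs (rw,compress-force=zlib,space_cache,user_subvol_rm_allowed)
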
@@ -716,6 +750,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
716 fmode_t mode = FMODE_READ; 750 fmode_t mode = FMODE_READ;
717 char *subvol_name = NULL; 751 char *subvol_name = NULL;
718 u64 subvol_objectid = 0; 752 u64 subvol_objectid = 0;
753 u64 subvol_rootid = 0;
719 int error = 0; 754 int error = 0;
720 755
721 if (!(flags & MS_RDONLY)) 756 if (!(flags & MS_RDONLY))
@@ -723,7 +758,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
723 758
724 error = btrfs_parse_early_options(data, mode, fs_type, 759 error = btrfs_parse_early_options(data, mode, fs_type,
725 &subvol_name, &subvol_objectid, 760 &subvol_name, &subvol_objectid,
726 &fs_devices); 761 &subvol_rootid, &fs_devices);
727 if (error) 762 if (error)
728 return ERR_PTR(error); 763 return ERR_PTR(error);
729 764
@@ -787,15 +822,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
787 s->s_flags |= MS_ACTIVE; 822 s->s_flags |= MS_ACTIVE;
788 } 823 }
789 824
790 root = get_default_root(s, subvol_objectid);
791 if (IS_ERR(root)) {
792 error = PTR_ERR(root);
793 deactivate_locked_super(s);
794 goto error_free_subvol_name;
795 }
796 /* if they gave us a subvolume name, bind mount into that */ 825 /* if they gave us a subvolume name, bind mount into that */
797 if (strcmp(subvol_name, ".")) { 826 if (strcmp(subvol_name, ".")) {
798 struct dentry *new_root; 827 struct dentry *new_root;
828
829 root = get_default_root(s, subvol_rootid);
830 if (IS_ERR(root)) {
831 error = PTR_ERR(root);
832 deactivate_locked_super(s);
833 goto error_free_subvol_name;
834 }
835
799 mutex_lock(&root->d_inode->i_mutex); 836 mutex_lock(&root->d_inode->i_mutex);
800 new_root = lookup_one_len(subvol_name, root, 837 new_root = lookup_one_len(subvol_name, root,
801 strlen(subvol_name)); 838 strlen(subvol_name));
@@ -816,6 +853,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
816 } 853 }
817 dput(root); 854 dput(root);
818 root = new_root; 855 root = new_root;
856 } else {
857 root = get_default_root(s, subvol_objectid);
858 if (IS_ERR(root)) {
859 error = PTR_ERR(root);
860 deactivate_locked_super(s);
861 goto error_free_subvol_name;
862 }
819 } 863 }
820 864
821 kfree(subvol_name); 865 kfree(subvol_name);
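
The reordering matters because the two lookups resolve against different
trees: a subvol=<name> mount must start from the tree selected by
subvolrootid=, while a plain subvolid= mount resolves the object id
directly. Control-flow sketch of the hunk above, with locking and error
paths elided:

    /* Sketch only; the real code takes i_mutex and checks IS_ERR(). */
    if (strcmp(subvol_name, ".")) {
            /* name is looked up relative to the subvolrootid= tree */
            root = get_default_root(s, subvol_rootid);
            root = lookup_one_len(subvol_name, root, strlen(subvol_name));
    } else {
            /* no name: resolve subvolid= (or the default subvolume) */
            root = get_default_root(s, subvol_objectid);
    }
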
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3d73c8d93bbb..c571734d5e5a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -32,10 +32,8 @@
32 32
33static noinline void put_transaction(struct btrfs_transaction *transaction) 33static noinline void put_transaction(struct btrfs_transaction *transaction)
34{ 34{
35 WARN_ON(transaction->use_count == 0); 35 WARN_ON(atomic_read(&transaction->use_count) == 0);
36 transaction->use_count--; 36 if (atomic_dec_and_test(&transaction->use_count)) {
37 if (transaction->use_count == 0) {
38 list_del_init(&transaction->list);
39 memset(transaction, 0, sizeof(*transaction)); 37 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction); 38 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 } 39 }
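
This is the standard conversion from "int count under a mutex" to a
lock-free reference put: the decrement and the zero test must be a single
atomic operation, otherwise two droppers can both read 1 and both free (or
both skip the free). Generic sketch of the pattern (my_object/put_object
are illustrative names):

    struct my_object {
            atomic_t use_count;
            /* ... payload ... */
    };

    static void put_object(struct my_object *obj)
    {
            WARN_ON(atomic_read(&obj->use_count) == 0);
            /* true for exactly one caller: the one dropping the last ref */
            if (atomic_dec_and_test(&obj->use_count))
                    kfree(obj);
    }
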
@@ -57,16 +55,17 @@ static noinline int join_transaction(struct btrfs_root *root)
57 if (!cur_trans) { 55 if (!cur_trans) {
58 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, 56 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
59 GFP_NOFS); 57 GFP_NOFS);
60 BUG_ON(!cur_trans); 58 if (!cur_trans)
59 return -ENOMEM;
61 root->fs_info->generation++; 60 root->fs_info->generation++;
62 cur_trans->num_writers = 1; 61 atomic_set(&cur_trans->num_writers, 1);
63 cur_trans->num_joined = 0; 62 cur_trans->num_joined = 0;
64 cur_trans->transid = root->fs_info->generation; 63 cur_trans->transid = root->fs_info->generation;
65 init_waitqueue_head(&cur_trans->writer_wait); 64 init_waitqueue_head(&cur_trans->writer_wait);
66 init_waitqueue_head(&cur_trans->commit_wait); 65 init_waitqueue_head(&cur_trans->commit_wait);
67 cur_trans->in_commit = 0; 66 cur_trans->in_commit = 0;
68 cur_trans->blocked = 0; 67 cur_trans->blocked = 0;
69 cur_trans->use_count = 1; 68 atomic_set(&cur_trans->use_count, 1);
70 cur_trans->commit_done = 0; 69 cur_trans->commit_done = 0;
71 cur_trans->start_time = get_seconds(); 70 cur_trans->start_time = get_seconds();
72 71
@@ -87,7 +86,7 @@ static noinline int join_transaction(struct btrfs_root *root)
87 root->fs_info->running_transaction = cur_trans; 86 root->fs_info->running_transaction = cur_trans;
88 spin_unlock(&root->fs_info->new_trans_lock); 87 spin_unlock(&root->fs_info->new_trans_lock);
89 } else { 88 } else {
90 cur_trans->num_writers++; 89 atomic_inc(&cur_trans->num_writers);
91 cur_trans->num_joined++; 90 cur_trans->num_joined++;
92 } 91 }
93 92
@@ -144,7 +143,7 @@ static void wait_current_trans(struct btrfs_root *root)
144 cur_trans = root->fs_info->running_transaction; 143 cur_trans = root->fs_info->running_transaction;
145 if (cur_trans && cur_trans->blocked) { 144 if (cur_trans && cur_trans->blocked) {
146 DEFINE_WAIT(wait); 145 DEFINE_WAIT(wait);
147 cur_trans->use_count++; 146 atomic_inc(&cur_trans->use_count);
148 while (1) { 147 while (1) {
149 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 148 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
150 TASK_UNINTERRUPTIBLE); 149 TASK_UNINTERRUPTIBLE);
@@ -180,6 +179,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
180{ 179{
181 struct btrfs_trans_handle *h; 180 struct btrfs_trans_handle *h;
182 struct btrfs_transaction *cur_trans; 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
183 int ret; 183 int ret;
184 184
185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -195,10 +195,15 @@ again:
195 wait_current_trans(root); 195 wait_current_trans(root);
196 196
197 ret = join_transaction(root); 197 ret = join_transaction(root);
198 BUG_ON(ret); 198 if (ret < 0) {
199 kmem_cache_free(btrfs_trans_handle_cachep, h);
200 if (type != TRANS_JOIN_NOLOCK)
201 mutex_unlock(&root->fs_info->trans_mutex);
202 return ERR_PTR(ret);
203 }
199 204
200 cur_trans = root->fs_info->running_transaction; 205 cur_trans = root->fs_info->running_transaction;
201 cur_trans->use_count++; 206 atomic_inc(&cur_trans->use_count);
202 if (type != TRANS_JOIN_NOLOCK) 207 if (type != TRANS_JOIN_NOLOCK)
203 mutex_unlock(&root->fs_info->trans_mutex); 208 mutex_unlock(&root->fs_info->trans_mutex);
204 209
@@ -218,10 +223,18 @@ again:
218 223
219 if (num_items > 0) { 224 if (num_items > 0) {
220 ret = btrfs_trans_reserve_metadata(h, root, num_items); 225 ret = btrfs_trans_reserve_metadata(h, root, num_items);
221 if (ret == -EAGAIN) { 226 if (ret == -EAGAIN && !retries) {
227 retries++;
222 btrfs_commit_transaction(h, root); 228 btrfs_commit_transaction(h, root);
223 goto again; 229 goto again;
230 } else if (ret == -EAGAIN) {
231 /*
232 * We have already retried and still got EAGAIN, so we
233 * really are out of space; set ret to -ENOSPC.
234 */
235 ret = -ENOSPC;
224 } 236 }
237
225 if (ret < 0) { 238 if (ret < 0) {
226 btrfs_end_transaction(h, root); 239 btrfs_end_transaction(h, root);
227 return ERR_PTR(ret); 240 return ERR_PTR(ret);
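
The shape here is "retry a transient failure exactly once, then treat it as
hard". Condensed sketch of just the reservation logic (glossing over the
fact that the real path restarts the whole handle after the commit;
reserve_with_one_retry() is illustrative):

    static int reserve_with_one_retry(struct btrfs_trans_handle *h,
                                      struct btrfs_root *root, int num_items)
    {
            int retries = 0;
            int ret;
    again:
            ret = btrfs_trans_reserve_metadata(h, root, num_items);
            if (ret == -EAGAIN && !retries) {
                    retries++;
                    /* committing can free pinned space; try once more */
                    btrfs_commit_transaction(h, root);
                    goto again;
            }
            if (ret == -EAGAIN)
                    ret = -ENOSPC;  /* already retried: really out of space */
            return ret;
    }
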
@@ -321,7 +334,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
321 goto out_unlock; /* nothing committing|committed */ 334 goto out_unlock; /* nothing committing|committed */
322 } 335 }
323 336
324 cur_trans->use_count++; 337 atomic_inc(&cur_trans->use_count);
325 mutex_unlock(&root->fs_info->trans_mutex); 338 mutex_unlock(&root->fs_info->trans_mutex);
326 339
327 wait_for_commit(root, cur_trans); 340 wait_for_commit(root, cur_trans);
@@ -451,18 +464,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
451 wake_up_process(info->transaction_kthread); 464 wake_up_process(info->transaction_kthread);
452 } 465 }
453 466
454 if (lock)
455 mutex_lock(&info->trans_mutex);
456 WARN_ON(cur_trans != info->running_transaction); 467 WARN_ON(cur_trans != info->running_transaction);
457 WARN_ON(cur_trans->num_writers < 1); 468 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
458 cur_trans->num_writers--; 469 atomic_dec(&cur_trans->num_writers);
459 470
460 smp_mb(); 471 smp_mb();
461 if (waitqueue_active(&cur_trans->writer_wait)) 472 if (waitqueue_active(&cur_trans->writer_wait))
462 wake_up(&cur_trans->writer_wait); 473 wake_up(&cur_trans->writer_wait);
463 put_transaction(cur_trans); 474 put_transaction(cur_trans);
464 if (lock)
465 mutex_unlock(&info->trans_mutex);
466 475
467 if (current->journal_info == trans) 476 if (current->journal_info == trans)
468 current->journal_info = NULL; 477 current->journal_info = NULL;
@@ -970,6 +979,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
970 record_root_in_trans(trans, root); 979 record_root_in_trans(trans, root);
971 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 980 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
972 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 981 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
982 btrfs_check_and_init_root_item(new_root_item);
973 983
974 root_flags = btrfs_root_flags(new_root_item); 984 root_flags = btrfs_root_flags(new_root_item);
975 if (pending->readonly) 985 if (pending->readonly)
@@ -1156,7 +1166,8 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1156 struct btrfs_transaction *cur_trans; 1166 struct btrfs_transaction *cur_trans;
1157 1167
1158 ac = kmalloc(sizeof(*ac), GFP_NOFS); 1168 ac = kmalloc(sizeof(*ac), GFP_NOFS);
1159 BUG_ON(!ac); 1169 if (!ac)
1170 return -ENOMEM;
1160 1171
1161 INIT_DELAYED_WORK(&ac->work, do_async_commit); 1172 INIT_DELAYED_WORK(&ac->work, do_async_commit);
1162 ac->root = root; 1173 ac->root = root;
@@ -1170,7 +1181,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1170 /* take transaction reference */ 1181 /* take transaction reference */
1171 mutex_lock(&root->fs_info->trans_mutex); 1182 mutex_lock(&root->fs_info->trans_mutex);
1172 cur_trans = trans->transaction; 1183 cur_trans = trans->transaction;
1173 cur_trans->use_count++; 1184 atomic_inc(&cur_trans->use_count);
1174 mutex_unlock(&root->fs_info->trans_mutex); 1185 mutex_unlock(&root->fs_info->trans_mutex);
1175 1186
1176 btrfs_end_transaction(trans, root); 1187 btrfs_end_transaction(trans, root);
@@ -1229,7 +1240,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1229 1240
1230 mutex_lock(&root->fs_info->trans_mutex); 1241 mutex_lock(&root->fs_info->trans_mutex);
1231 if (cur_trans->in_commit) { 1242 if (cur_trans->in_commit) {
1232 cur_trans->use_count++; 1243 atomic_inc(&cur_trans->use_count);
1233 mutex_unlock(&root->fs_info->trans_mutex); 1244 mutex_unlock(&root->fs_info->trans_mutex);
1234 btrfs_end_transaction(trans, root); 1245 btrfs_end_transaction(trans, root);
1235 1246
@@ -1251,7 +1262,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1251 prev_trans = list_entry(cur_trans->list.prev, 1262 prev_trans = list_entry(cur_trans->list.prev,
1252 struct btrfs_transaction, list); 1263 struct btrfs_transaction, list);
1253 if (!prev_trans->commit_done) { 1264 if (!prev_trans->commit_done) {
1254 prev_trans->use_count++; 1265 atomic_inc(&prev_trans->use_count);
1255 mutex_unlock(&root->fs_info->trans_mutex); 1266 mutex_unlock(&root->fs_info->trans_mutex);
1256 1267
1257 wait_for_commit(root, prev_trans); 1268 wait_for_commit(root, prev_trans);
@@ -1292,14 +1303,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1292 TASK_UNINTERRUPTIBLE); 1303 TASK_UNINTERRUPTIBLE);
1293 1304
1294 smp_mb(); 1305 smp_mb();
1295 if (cur_trans->num_writers > 1) 1306 if (atomic_read(&cur_trans->num_writers) > 1)
1296 schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1307 schedule_timeout(MAX_SCHEDULE_TIMEOUT);
1297 else if (should_grow) 1308 else if (should_grow)
1298 schedule_timeout(1); 1309 schedule_timeout(1);
1299 1310
1300 mutex_lock(&root->fs_info->trans_mutex); 1311 mutex_lock(&root->fs_info->trans_mutex);
1301 finish_wait(&cur_trans->writer_wait, &wait); 1312 finish_wait(&cur_trans->writer_wait, &wait);
1302 } while (cur_trans->num_writers > 1 || 1313 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1303 (should_grow && cur_trans->num_joined != joined)); 1314 (should_grow && cur_trans->num_joined != joined));
1304 1315
1305 ret = create_pending_snapshots(trans, root->fs_info); 1316 ret = create_pending_snapshots(trans, root->fs_info);
@@ -1386,9 +1397,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1386 1397
1387 wake_up(&cur_trans->commit_wait); 1398 wake_up(&cur_trans->commit_wait);
1388 1399
1400 list_del_init(&cur_trans->list);
1389 put_transaction(cur_trans); 1401 put_transaction(cur_trans);
1390 put_transaction(cur_trans); 1402 put_transaction(cur_trans);
1391 1403
1404 trace_btrfs_transaction_commit(root);
1405
1392 mutex_unlock(&root->fs_info->trans_mutex); 1406 mutex_unlock(&root->fs_info->trans_mutex);
1393 1407
1394 if (current->journal_info == trans) 1408 if (current->journal_info == trans)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 229a594cacd5..e441acc6c584 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,11 +27,11 @@ struct btrfs_transaction {
27 * total writers in this transaction, it must be zero before the 27 * total writers in this transaction, it must be zero before the
28 * transaction can end 28 * transaction can end
29 */ 29 */
30 unsigned long num_writers; 30 atomic_t num_writers;
31 31
32 unsigned long num_joined; 32 unsigned long num_joined;
33 int in_commit; 33 int in_commit;
34 int use_count; 34 atomic_t use_count;
35 int commit_done; 35 int commit_done;
36 int blocked; 36 int blocked;
37 struct list_head list; 37 struct list_head list;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a4bbb854dfd2..c50271ad3157 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -799,12 +799,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
799 struct inode *dir; 799 struct inode *dir;
800 int ret; 800 int ret;
801 struct btrfs_inode_ref *ref; 801 struct btrfs_inode_ref *ref;
802 struct btrfs_dir_item *di;
803 struct inode *inode; 802 struct inode *inode;
804 char *name; 803 char *name;
805 int namelen; 804 int namelen;
806 unsigned long ref_ptr; 805 unsigned long ref_ptr;
807 unsigned long ref_end; 806 unsigned long ref_end;
807 int search_done = 0;
808 808
809 /* 809 /*
810 * it is possible that we didn't log all the parent directories 810 * it is possible that we didn't log all the parent directories
@@ -845,7 +845,10 @@ again:
845 * existing back reference, and we don't want to create 845 * existing back reference, and we don't want to create
846 * dangling pointers in the directory. 846 * dangling pointers in the directory.
847 */ 847 */
848conflict_again: 848
849 if (search_done)
850 goto insert;
851
849 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 852 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
850 if (ret == 0) { 853 if (ret == 0) {
851 char *victim_name; 854 char *victim_name;
@@ -886,37 +889,21 @@ conflict_again:
886 ret = btrfs_unlink_inode(trans, root, dir, 889 ret = btrfs_unlink_inode(trans, root, dir,
887 inode, victim_name, 890 inode, victim_name,
888 victim_name_len); 891 victim_name_len);
889 kfree(victim_name);
890 btrfs_release_path(root, path);
891 goto conflict_again;
892 } 892 }
893 kfree(victim_name); 893 kfree(victim_name);
894 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 894 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
895 } 895 }
896 BUG_ON(ret); 896 BUG_ON(ret);
897 }
898 btrfs_release_path(root, path);
899
900 /* look for a conflicting sequence number */
901 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
902 btrfs_inode_ref_index(eb, ref),
903 name, namelen, 0);
904 if (di && !IS_ERR(di)) {
905 ret = drop_one_dir_item(trans, root, path, dir, di);
906 BUG_ON(ret);
907 }
908 btrfs_release_path(root, path);
909 897
910 898 /*
911 /* look for a conflicting name */ 899 * NOTE: we have searched the root tree and checked the
912 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 900 * corresponding ref, so it does not need to be checked again.
913 name, namelen, 0); 901 */
914 if (di && !IS_ERR(di)) { 902 search_done = 1;
915 ret = drop_one_dir_item(trans, root, path, dir, di);
916 BUG_ON(ret);
917 } 903 }
918 btrfs_release_path(root, path); 904 btrfs_release_path(root, path);
919 905
906insert:
920 /* insert our name */ 907 /* insert our name */
921 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 908 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
922 btrfs_inode_ref_index(eb, ref)); 909 btrfs_inode_ref_index(eb, ref));
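
The old conflict_again loop restarted the whole conflict scan after every
victim it unlinked; the search_done flag makes the scan run at most once
per ref item, after which later names jump straight to the insert.
Control-flow sketch (add_names() is illustrative):

    static void add_names(int nr_names)
    {
            int search_done = 0;
            int i;

            for (i = 0; i < nr_names; i++) {
                    if (!search_done) {
                            /* unlink all conflicting victims in one pass */
                            search_done = 1;
                    }
                    /* ... insert name i ... */
            }
    }
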
@@ -1286,6 +1273,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1286 ptr_end = ptr + item_size; 1273 ptr_end = ptr + item_size;
1287 while (ptr < ptr_end) { 1274 while (ptr < ptr_end) {
1288 di = (struct btrfs_dir_item *)ptr; 1275 di = (struct btrfs_dir_item *)ptr;
1276 if (verify_dir_item(root, eb, di))
1277 return -EIO;
1289 name_len = btrfs_dir_name_len(eb, di); 1278 name_len = btrfs_dir_name_len(eb, di);
1290 ret = replay_one_name(trans, root, path, eb, di, key); 1279 ret = replay_one_name(trans, root, path, eb, di, key);
1291 BUG_ON(ret); 1280 BUG_ON(ret);
@@ -1412,6 +1401,11 @@ again:
1412 ptr_end = ptr + item_size; 1401 ptr_end = ptr + item_size;
1413 while (ptr < ptr_end) { 1402 while (ptr < ptr_end) {
1414 di = (struct btrfs_dir_item *)ptr; 1403 di = (struct btrfs_dir_item *)ptr;
1404 if (verify_dir_item(root, eb, di)) {
1405 ret = -EIO;
1406 goto out;
1407 }
1408
1415 name_len = btrfs_dir_name_len(eb, di); 1409 name_len = btrfs_dir_name_len(eb, di);
1416 name = kmalloc(name_len, GFP_NOFS); 1410 name = kmalloc(name_len, GFP_NOFS);
1417 if (!name) { 1411 if (!name) {
@@ -1821,7 +1815,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1821 int orig_level; 1815 int orig_level;
1822 1816
1823 path = btrfs_alloc_path(); 1817 path = btrfs_alloc_path();
1824 BUG_ON(!path); 1818 if (!path)
1819 return -ENOMEM;
1825 1820
1826 level = btrfs_header_level(log->node); 1821 level = btrfs_header_level(log->node);
1827 orig_level = level; 1822 orig_level = level;
@@ -3107,9 +3102,11 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3107 .stage = 0, 3102 .stage = 0,
3108 }; 3103 };
3109 3104
3110 fs_info->log_root_recovering = 1;
3111 path = btrfs_alloc_path(); 3105 path = btrfs_alloc_path();
3112 BUG_ON(!path); 3106 if (!path)
3107 return -ENOMEM;
3108
3109 fs_info->log_root_recovering = 1;
3113 3110
3114 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3111 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3115 BUG_ON(IS_ERR(trans)); 3112 BUG_ON(IS_ERR(trans));
@@ -3117,7 +3114,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3117 wc.trans = trans; 3114 wc.trans = trans;
3118 wc.pin = 1; 3115 wc.pin = 1;
3119 3116
3120 walk_log_tree(trans, log_root_tree, &wc); 3117 ret = walk_log_tree(trans, log_root_tree, &wc);
3118 BUG_ON(ret);
3121 3119
3122again: 3120again:
3123 key.objectid = BTRFS_TREE_LOG_OBJECTID; 3121 key.objectid = BTRFS_TREE_LOG_OBJECTID;
@@ -3141,8 +3139,7 @@ again:
3141 3139
3142 log = btrfs_read_fs_root_no_radix(log_root_tree, 3140 log = btrfs_read_fs_root_no_radix(log_root_tree,
3143 &found_key); 3141 &found_key);
3144 BUG_ON(!log); 3142 BUG_ON(IS_ERR(log));
3145
3146 3143
3147 tmp_key.objectid = found_key.offset; 3144 tmp_key.objectid = found_key.offset;
3148 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 3145 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd13eb81ee40..309a57b9fc85 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -33,17 +33,6 @@
33#include "volumes.h" 33#include "volumes.h"
34#include "async-thread.h" 34#include "async-thread.h"
35 35
36struct map_lookup {
37 u64 type;
38 int io_align;
39 int io_width;
40 int stripe_len;
41 int sector_size;
42 int num_stripes;
43 int sub_stripes;
44 struct btrfs_bio_stripe stripes[];
45};
46
47static int init_first_rw_device(struct btrfs_trans_handle *trans, 36static int init_first_rw_device(struct btrfs_trans_handle *trans,
48 struct btrfs_root *root, 37 struct btrfs_root *root,
49 struct btrfs_device *device); 38 struct btrfs_device *device);
@@ -162,7 +151,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
162 struct bio *cur; 151 struct bio *cur;
163 int again = 0; 152 int again = 0;
164 unsigned long num_run; 153 unsigned long num_run;
165 unsigned long num_sync_run;
166 unsigned long batch_run = 0; 154 unsigned long batch_run = 0;
167 unsigned long limit; 155 unsigned long limit;
168 unsigned long last_waited = 0; 156 unsigned long last_waited = 0;
@@ -173,11 +161,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
173 limit = btrfs_async_submit_limit(fs_info); 161 limit = btrfs_async_submit_limit(fs_info);
174 limit = limit * 2 / 3; 162 limit = limit * 2 / 3;
175 163
176 /* we want to make sure that every time we switch from the sync
177 * list to the normal list, we unplug
178 */
179 num_sync_run = 0;
180
181loop: 164loop:
182 spin_lock(&device->io_lock); 165 spin_lock(&device->io_lock);
183 166
@@ -223,15 +206,6 @@ loop_lock:
223 206
224 spin_unlock(&device->io_lock); 207 spin_unlock(&device->io_lock);
225 208
226 /*
227 * if we're doing the regular priority list, make sure we unplug
228 * for any high prio bios we've sent down
229 */
230 if (pending_bios == &device->pending_bios && num_sync_run > 0) {
231 num_sync_run = 0;
232 blk_run_backing_dev(bdi, NULL);
233 }
234
235 while (pending) { 209 while (pending) {
236 210
237 rmb(); 211 rmb();
@@ -259,19 +233,11 @@ loop_lock:
259 233
260 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 234 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
261 235
262 if (cur->bi_rw & REQ_SYNC)
263 num_sync_run++;
264
265 submit_bio(cur->bi_rw, cur); 236 submit_bio(cur->bi_rw, cur);
266 num_run++; 237 num_run++;
267 batch_run++; 238 batch_run++;
268 if (need_resched()) { 239 if (need_resched())
269 if (num_sync_run) {
270 blk_run_backing_dev(bdi, NULL);
271 num_sync_run = 0;
272 }
273 cond_resched(); 240 cond_resched();
274 }
275 241
276 /* 242 /*
277 * we made progress, there is more work to do and the bdi 243 * we made progress, there is more work to do and the bdi
@@ -304,13 +270,8 @@ loop_lock:
304 * against it before looping 270 * against it before looping
305 */ 271 */
306 last_waited = ioc->last_waited; 272 last_waited = ioc->last_waited;
307 if (need_resched()) { 273 if (need_resched())
308 if (num_sync_run) {
309 blk_run_backing_dev(bdi, NULL);
310 num_sync_run = 0;
311 }
312 cond_resched(); 274 cond_resched();
313 }
314 continue; 275 continue;
315 } 276 }
316 spin_lock(&device->io_lock); 277 spin_lock(&device->io_lock);
@@ -323,22 +284,6 @@ loop_lock:
323 } 284 }
324 } 285 }
325 286
326 if (num_sync_run) {
327 num_sync_run = 0;
328 blk_run_backing_dev(bdi, NULL);
329 }
330 /*
331 * IO has already been through a long path to get here. Checksumming,
332 * async helper threads, perhaps compression. We've done a pretty
333 * good job of collecting a batch of IO and should just unplug
334 * the device right away.
335 *
336 * This will help anyone who is waiting on the IO, they might have
337 * already unplugged, but managed to do so before the bio they
338 * cared about found its way down here.
339 */
340 blk_run_backing_dev(bdi, NULL);
341
342 cond_resched(); 287 cond_resched();
343 if (again) 288 if (again)
344 goto loop; 289 goto loop;
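
These deletions track the block layer dropping per-device unplugging in the
same release; submitters that want batching now plug on-stack instead. A
sketch of that replacement pattern, assuming the blk_start_plug()/
blk_finish_plug() API from the reworked block layer (submit_batch() is
illustrative):

    #include <linux/blkdev.h>

    static void submit_batch(struct bio **bios, int nr)
    {
            struct blk_plug plug;
            int i;

            /* bios queue on the task, not the device, until unplugged */
            blk_start_plug(&plug);
            for (i = 0; i < nr; i++)
                    submit_bio(bios[i]->bi_rw, bios[i]);
            blk_finish_plug(&plug);  /* flush the whole batch */
    }
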
@@ -1923,6 +1868,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1923 1868
1924 BUG_ON(ret); 1869 BUG_ON(ret);
1925 1870
1871 trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1872
1926 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1873 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1927 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1874 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1928 BUG_ON(ret); 1875 BUG_ON(ret);
@@ -2650,6 +2597,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2650 *num_bytes = chunk_bytes_by_type(type, calc_size, 2597 *num_bytes = chunk_bytes_by_type(type, calc_size,
2651 map->num_stripes, sub_stripes); 2598 map->num_stripes, sub_stripes);
2652 2599
2600 trace_btrfs_chunk_alloc(info->chunk_root, map, start, *num_bytes);
2601
2653 em = alloc_extent_map(GFP_NOFS); 2602 em = alloc_extent_map(GFP_NOFS);
2654 if (!em) { 2603 if (!em) {
2655 ret = -ENOMEM; 2604 ret = -ENOMEM;
@@ -2758,6 +2707,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2758 item_size); 2707 item_size);
2759 BUG_ON(ret); 2708 BUG_ON(ret);
2760 } 2709 }
2710
2761 kfree(chunk); 2711 kfree(chunk);
2762 return 0; 2712 return 0;
2763} 2713}
@@ -2955,14 +2905,17 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2955static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2905static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2956 u64 logical, u64 *length, 2906 u64 logical, u64 *length,
2957 struct btrfs_multi_bio **multi_ret, 2907 struct btrfs_multi_bio **multi_ret,
2958 int mirror_num, struct page *unplug_page) 2908 int mirror_num)
2959{ 2909{
2960 struct extent_map *em; 2910 struct extent_map *em;
2961 struct map_lookup *map; 2911 struct map_lookup *map;
2962 struct extent_map_tree *em_tree = &map_tree->map_tree; 2912 struct extent_map_tree *em_tree = &map_tree->map_tree;
2963 u64 offset; 2913 u64 offset;
2964 u64 stripe_offset; 2914 u64 stripe_offset;
2915 u64 stripe_end_offset;
2965 u64 stripe_nr; 2916 u64 stripe_nr;
2917 u64 stripe_nr_orig;
2918 u64 stripe_nr_end;
2966 int stripes_allocated = 8; 2919 int stripes_allocated = 8;
2967 int stripes_required = 1; 2920 int stripes_required = 1;
2968 int stripe_index; 2921 int stripe_index;
@@ -2971,7 +2924,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2971 int max_errors = 0; 2924 int max_errors = 0;
2972 struct btrfs_multi_bio *multi = NULL; 2925 struct btrfs_multi_bio *multi = NULL;
2973 2926
2974 if (multi_ret && !(rw & REQ_WRITE)) 2927 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2975 stripes_allocated = 1; 2928 stripes_allocated = 1;
2976again: 2929again:
2977 if (multi_ret) { 2930 if (multi_ret) {
@@ -2987,11 +2940,6 @@ again:
2987 em = lookup_extent_mapping(em_tree, logical, *length); 2940 em = lookup_extent_mapping(em_tree, logical, *length);
2988 read_unlock(&em_tree->lock); 2941 read_unlock(&em_tree->lock);
2989 2942
2990 if (!em && unplug_page) {
2991 kfree(multi);
2992 return 0;
2993 }
2994
2995 if (!em) { 2943 if (!em) {
2996 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2944 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2997 (unsigned long long)logical, 2945 (unsigned long long)logical,
@@ -3017,7 +2965,15 @@ again:
3017 max_errors = 1; 2965 max_errors = 1;
3018 } 2966 }
3019 } 2967 }
3020 if (multi_ret && (rw & REQ_WRITE) && 2968 if (rw & REQ_DISCARD) {
2969 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2970 BTRFS_BLOCK_GROUP_RAID1 |
2971 BTRFS_BLOCK_GROUP_DUP |
2972 BTRFS_BLOCK_GROUP_RAID10)) {
2973 stripes_required = map->num_stripes;
2974 }
2975 }
2976 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
3021 stripes_allocated < stripes_required) { 2977 stripes_allocated < stripes_required) {
3022 stripes_allocated = map->num_stripes; 2978 stripes_allocated = map->num_stripes;
3023 free_extent_map(em); 2979 free_extent_map(em);
@@ -3037,23 +2993,37 @@ again:
3037 /* stripe_offset is the offset of this block in its stripe*/ 2993 /* stripe_offset is the offset of this block in its stripe*/
3038 stripe_offset = offset - stripe_offset; 2994 stripe_offset = offset - stripe_offset;
3039 2995
3040 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2996 if (rw & REQ_DISCARD)
3041 BTRFS_BLOCK_GROUP_RAID10 | 2997 *length = min_t(u64, em->len - offset, *length);
3042 BTRFS_BLOCK_GROUP_DUP)) { 2998 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2999 BTRFS_BLOCK_GROUP_RAID1 |
3000 BTRFS_BLOCK_GROUP_RAID10 |
3001 BTRFS_BLOCK_GROUP_DUP)) {
3043 /* we limit the length of each bio to what fits in a stripe */ 3002 /* we limit the length of each bio to what fits in a stripe */
3044 *length = min_t(u64, em->len - offset, 3003 *length = min_t(u64, em->len - offset,
3045 map->stripe_len - stripe_offset); 3004 map->stripe_len - stripe_offset);
3046 } else { 3005 } else {
3047 *length = em->len - offset; 3006 *length = em->len - offset;
3048 } 3007 }
3049 3008
3050 if (!multi_ret && !unplug_page) 3009 if (!multi_ret)
3051 goto out; 3010 goto out;
3052 3011
3053 num_stripes = 1; 3012 num_stripes = 1;
3054 stripe_index = 0; 3013 stripe_index = 0;
3055 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3014 stripe_nr_orig = stripe_nr;
3056 if (unplug_page || (rw & REQ_WRITE)) 3015 stripe_nr_end = (offset + *length + map->stripe_len - 1) &
3016 (~(map->stripe_len - 1));
3017 do_div(stripe_nr_end, map->stripe_len);
3018 stripe_end_offset = stripe_nr_end * map->stripe_len -
3019 (offset + *length);
3020 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3021 if (rw & REQ_DISCARD)
3022 num_stripes = min_t(u64, map->num_stripes,
3023 stripe_nr_end - stripe_nr_orig);
3024 stripe_index = do_div(stripe_nr, map->num_stripes);
3025 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3026 if (rw & (REQ_WRITE | REQ_DISCARD))
3057 num_stripes = map->num_stripes; 3027 num_stripes = map->num_stripes;
3058 else if (mirror_num) 3028 else if (mirror_num)
3059 stripe_index = mirror_num - 1; 3029 stripe_index = mirror_num - 1;
@@ -3064,7 +3034,7 @@ again:
3064 } 3034 }
3065 3035
3066 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3036 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3067 if (rw & REQ_WRITE) 3037 if (rw & (REQ_WRITE | REQ_DISCARD))
3068 num_stripes = map->num_stripes; 3038 num_stripes = map->num_stripes;
3069 else if (mirror_num) 3039 else if (mirror_num)
3070 stripe_index = mirror_num - 1; 3040 stripe_index = mirror_num - 1;
@@ -3075,8 +3045,12 @@ again:
3075 stripe_index = do_div(stripe_nr, factor); 3045 stripe_index = do_div(stripe_nr, factor);
3076 stripe_index *= map->sub_stripes; 3046 stripe_index *= map->sub_stripes;
3077 3047
3078 if (unplug_page || (rw & REQ_WRITE)) 3048 if (rw & REQ_WRITE)
3079 num_stripes = map->sub_stripes; 3049 num_stripes = map->sub_stripes;
3050 else if (rw & REQ_DISCARD)
3051 num_stripes = min_t(u64, map->sub_stripes *
3052 (stripe_nr_end - stripe_nr_orig),
3053 map->num_stripes);
3080 else if (mirror_num) 3054 else if (mirror_num)
3081 stripe_index += mirror_num - 1; 3055 stripe_index += mirror_num - 1;
3082 else { 3056 else {
@@ -3094,24 +3068,101 @@ again:
3094 } 3068 }
3095 BUG_ON(stripe_index >= map->num_stripes); 3069 BUG_ON(stripe_index >= map->num_stripes);
3096 3070
3097 for (i = 0; i < num_stripes; i++) { 3071 if (rw & REQ_DISCARD) {
3098 if (unplug_page) { 3072 for (i = 0; i < num_stripes; i++) {
3099 struct btrfs_device *device;
3100 struct backing_dev_info *bdi;
3101
3102 device = map->stripes[stripe_index].dev;
3103 if (device->bdev) {
3104 bdi = blk_get_backing_dev_info(device->bdev);
3105 if (bdi->unplug_io_fn)
3106 bdi->unplug_io_fn(bdi, unplug_page);
3107 }
3108 } else {
3109 multi->stripes[i].physical = 3073 multi->stripes[i].physical =
3110 map->stripes[stripe_index].physical + 3074 map->stripes[stripe_index].physical +
3111 stripe_offset + stripe_nr * map->stripe_len; 3075 stripe_offset + stripe_nr * map->stripe_len;
3112 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3076 multi->stripes[i].dev = map->stripes[stripe_index].dev;
3077
3078 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3079 u64 stripes;
3080 u32 last_stripe = 0;
3081 int j;
3082
3083 div_u64_rem(stripe_nr_end - 1,
3084 map->num_stripes,
3085 &last_stripe);
3086
3087 for (j = 0; j < map->num_stripes; j++) {
3088 u32 test;
3089
3090 div_u64_rem(stripe_nr_end - 1 - j,
3091 map->num_stripes, &test);
3092 if (test == stripe_index)
3093 break;
3094 }
3095 stripes = stripe_nr_end - 1 - j;
3096 do_div(stripes, map->num_stripes);
3097 multi->stripes[i].length = map->stripe_len *
3098 (stripes - stripe_nr + 1);
3099
3100 if (i == 0) {
3101 multi->stripes[i].length -=
3102 stripe_offset;
3103 stripe_offset = 0;
3104 }
3105 if (stripe_index == last_stripe)
3106 multi->stripes[i].length -=
3107 stripe_end_offset;
3108 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3109 u64 stripes;
3110 int j;
3111 int factor = map->num_stripes /
3112 map->sub_stripes;
3113 u32 last_stripe = 0;
3114
3115 div_u64_rem(stripe_nr_end - 1,
3116 factor, &last_stripe);
3117 last_stripe *= map->sub_stripes;
3118
3119 for (j = 0; j < factor; j++) {
3120 u32 test;
3121
3122 div_u64_rem(stripe_nr_end - 1 - j,
3123 factor, &test);
3124
3125 if (test ==
3126 stripe_index / map->sub_stripes)
3127 break;
3128 }
3129 stripes = stripe_nr_end - 1 - j;
3130 do_div(stripes, factor);
3131 multi->stripes[i].length = map->stripe_len *
3132 (stripes - stripe_nr + 1);
3133
3134 if (i < map->sub_stripes) {
3135 multi->stripes[i].length -=
3136 stripe_offset;
3137 if (i == map->sub_stripes - 1)
3138 stripe_offset = 0;
3139 }
3140 if (stripe_index >= last_stripe &&
3141 stripe_index <= (last_stripe +
3142 map->sub_stripes - 1)) {
3143 multi->stripes[i].length -=
3144 stripe_end_offset;
3145 }
3146 } else
3147 multi->stripes[i].length = *length;
3148
3149 stripe_index++;
3150 if (stripe_index == map->num_stripes) {
3151 /* This could only happen for RAID0/10 */
3152 stripe_index = 0;
3153 stripe_nr++;
3154 }
3155 }
3156 } else {
3157 for (i = 0; i < num_stripes; i++) {
3158 multi->stripes[i].physical =
3159 map->stripes[stripe_index].physical +
3160 stripe_offset +
3161 stripe_nr * map->stripe_len;
3162 multi->stripes[i].dev =
3163 map->stripes[stripe_index].dev;
3164 stripe_index++;
3113 } 3165 }
3114 stripe_index++;
3115 } 3166 }
3116 if (multi_ret) { 3167 if (multi_ret) {
3117 *multi_ret = multi; 3168 *multi_ret = multi;
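
A worked RAID0 example makes the discard math concrete. Take stripe_len =
64K, num_stripes = 2, and a 192K discard at offset 96K into the chunk:
stripe_nr = 1, stripe_offset = 32K, stripe_nr_end = 5, stripe_end_offset =
32K. Rows 1 and 3 land on device 1 (128K minus the 32K head trim = 96K);
rows 2 and 4 land on device 0 (128K minus the 32K tail trim = 96K); the two
shares add back up to 192K. Standalone sketch of the same computation
(raid0_discard_lengths() is illustrative, not the kernel function):

    /* Per-device discard lengths for a RAID0 chunk (kernel-style types). */
    static void raid0_discard_lengths(u64 stripe_len, int num_stripes,
                                      u64 offset, u64 length, u64 *out_len)
    {
            u64 stripe_nr = offset / stripe_len;
            u64 stripe_offset = offset - stripe_nr * stripe_len;
            u64 stripe_nr_end = (offset + length + stripe_len - 1) / stripe_len;
            u64 stripe_end_offset = stripe_nr_end * stripe_len -
                                    (offset + length);
            u64 rows = stripe_nr_end - stripe_nr;   /* stripe rows touched */
            int first = stripe_nr % num_stripes;    /* device of first row */
            int last = (stripe_nr_end - 1) % num_stripes; /* device of last */
            int n = rows < (u64)num_stripes ? rows : num_stripes;
            int i;

            for (i = 0; i < n; i++) {
                    /* rows i, i+num_stripes, ... that hit this device */
                    out_len[i] = ((rows - i + num_stripes - 1) / num_stripes)
                                 * stripe_len;
                    if (i == 0)
                            out_len[i] -= stripe_offset;     /* head trim */
                    if ((first + i) % num_stripes == last)
                            out_len[i] -= stripe_end_offset; /* tail trim */
            }
    }
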
@@ -3128,7 +3179,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3128 struct btrfs_multi_bio **multi_ret, int mirror_num) 3179 struct btrfs_multi_bio **multi_ret, int mirror_num)
3129{ 3180{
3130 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3181 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
3131 mirror_num, NULL); 3182 mirror_num);
3132} 3183}
3133 3184
3134int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3185int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -3196,14 +3247,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3196 return 0; 3247 return 0;
3197} 3248}
3198 3249
3199int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
3200 u64 logical, struct page *page)
3201{
3202 u64 length = PAGE_CACHE_SIZE;
3203 return __btrfs_map_block(map_tree, READ, logical, &length,
3204 NULL, 0, page);
3205}
3206
3207static void end_bio_multi_stripe(struct bio *bio, int err) 3250static void end_bio_multi_stripe(struct bio *bio, int err)
3208{ 3251{
3209 struct btrfs_multi_bio *multi = bio->bi_private; 3252 struct btrfs_multi_bio *multi = bio->bi_private;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7fb59d45fe8c..cc2eadaf7a27 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -126,6 +126,7 @@ struct btrfs_fs_devices {
126struct btrfs_bio_stripe { 126struct btrfs_bio_stripe {
127 struct btrfs_device *dev; 127 struct btrfs_device *dev;
128 u64 physical; 128 u64 physical;
129 u64 length; /* only used for discard mappings */
129}; 130};
130 131
131struct btrfs_multi_bio { 132struct btrfs_multi_bio {
@@ -145,6 +146,17 @@ struct btrfs_device_info {
145 u64 max_avail; 146 u64 max_avail;
146}; 147};
147 148
149struct map_lookup {
150 u64 type;
151 int io_align;
152 int io_width;
153 int stripe_len;
154 int sector_size;
155 int num_stripes;
156 int sub_stripes;
157 struct btrfs_bio_stripe stripes[];
158};
159
148/* Used to sort the devices by max_avail(descending sort) */ 160/* Used to sort the devices by max_avail(descending sort) */
149int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2); 161int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
150 162
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index d779cefcfd7d..cfd660550ded 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -180,11 +180,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
180 struct btrfs_path *path; 180 struct btrfs_path *path;
181 struct extent_buffer *leaf; 181 struct extent_buffer *leaf;
182 struct btrfs_dir_item *di; 182 struct btrfs_dir_item *di;
183 int ret = 0, slot, advance; 183 int ret = 0, slot;
184 size_t total_size = 0, size_left = size; 184 size_t total_size = 0, size_left = size;
185 unsigned long name_ptr; 185 unsigned long name_ptr;
186 size_t name_len; 186 size_t name_len;
187 u32 nritems;
188 187
189 /* 188 /*
190 * ok we want all objects associated with this id. 189 * ok we want all objects associated with this id.
@@ -204,34 +203,24 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
204 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 203 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
205 if (ret < 0) 204 if (ret < 0)
206 goto err; 205 goto err;
207 advance = 0; 206
208 while (1) { 207 while (1) {
209 leaf = path->nodes[0]; 208 leaf = path->nodes[0];
210 nritems = btrfs_header_nritems(leaf);
211 slot = path->slots[0]; 209 slot = path->slots[0];
212 210
213 /* this is where we start walking through the path */ 211 /* this is where we start walking through the path */
214 if (advance || slot >= nritems) { 212 if (slot >= btrfs_header_nritems(leaf)) {
215 /* 213 /*
216 * if we've reached the last slot in this leaf we need 214 * if we've reached the last slot in this leaf we need
217 * to go to the next leaf and reset everything 215 * to go to the next leaf and reset everything
218 */ 216 */
219 if (slot >= nritems-1) { 217 ret = btrfs_next_leaf(root, path);
220 ret = btrfs_next_leaf(root, path); 218 if (ret < 0)
221 if (ret) 219 goto err;
222 break; 220 else if (ret > 0)
223 leaf = path->nodes[0]; 221 break;
224 nritems = btrfs_header_nritems(leaf); 222 continue;
225 slot = path->slots[0];
226 } else {
227 /*
228 * just walking through the slots on this leaf
229 */
230 slot++;
231 path->slots[0]++;
232 }
233 } 223 }
234 advance = 1;
235 224
236 btrfs_item_key_to_cpu(leaf, &found_key, slot); 225 btrfs_item_key_to_cpu(leaf, &found_key, slot);
237 226
@@ -242,13 +231,15 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
242 break; 231 break;
243 232
244 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 233 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
234 if (verify_dir_item(root, leaf, di))
235 continue;
245 236
246 name_len = btrfs_dir_name_len(leaf, di); 237 name_len = btrfs_dir_name_len(leaf, di);
247 total_size += name_len + 1; 238 total_size += name_len + 1;
248 239
249 /* we are just looking for how big our buffer needs to be */ 240 /* we are just looking for how big our buffer needs to be */
250 if (!size) 241 if (!size)
251 continue; 242 goto next;
252 243
253 if (!buffer || (name_len + 1) > size_left) { 244 if (!buffer || (name_len + 1) > size_left) {
254 ret = -ERANGE; 245 ret = -ERANGE;
@@ -261,6 +252,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
261 252
262 size_left -= name_len + 1; 253 size_left -= name_len + 1;
263 buffer += name_len + 1; 254 buffer += name_len + 1;
255next:
256 path->slots[0]++;
264 } 257 }
265 ret = total_size; 258 ret = total_size;
266 259
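
The rewrite converges on the canonical btrfs leaf walk: consume the item at
path->slots[0], bump the slot, and let btrfs_next_leaf() handle the leaf
boundary, instead of hand-maintaining advance/nritems. Skeleton of that
walk (walk_items() is illustrative; the per-item work is elided):

    static int walk_items(struct btrfs_root *root, struct btrfs_path *path,
                          struct btrfs_key *key)
    {
            struct extent_buffer *leaf;
            int slot, ret;

            ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
            if (ret < 0)
                    return ret;
            while (1) {
                    leaf = path->nodes[0];
                    slot = path->slots[0];
                    if (slot >= btrfs_header_nritems(leaf)) {
                            ret = btrfs_next_leaf(root, path);
                            if (ret < 0)
                                    return ret;  /* I/O error */
                            if (ret > 0)
                                    break;       /* past the last leaf */
                            continue;            /* path moved to next leaf */
                    }
                    /* ... process the item at (leaf, slot) ... */
                    path->slots[0]++;
            }
            return 0;
    }
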
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index f5ec2d44150d..faccd47c6c46 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -57,7 +57,8 @@ static struct list_head *zlib_alloc_workspace(void)
57 if (!workspace) 57 if (!workspace)
58 return ERR_PTR(-ENOMEM); 58 return ERR_PTR(-ENOMEM);
59 59
60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize(
61 MAX_WBITS, MAX_MEM_LEVEL));
61 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 62 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 63 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
63 if (!workspace->def_strm.workspace || 64 if (!workspace->def_strm.workspace ||
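
zlib_deflate_workspacesize() gained window-bits and mem-level parameters in
this release, so callers that deflate with a smaller window can allocate a
smaller workspace; btrfs sticks with the maximum. Sketch of a caller taking
advantage of that (hypothetical; btrfs itself passes MAX_WBITS and
MAX_MEM_LEVEL as shown above):

    #include <linux/zlib.h>
    #include <linux/vmalloc.h>

    /* An 11-bit window with memLevel 4 needs far less than the default. */
    static void *alloc_small_deflate_workspace(void)
    {
            return vmalloc(zlib_deflate_workspacesize(11, 4));
    }

    /* The matching deflateInit2() must then use the same parameters. */
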