Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/Makefile            |    4
-rw-r--r--  fs/btrfs/acl.c               |   17
-rw-r--r--  fs/btrfs/btrfs_inode.h       |   17
-rw-r--r--  fs/btrfs/compression.c       |    3
-rw-r--r--  fs/btrfs/ctree.c             |   10
-rw-r--r--  fs/btrfs/ctree.h             |  198
-rw-r--r--  fs/btrfs/delayed-inode.c     |   50
-rw-r--r--  fs/btrfs/disk-io.c           |  430
-rw-r--r--  fs/btrfs/disk-io.h           |    4
-rw-r--r--  fs/btrfs/extent-tree.c       |  838
-rw-r--r--  fs/btrfs/extent_io.c         |  223
-rw-r--r--  fs/btrfs/extent_io.h         |   12
-rw-r--r--  fs/btrfs/file-item.c         |   17
-rw-r--r--  fs/btrfs/file.c              |   49
-rw-r--r--  fs/btrfs/free-space-cache.c  |  926
-rw-r--r--  fs/btrfs/inode-map.c         |    6
-rw-r--r--  fs/btrfs/inode.c             |  300
-rw-r--r--  fs/btrfs/ioctl.c             |   97
-rw-r--r--  fs/btrfs/print-tree.c        |    8
-rw-r--r--  fs/btrfs/reada.c             |  949
-rw-r--r--  fs/btrfs/relocation.c        |   24
-rw-r--r--  fs/btrfs/scrub.c             |  114
-rw-r--r--  fs/btrfs/super.c             |  298
-rw-r--r--  fs/btrfs/transaction.c       |  133
-rw-r--r--  fs/btrfs/tree-log.c          |   19
-rw-r--r--  fs/btrfs/volumes.c           |   77
-rw-r--r--  fs/btrfs/volumes.h           |    8
-rw-r--r--  fs/btrfs/xattr.c             |   11
-rw-r--r--  fs/cifs/cifsencrypt.c        |   54
-rw-r--r--  fs/cifs/cifsfs.c             |   10
-rw-r--r--  fs/cifs/cifssmb.c            |    3
-rw-r--r--  fs/cifs/connect.c            |    6
-rw-r--r--  fs/ext3/inode.c              |    4
-rw-r--r--  fs/ext3/namei.c              |    3
-rw-r--r--  fs/ext4/inode.c              |    4
-rw-r--r--  fs/ext4/namei.c              |    3
-rw-r--r--  fs/gfs2/log.c                |    4
-rw-r--r--  fs/gfs2/meta_io.c            |    6
-rw-r--r--  fs/gfs2/ops_fstype.c         |    2
-rw-r--r--  fs/gfs2/quota.c              |    2
-rw-r--r--  fs/hfsplus/super.c           |   15
-rw-r--r--  fs/hfsplus/wrapper.c         |    4
-rw-r--r--  fs/namei.c                   |   12
-rw-r--r--  fs/namespace.c               |    2
-rw-r--r--  fs/nfs/nfs4_fs.h             |    8
-rw-r--r--  fs/nfs/nfs4proc.c            |   20
-rw-r--r--  fs/nfs/nfs4renewd.c          |   12
-rw-r--r--  fs/nfs/nfs4state.c           |    6
-rw-r--r--  fs/nfs/super.c               |   25
-rw-r--r--  fs/nfs/write.c               |    2
-rw-r--r--  fs/proc/task_mmu.c           |   80
-rw-r--r--  fs/quota/quota.c             |    2
-rw-r--r--  fs/stat.c                    |    2
-rw-r--r--  fs/xfs/xfs_aops.c            |    3
-rw-r--r--  fs/xfs/xfs_buf_item.c        |    3
-rw-r--r--  fs/xfs/xfs_dquot_item.c      |   10
-rw-r--r--  fs/xfs/xfs_inode_item.c      |   10
-rw-r--r--  fs/xfs/xfs_linux.h           |    2
-rw-r--r--  fs/xfs/xfs_super.c           |   13
-rw-r--r--  fs/xfs/xfs_trans.h           |    2
-rw-r--r--  fs/xfs/xfs_trans_ail.c       |   83
-rw-r--r--  fs/xfs/xfs_trans_priv.h      |    8
62 files changed, 3745 insertions, 1522 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 89b6ce3634fd..c0ddfd29c5e5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
-	   compression.o delayed-ref.o relocation.o delayed-inode.o backref.o \
-	   scrub.o
+	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+	   reada.o backref.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a11..89b156d85d63 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		if (!value)
 			return ERR_PTR(-ENOMEM);
 		size = __btrfs_getxattr(inode, name, value, size);
-		if (size > 0) {
-			acl = posix_acl_from_xattr(value, size);
-			if (IS_ERR(acl)) {
-				kfree(value);
-				return acl;
-			}
-			set_cached_acl(inode, type, acl);
-		}
-		kfree(value);
+	}
+	if (size > 0) {
+		acl = posix_acl_from_xattr(value, size);
 	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
 		/* FIXME, who returns -ENOENT? I think nobody */
 		acl = NULL;
-		set_cached_acl(inode, type, acl);
 	} else {
 		acl = ERR_PTR(-EIO);
 	}
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
 	return acl;
 }
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd6..5a5d325a3935 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
 	 */
 	u64 delalloc_bytes;
 
-	/* total number of bytes that may be used for this inode for
-	 * delalloc
-	 */
-	u64 reserved_bytes;
-
 	/*
 	 * the size of the file stored in the metadata on disk. data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
 	 */
 	u64 disk_i_size;
 
-	/* flags field from the on disk inode */
-	u32 flags;
-
 	/*
 	 * if this is a directory then index_cnt is the counter for the index
 	 * number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
 	u64 last_unlink_trans;
 
 	/*
+	 * Number of bytes outstanding that are going to need csums. This is
+	 * used in ENOSPC accounting.
+	 */
+	u64 csum_bytes;
+
+	/* flags field from the on disk inode */
+	u32 flags;
+
+	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such. outstanding_extents is the number of extent
 	 * items we think we'll end up using, and reserved_extents is the number
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f1734..14f1c5a0b2d2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
 static inline int compressed_bio_size(struct btrfs_root *root,
 				      unsigned long disk_size)
 {
-	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
 	return sizeof(struct compressed_bio) +
 		((disk_size + root->sectorsize - 1) / root->sectorsize) *
 		csum_size;
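
Note: the sizing above rounds disk_size up to whole sectors and reserves csum_size bytes of checksum space per sector. A standalone sketch of the same arithmetic with made-up example values (user-space C, not the kernel code):

#include <stdio.h>

/* ceil(disk_size / sectorsize) sectors, csum_size checksum bytes each */
static unsigned long csum_space(unsigned long disk_size,
				unsigned long sectorsize,
				unsigned int csum_size)
{
	return ((disk_size + sectorsize - 1) / sectorsize) * csum_size;
}

int main(void)
{
	/* hypothetical values: 10000 bytes over 4K sectors, 4-byte csums */
	printf("%lu\n", csum_space(10000, 4096, 4));	/* 3 sectors -> 12 */
	return 0;
}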
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8d..0fe615e4ea38 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -902,9 +902,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
-	pslot = path->slots[level + 1];
+		pslot = path->slots[level + 1];
+	}
 
 	/*
 	 * deal with the case where there is only one pointer in the root
@@ -1107,9 +1108,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	mid = path->nodes[level];
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
-	pslot = path->slots[level + 1];
+		pslot = path->slots[level + 1];
+	}
 
 	if (!parent)
 		return 1;
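
Both hunks close the same hole: pslot was previously assigned unconditionally, so at the top level (level == BTRFS_MAX_LEVEL - 1, where no parent exists) it read path->slots[] one past the last valid entry. A reduced illustration of the guarded form the fix adopts (hypothetical, simplified types):

#include <stddef.h>

#define MAX_LEVEL 8

struct path { void *nodes[MAX_LEVEL]; int slots[MAX_LEVEL]; };

/* read the parent node and slot only when a parent level actually exists */
static void get_parent(struct path *p, int level, void **parent, int *pslot)
{
	*parent = NULL;
	*pslot = 0;
	if (level < MAX_LEVEL - 1) {	/* braces keep both reads guarded */
		*parent = p->nodes[level + 1];
		*pslot = p->slots[level + 1];
	}
}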
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f49..b9ba59ff9292 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
 #include <linux/kobject.h>
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
+#include <linux/pagemap.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
 #define BTRFS_LABEL_SIZE 256
 
 /*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+	__le64 tree_root;
+	__le64 tree_root_gen;
+
+	__le64 chunk_root;
+	__le64 chunk_root_gen;
+
+	__le64 extent_root;
+	__le64 extent_root_gen;
+
+	__le64 fs_root;
+	__le64 fs_root_gen;
+
+	__le64 dev_root;
+	__le64 dev_root_gen;
+
+	__le64 csum_root;
+	__le64 csum_root_gen;
+
+	__le64 total_bytes;
+	__le64 bytes_used;
+	__le64 num_devices;
+	/* future */
+	__le64 unsed_64[4];
+
+	u8 tree_root_level;
+	u8 chunk_root_level;
+	u8 extent_root_level;
+	u8 fs_root_level;
+	u8 dev_root_level;
+	u8 csum_root_level;
+	/* future and to align */
+	u8 unused_8[10];
+} __attribute__ ((__packed__));
+
+/*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
  */
@@ -405,6 +447,7 @@ struct btrfs_super_block {
 	/* future expansion */
 	__le64 reserved[31];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 } __attribute__ ((__packed__));
 
 /*
@@ -772,14 +815,8 @@ struct btrfs_space_info {
 struct btrfs_block_rsv {
 	u64 size;
 	u64 reserved;
-	u64 freed[2];
 	struct btrfs_space_info *space_info;
-	struct list_head list;
 	spinlock_t lock;
-	atomic_t usage;
-	unsigned int priority:8;
-	unsigned int durable:1;
-	unsigned int refill_used:1;
 	unsigned int full:1;
 };
 
@@ -840,10 +877,10 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
-	u64 reserved_pinned;
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
+	u64 cache_generation;
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -899,6 +936,10 @@ struct btrfs_fs_info {
 	spinlock_t block_group_cache_lock;
 	struct rb_root block_group_cache_tree;
 
+	/* keep track of unallocated space */
+	spinlock_t free_chunk_lock;
+	u64 free_chunk_space;
+
 	struct extent_io_tree freed_extents[2];
 	struct extent_io_tree *pinned_extents;
 
@@ -916,14 +957,11 @@ struct btrfs_fs_info {
 	struct btrfs_block_rsv trans_block_rsv;
 	/* block reservation for chunk tree */
 	struct btrfs_block_rsv chunk_block_rsv;
+	/* block reservation for delayed operations */
+	struct btrfs_block_rsv delayed_block_rsv;
 
 	struct btrfs_block_rsv empty_block_rsv;
 
-	/* list of block reservations that cross multiple transactions */
-	struct list_head durable_block_rsv_list;
-
-	struct mutex durable_block_rsv_mutex;
-
 	u64 generation;
 	u64 last_trans_committed;
 
@@ -942,8 +980,8 @@ struct btrfs_fs_info {
 	wait_queue_head_t transaction_blocked_wait;
 	wait_queue_head_t async_submit_wait;
 
-	struct btrfs_super_block super_copy;
-	struct btrfs_super_block super_for_commit;
+	struct btrfs_super_block *super_copy;
+	struct btrfs_super_block *super_for_commit;
 	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
@@ -1036,6 +1074,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_freespace_worker;
 	struct btrfs_workers submit_workers;
 	struct btrfs_workers caching_workers;
+	struct btrfs_workers readahead_workers;
 
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1158,13 @@ struct btrfs_fs_info {
 	u64 fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
+
+	/* readahead tree */
+	spinlock_t reada_lock;
+	struct radix_tree_root reada_tree;
+
+	/* next backup root to be overwritten */
+	int backup_root_index;
 };
 
 /*
@@ -1363,6 +1409,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_ENOSPC_DEBUG	(1 << 15)
 #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
+#define BTRFS_MOUNT_RECOVERY		(1 << 18)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1978,6 +2025,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
 	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
 }
 
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+		   tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+		   tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+		   tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+		   chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+		   chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+		   chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+		   extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+		   extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+		   extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+		   fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+		   fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+		   fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+		   dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+		   dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+		   dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+		   csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+		   csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+		   csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+		   total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+		   bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+		   num_devices, 64);
+
 /* struct btrfs_super_block */
 
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2129,6 +2225,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
 }
 
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+	return mapping_gfp_mask(mapping) & ~__GFP_FS;
+}
+
 /* extent-tree.c */
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 						 unsigned num_items)
@@ -2137,6 +2238,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 		3 * num_items;
 }
 
+/*
+ * Doing a truncate won't result in new nodes or leaves, just what we need for
+ * COW.
+ */
+static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
+						 unsigned num_items)
+{
+	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+		num_items;
+}
+
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
@@ -2146,6 +2258,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    u64 bytenr, u64 num_bytes);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset, u64 bytenr);
@@ -2196,8 +2311,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
-				u64 num_bytes, int reserve, int sinfo);
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+				       u64 start, u64 len);
 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2240,25 +2355,23 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
-				 struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
 			u64 num_bytes);
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
+int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
+				struct btrfs_block_rsv *block_rsv,
+				u64 num_bytes);
+int btrfs_block_rsv_check(struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
-			  u64 min_reserved, int min_factor);
+			  u64 min_reserved);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
 void btrfs_block_rsv_release(struct btrfs_root *root,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes);
-int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
-				    struct btrfs_block_rsv *rsv);
 int btrfs_set_block_group_ro(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2379,6 +2492,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 	smp_mb();
 	return fs_info->closing;
 }
+static inline void free_fs_info(struct btrfs_fs_info *fs_info)
+{
+	kfree(fs_info->delayed_root);
+	kfree(fs_info->extent_root);
+	kfree(fs_info->tree_root);
+	kfree(fs_info->chunk_root);
+	kfree(fs_info->dev_root);
+	kfree(fs_info->csum_root);
+	kfree(fs_info->super_copy);
+	kfree(fs_info->super_for_commit);
+	kfree(fs_info);
+}
 
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2579,11 +2704,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_cleanup(struct btrfs_root *root);
-void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
-			       struct btrfs_pending_snapshot *pending,
-			       u64 *bytes_to_reserve);
-void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending);
 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@ -2697,4 +2817,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 			 struct btrfs_scrub_progress *progress);
 
+/* reada.c */
+struct reada_control {
+	struct btrfs_root	*root;		/* tree to prefetch */
+	struct btrfs_key	key_start;
+	struct btrfs_key	key_end;	/* exclusive */
+	atomic_t		elems;
+	struct kref		refcnt;
+	wait_queue_head_t	wait;
+};
+struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+			      struct btrfs_key *start, struct btrfs_key *end);
+int btrfs_reada_wait(void *handle);
+void btrfs_reada_detach(void *handle);
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+			 u64 start, int err);
+
 #endif
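
The backup_root_index field added to btrfs_fs_info above is a cursor into the BTRFS_NUM_BACKUP_ROOTS-entry super_roots ring: each commit overwrites the slot after the newest one, and recovery walks backwards from the newest. A small standalone model of that modular index arithmetic (illustrative only; the kernel versions live in disk-io.c below):

#include <stdio.h>

#define NUM_BACKUP_ROOTS 4	/* mirrors BTRFS_NUM_BACKUP_ROOTS */

static int next_slot(int newest)	/* slot the next commit overwrites */
{
	return (newest + 1) % NUM_BACKUP_ROOTS;
}

static int prev_slot(int slot)		/* next-older slot, as recovery retries use */
{
	return (slot + NUM_BACKUP_ROOTS - 1) % NUM_BACKUP_ROOTS;
}

int main(void)
{
	int s = 3;	/* pretend slot 3 holds the newest backup */

	printf("commit writes slot %d\n", next_slot(s));
	for (int i = 0; i < NUM_BACKUP_ROOTS; i++, s = prev_slot(s))
		printf("recovery try %d uses slot %d\n", i, s);
	return 0;
}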
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b52c672f4c18..bbe8496d5339 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 		return 0;
 
 	src_rsv = trans->block_rsv;
-	dst_rsv = &root->fs_info->global_block_rsv;
+	dst_rsv = &root->fs_info->delayed_block_rsv;
 
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
 	if (!item->bytes_reserved)
 		return;
 
-	rsv = &root->fs_info->global_block_rsv;
+	rsv = &root->fs_info->delayed_block_rsv;
 	btrfs_block_rsv_release(root, rsv,
 				item->bytes_reserved);
 }
@@ -624,13 +624,36 @@ static int btrfs_delayed_inode_reserve_metadata(
 	u64 num_bytes;
 	int ret;
 
-	if (!trans->bytes_reserved)
-		return 0;
-
 	src_rsv = trans->block_rsv;
-	dst_rsv = &root->fs_info->global_block_rsv;
+	dst_rsv = &root->fs_info->delayed_block_rsv;
 
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+	/*
+	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
+	 * which doesn't reserve space for speed. This is a problem since we
+	 * still need to reserve space for this update, so try to reserve the
+	 * space.
+	 *
+	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
+	 * we're accounted for.
+	 */
+	if (!trans->bytes_reserved &&
+	    src_rsv != &root->fs_info->delalloc_block_rsv) {
+		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		/*
+		 * Since we're under a transaction reserve_metadata_bytes could
+		 * try to commit the transaction which will make it return
+		 * EAGAIN to make us stop the transaction we have, so return
+		 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
+		 */
+		if (ret == -EAGAIN)
+			ret = -ENOSPC;
+		if (!ret)
+			node->bytes_reserved = num_bytes;
+		return ret;
+	}
+
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
 	if (!ret)
 		node->bytes_reserved = num_bytes;
@@ -646,7 +669,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
 	if (!node->bytes_reserved)
 		return;
 
-	rsv = &root->fs_info->global_block_rsv;
+	rsv = &root->fs_info->delayed_block_rsv;
 	btrfs_block_rsv_release(root, rsv,
 				node->bytes_reserved);
 	node->bytes_reserved = 0;
@@ -1026,7 +1049,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &root->fs_info->global_block_rsv;
+	trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
 	delayed_root = btrfs_get_delayed_root(root);
 
@@ -1069,7 +1092,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &node->root->fs_info->global_block_rsv;
+	trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
 
 	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
 	if (!ret)
@@ -1149,7 +1172,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 		goto free_path;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &root->fs_info->global_block_rsv;
+	trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
 	ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
 	if (!ret)
@@ -1686,11 +1709,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
 	}
 
 	ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
-	/*
-	 * we must reserve enough space when we start a new transaction,
-	 * so reserving metadata failure is impossible
-	 */
-	BUG_ON(ret);
+	if (ret)
+		goto release_node;
 
 	fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
 	delayed_node->inode_dirty = 1;
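
The delayed-inode hunks above all follow one pattern: metadata space now comes from the dedicated delayed_block_rsv, filled either by migrating bytes already reserved in the transaction's rsv or, on the unreserved btrfs_dirty_inode path, by a fresh no-flush reservation whose EAGAIN is reported as ENOSPC. A toy model of the migrate step only (plain C with a simplified rsv struct, not the kernel API):

#include <errno.h>
#include <stdio.h>

struct rsv { long reserved; };

/* move already-reserved bytes from src to dst, or fail with no side effects */
static int rsv_migrate(struct rsv *src, struct rsv *dst, long bytes)
{
	if (src->reserved < bytes)
		return -ENOSPC;
	src->reserved -= bytes;
	dst->reserved += bytes;
	return 0;
}

int main(void)
{
	struct rsv trans_rsv = { 4096 }, delayed_rsv = { 0 };

	if (rsv_migrate(&trans_rsv, &delayed_rsv, 4096) == 0)
		printf("delayed rsv holds %ld bytes\n", delayed_rsv.reserved);
	return 0;
}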
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dc0343802535..0eb1f0951251 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
 static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			   int verify)
 {
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	char *result = NULL;
 	unsigned long len;
 	unsigned long cur_len;
@@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 	while (1) {
-		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+		ret = read_extent_buffer_pages(io_tree, eb, start,
+					       WAIT_COMPLETE,
 					       btree_get_extent, mirror_num);
 		if (!ret &&
 		    !verify_parent_transid(io_tree, eb, parent_transid))
@@ -608,11 +608,47 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
 	end = eb->start + end - 1;
 err:
+	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+		btree_readahead_hook(root, eb, eb->start, ret);
+	}
+
 	free_extent_buffer(eb);
 out:
 	return ret;
 }
 
+static int btree_io_failed_hook(struct bio *failed_bio,
+			 struct page *page, u64 start, u64 end,
+			 u64 mirror_num, struct extent_state *state)
+{
+	struct extent_io_tree *tree;
+	unsigned long len;
+	struct extent_buffer *eb;
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	if (page->private == EXTENT_PAGE_PRIVATE)
+		goto out;
+	if (!page->private)
+		goto out;
+
+	len = page->private >> 2;
+	WARN_ON(len == 0);
+
+	eb = alloc_extent_buffer(tree, start, len, page);
+	if (eb == NULL)
+		goto out;
+
+	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+		btree_readahead_hook(root, eb, eb->start, -EIO);
+	}
+
+out:
+	return -EIO;	/* we fixed nothing */
+}
+
 static void end_workqueue_bio(struct bio *bio, int err)
 {
 	struct end_io_wq *end_io_wq = bio->bi_private;
@@ -974,11 +1010,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 	if (!buf)
 		return 0;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-				 buf, 0, 0, btree_get_extent, 0);
+				 buf, 0, WAIT_NONE, btree_get_extent, 0);
 	free_extent_buffer(buf);
 	return ret;
 }
 
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+			 int mirror_num, struct extent_buffer **eb)
+{
+	struct extent_buffer *buf = NULL;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
+	int ret;
+
+	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+	if (!buf)
+		return 0;
+
+	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
+
+	ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
+				       btree_get_extent, mirror_num);
+	if (ret) {
+		free_extent_buffer(buf);
+		return ret;
+	}
+
+	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+		free_extent_buffer(buf);
+		return -EIO;
+	} else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+		*eb = buf;
+	} else {
+		free_extent_buffer(buf);
+	}
+	return 0;
+}
+
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize)
 {
@@ -1135,10 +1203,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 
 	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+	root->commit_root = NULL;
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
 	if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
 		free_extent_buffer(root->node);
+		root->node = NULL;
 		return -EIO;
 	}
 	root->commit_root = btrfs_root_node(root);
@@ -1577,6 +1647,228 @@ sleep:
 	return 0;
 }
 
+/*
+ * this will find the highest generation in the array of
+ * root backups. The index of the highest array is returned,
+ * or -1 if we can't find anything.
+ *
+ * We check to make sure the array is valid by comparing the
+ * generation of the latest root in the array with the generation
+ * in the super block. If they don't match we pitch it.
+ */
+static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+{
+	u64 cur;
+	int newest_index = -1;
+	struct btrfs_root_backup *root_backup;
+	int i;
+
+	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+		root_backup = info->super_copy->super_roots + i;
+		cur = btrfs_backup_tree_root_gen(root_backup);
+		if (cur == newest_gen)
+			newest_index = i;
+	}
+
+	/* check to see if we actually wrapped around */
+	if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
+		root_backup = info->super_copy->super_roots;
+		cur = btrfs_backup_tree_root_gen(root_backup);
+		if (cur == newest_gen)
+			newest_index = 0;
+	}
+	return newest_index;
+}
+
+
+/*
+ * find the oldest backup so we know where to store new entries
+ * in the backup array. This will set the backup_root_index
+ * field in the fs_info struct
+ */
+static void find_oldest_super_backup(struct btrfs_fs_info *info,
+				     u64 newest_gen)
+{
+	int newest_index = -1;
+
+	newest_index = find_newest_super_backup(info, newest_gen);
+	/* if there was garbage in there, just move along */
+	if (newest_index == -1) {
+		info->backup_root_index = 0;
+	} else {
+		info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
+	}
+}
+
+/*
+ * copy all the root pointers into the super backup array.
+ * this will bump the backup pointer by one when it is
+ * done
+ */
+static void backup_super_roots(struct btrfs_fs_info *info)
+{
+	int next_backup;
+	struct btrfs_root_backup *root_backup;
+	int last_backup;
+
+	next_backup = info->backup_root_index;
+	last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
+		BTRFS_NUM_BACKUP_ROOTS;
+
+	/*
+	 * just overwrite the last backup if we're at the same generation
+	 * this happens only at umount
+	 */
+	root_backup = info->super_for_commit->super_roots + last_backup;
+	if (btrfs_backup_tree_root_gen(root_backup) ==
+	    btrfs_header_generation(info->tree_root->node))
+		next_backup = last_backup;
+
+	root_backup = info->super_for_commit->super_roots + next_backup;
+
+	/*
+	 * make sure all of our padding and empty slots get zero filled
+	 * regardless of which ones we use today
+	 */
+	memset(root_backup, 0, sizeof(*root_backup));
+
+	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
+
+	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
+	btrfs_set_backup_tree_root_gen(root_backup,
+			btrfs_header_generation(info->tree_root->node));
+
+	btrfs_set_backup_tree_root_level(root_backup,
+			btrfs_header_level(info->tree_root->node));
+
+	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
+	btrfs_set_backup_chunk_root_gen(root_backup,
+			btrfs_header_generation(info->chunk_root->node));
+	btrfs_set_backup_chunk_root_level(root_backup,
+			btrfs_header_level(info->chunk_root->node));
+
+	btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
+	btrfs_set_backup_extent_root_gen(root_backup,
+			btrfs_header_generation(info->extent_root->node));
+	btrfs_set_backup_extent_root_level(root_backup,
+			btrfs_header_level(info->extent_root->node));
+
+	btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start);
+	btrfs_set_backup_fs_root_gen(root_backup,
+			btrfs_header_generation(info->fs_root->node));
+	btrfs_set_backup_fs_root_level(root_backup,
+			btrfs_header_level(info->fs_root->node));
+
+	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
+	btrfs_set_backup_dev_root_gen(root_backup,
+			btrfs_header_generation(info->dev_root->node));
+	btrfs_set_backup_dev_root_level(root_backup,
+			btrfs_header_level(info->dev_root->node));
+
+	btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
+	btrfs_set_backup_csum_root_gen(root_backup,
+			btrfs_header_generation(info->csum_root->node));
+	btrfs_set_backup_csum_root_level(root_backup,
+			btrfs_header_level(info->csum_root->node));
+
+	btrfs_set_backup_total_bytes(root_backup,
+			btrfs_super_total_bytes(info->super_copy));
+	btrfs_set_backup_bytes_used(root_backup,
+			btrfs_super_bytes_used(info->super_copy));
+	btrfs_set_backup_num_devices(root_backup,
+			btrfs_super_num_devices(info->super_copy));
+
+	/*
+	 * if we don't copy this out to the super_copy, it won't get remembered
+	 * for the next commit
+	 */
+	memcpy(&info->super_copy->super_roots,
+	       &info->super_for_commit->super_roots,
+	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
+}
+
+/*
+ * this copies info out of the root backup array and back into
+ * the in-memory super block. It is meant to help iterate through
+ * the array, so you send it the number of backups you've already
+ * tried and the last backup index you used.
+ *
+ * this returns -1 when it has tried all the backups
+ */
+static noinline int next_root_backup(struct btrfs_fs_info *info,
+				     struct btrfs_super_block *super,
+				     int *num_backups_tried, int *backup_index)
+{
+	struct btrfs_root_backup *root_backup;
+	int newest = *backup_index;
+
+	if (*num_backups_tried == 0) {
+		u64 gen = btrfs_super_generation(super);
+
+		newest = find_newest_super_backup(info, gen);
+		if (newest == -1)
+			return -1;
+
+		*backup_index = newest;
+		*num_backups_tried = 1;
+	} else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
+		/* we've tried all the backups, all done */
+		return -1;
+	} else {
+		/* jump to the next oldest backup */
+		newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
+			BTRFS_NUM_BACKUP_ROOTS;
+		*backup_index = newest;
+		*num_backups_tried += 1;
+	}
+	root_backup = super->super_roots + newest;
+
+	btrfs_set_super_generation(super,
+			btrfs_backup_tree_root_gen(root_backup));
+	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
+	btrfs_set_super_root_level(super,
+			btrfs_backup_tree_root_level(root_backup));
+	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
+
+	/*
+	 * fixme: the total bytes and num_devices need to match or we should
+	 * need a fsck
+	 */
+	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
+	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
+	return 0;
+}
+
+/* helper to cleanup tree roots */
+static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+{
+	free_extent_buffer(info->tree_root->node);
+	free_extent_buffer(info->tree_root->commit_root);
+	free_extent_buffer(info->dev_root->node);
+	free_extent_buffer(info->dev_root->commit_root);
+	free_extent_buffer(info->extent_root->node);
+	free_extent_buffer(info->extent_root->commit_root);
+	free_extent_buffer(info->csum_root->node);
+	free_extent_buffer(info->csum_root->commit_root);
+
+	info->tree_root->node = NULL;
+	info->tree_root->commit_root = NULL;
+	info->dev_root->node = NULL;
+	info->dev_root->commit_root = NULL;
+	info->extent_root->node = NULL;
+	info->extent_root->commit_root = NULL;
+	info->csum_root->node = NULL;
+	info->csum_root->commit_root = NULL;
+
+	if (chunk_root) {
+		free_extent_buffer(info->chunk_root->node);
+		free_extent_buffer(info->chunk_root->commit_root);
+		info->chunk_root->node = NULL;
+		info->chunk_root->commit_root = NULL;
+	}
+}
+
+
 struct btrfs_root *open_ctree(struct super_block *sb,
 			      struct btrfs_fs_devices *fs_devices,
 			      char *options)
@@ -1604,6 +1896,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	int ret;
 	int err = -EINVAL;
+	int num_backups_tried = 0;
+	int backup_index = 0;
 
 	struct btrfs_super_block *disk_super;
 
@@ -1648,6 +1942,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
+	spin_lock_init(&fs_info->free_chunk_lock);
 	mutex_init(&fs_info->reloc_mutex);
 
 	init_completion(&fs_info->kobj_unregister);
@@ -1665,8 +1960,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
 	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
 	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
-	INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
-	mutex_init(&fs_info->durable_block_rsv_mutex);
+	btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
@@ -1677,6 +1971,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->metadata_ratio = 0;
 	fs_info->defrag_inodes = RB_ROOT;
 	fs_info->trans_no_join = 0;
+	fs_info->free_chunk_space = 0;
+
+	/* readahead state */
+	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+	spin_lock_init(&fs_info->reada_lock);
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
@@ -1766,14 +2065,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
-	memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
-	       sizeof(fs_info->super_for_commit));
+	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
+	memcpy(fs_info->super_for_commit, fs_info->super_copy,
+	       sizeof(*fs_info->super_for_commit));
 	brelse(bh);
 
-	memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
+	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
 
-	disk_super = &fs_info->super_copy;
+	disk_super = fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
 		goto fail_alloc;
 
@@ -1783,6 +2082,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
 
 	/*
+	 * run through our array of backup supers and setup
+	 * our ring pointer to the oldest one
+	 */
+	generation = btrfs_super_generation(disk_super);
+	find_oldest_super_backup(fs_info, generation);
+
+	/*
 	 * In the long term, we'll store the compression type in the super
 	 * block, and it'll be used for per file compression control.
 	 */
@@ -1870,6 +2176,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->readahead_workers, "readahead",
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 
 	/*
 	 * endios are largely parallel and should have a very
@@ -1880,6 +2189,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	fs_info->endio_write_workers.idle_thresh = 2;
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
+	fs_info->readahead_workers.idle_thresh = 2;
 
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->generic_worker, 1);
@@ -1893,6 +2203,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
 	btrfs_start_workers(&fs_info->delayed_workers, 1);
 	btrfs_start_workers(&fs_info->caching_workers, 1);
+	btrfs_start_workers(&fs_info->readahead_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1939,7 +2250,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
 		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
 		       sb->s_id);
-		goto fail_chunk_root;
+		goto fail_tree_roots;
 	}
 	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
 	chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1954,11 +2265,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (ret) {
 		printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
 		       sb->s_id);
-		goto fail_chunk_root;
+		goto fail_tree_roots;
 	}
 
 	btrfs_close_extra_devices(fs_devices);
 
+retry_root_backup:
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_root_level(disk_super));
 	generation = btrfs_super_generation(disk_super);
@@ -1966,32 +2278,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
 					  blocksize, generation);
-	if (!tree_root->node)
-		goto fail_chunk_root;
-	if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+	if (!tree_root->node ||
+	    !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
 		printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
 		       sb->s_id);
-		goto fail_tree_root;
+
+		goto recovery_tree_root;
 	}
+
 	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
 	tree_root->commit_root = btrfs_root_node(tree_root);
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
 	if (ret)
-		goto fail_tree_root;
+		goto recovery_tree_root;
 	extent_root->track_dirty = 1;
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
 	if (ret)
-		goto fail_extent_root;
+		goto recovery_tree_root;
 	dev_root->track_dirty = 1;
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
 	if (ret)
-		goto fail_dev_root;
+		goto recovery_tree_root;
 
 	csum_root->track_dirty = 1;
 
@@ -2124,20 +2437,10 @@ fail_cleaner:
 
 fail_block_groups:
 	btrfs_free_block_groups(fs_info);
-	free_extent_buffer(csum_root->node);
-	free_extent_buffer(csum_root->commit_root);
-fail_dev_root:
-	free_extent_buffer(dev_root->node);
-	free_extent_buffer(dev_root->commit_root);
-fail_extent_root:
-	free_extent_buffer(extent_root->node);
-	free_extent_buffer(extent_root->commit_root);
-fail_tree_root:
-	free_extent_buffer(tree_root->node);
-	free_extent_buffer(tree_root->commit_root);
-fail_chunk_root:
-	free_extent_buffer(chunk_root->node);
-	free_extent_buffer(chunk_root->commit_root);
+
+fail_tree_roots:
+	free_root_pointers(fs_info, 1);
+
 fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->generic_worker);
 	btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2152,7 +2455,6 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->delayed_workers);
 	btrfs_stop_workers(&fs_info->caching_workers);
 fail_alloc:
-	kfree(fs_info->delayed_root);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
@@ -2164,13 +2466,27 @@ fail_bdi:
2164fail_srcu: 2466fail_srcu:
2165 cleanup_srcu_struct(&fs_info->subvol_srcu); 2467 cleanup_srcu_struct(&fs_info->subvol_srcu);
2166fail: 2468fail:
2167 kfree(extent_root); 2469 free_fs_info(fs_info);
2168 kfree(tree_root);
2169 kfree(fs_info);
2170 kfree(chunk_root);
2171 kfree(dev_root);
2172 kfree(csum_root);
2173 return ERR_PTR(err); 2470 return ERR_PTR(err);
2471
2472recovery_tree_root:
2473
2474 if (!btrfs_test_opt(tree_root, RECOVERY))
2475 goto fail_tree_roots;
2476
2477 free_root_pointers(fs_info, 0);
2478
2479 /* don't use the log in recovery mode, it won't be valid */
2480 btrfs_set_super_log_root(disk_super, 0);
2481
2482 /* we can't trust the free space cache either */
2483 btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2484
2485 ret = next_root_backup(fs_info, fs_info->super_copy,
2486 &num_backups_tried, &backup_index);
2487 if (ret == -1)
2488 goto fail_block_groups;
2489 goto retry_root_backup;
2174} 2490}
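The recovery path above only runs when the filesystem is mounted with -o recovery: each failed attempt to read the tree roots invalidates the stale log and space cache, then steps to the next super-block backup. A rough sketch of the retry flow (read_tree_roots and roots_done are hypothetical stand-ins for the find_and_setup_root sequence above; next_root_backup and retry_root_backup are introduced by this patch):

    retry_root_backup:
    	ret = read_tree_roots(fs_info);		/* hypothetical wrapper */
    	if (!ret)
    		goto roots_done;		/* hypothetical success label */
    	if (!btrfs_test_opt(tree_root, RECOVERY))
    		goto fail_tree_roots;		/* no fallback without -o recovery */
    	free_root_pointers(fs_info, 0);
    	btrfs_set_super_log_root(disk_super, 0);	/* log tree is stale */
    	btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);	/* space cache too */
    	ret = next_root_backup(fs_info, fs_info->super_copy,
    			       &num_backups_tried, &backup_index);
    	if (ret == -1)				/* all backups exhausted */
    		goto fail_block_groups;
    	goto retry_root_backup;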
2175 2491
2176static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2492static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2338,10 +2654,11 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2338 int total_errors = 0; 2654 int total_errors = 0;
2339 u64 flags; 2655 u64 flags;
2340 2656
2341 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 2657 max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
2342 do_barriers = !btrfs_test_opt(root, NOBARRIER); 2658 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2659 backup_super_roots(root->fs_info);
2343 2660
2344 sb = &root->fs_info->super_for_commit; 2661 sb = root->fs_info->super_for_commit;
2345 dev_item = &sb->dev_item; 2662 dev_item = &sb->dev_item;
2346 2663
2347 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2664 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
@@ -2545,8 +2862,6 @@ int close_ctree(struct btrfs_root *root)
2545 /* clear out the rbtree of defraggable inodes */ 2862 /* clear out the rbtree of defraggable inodes */
2546 btrfs_run_defrag_inodes(root->fs_info); 2863 btrfs_run_defrag_inodes(root->fs_info);
2547 2864
2548 btrfs_put_block_group_cache(fs_info);
2549
2550 /* 2865 /*
2551 * Here come 2 situations when btrfs is broken to flip readonly: 2866 * Here come 2 situations when btrfs is broken to flip readonly:
2552 * 2867 *
@@ -2572,6 +2887,8 @@ int close_ctree(struct btrfs_root *root)
2572 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2887 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2573 } 2888 }
2574 2889
2890 btrfs_put_block_group_cache(fs_info);
2891
2575 kthread_stop(root->fs_info->transaction_kthread); 2892 kthread_stop(root->fs_info->transaction_kthread);
2576 kthread_stop(root->fs_info->cleaner_kthread); 2893 kthread_stop(root->fs_info->cleaner_kthread);
2577 2894
@@ -2603,7 +2920,6 @@ int close_ctree(struct btrfs_root *root)
2603 del_fs_roots(fs_info); 2920 del_fs_roots(fs_info);
2604 2921
2605 iput(fs_info->btree_inode); 2922 iput(fs_info->btree_inode);
2606 kfree(fs_info->delayed_root);
2607 2923
2608 btrfs_stop_workers(&fs_info->generic_worker); 2924 btrfs_stop_workers(&fs_info->generic_worker);
2609 btrfs_stop_workers(&fs_info->fixup_workers); 2925 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2617,6 +2933,7 @@ int close_ctree(struct btrfs_root *root)
2617 btrfs_stop_workers(&fs_info->submit_workers); 2933 btrfs_stop_workers(&fs_info->submit_workers);
2618 btrfs_stop_workers(&fs_info->delayed_workers); 2934 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers); 2935 btrfs_stop_workers(&fs_info->caching_workers);
2936 btrfs_stop_workers(&fs_info->readahead_workers);
2620 2937
2621 btrfs_close_devices(fs_info->fs_devices); 2938 btrfs_close_devices(fs_info->fs_devices);
2622 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2939 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2624,12 +2941,7 @@ int close_ctree(struct btrfs_root *root)
2624 bdi_destroy(&fs_info->bdi); 2941 bdi_destroy(&fs_info->bdi);
2625 cleanup_srcu_struct(&fs_info->subvol_srcu); 2942 cleanup_srcu_struct(&fs_info->subvol_srcu);
2626 2943
2627 kfree(fs_info->extent_root); 2944 free_fs_info(fs_info);
2628 kfree(fs_info->tree_root);
2629 kfree(fs_info->chunk_root);
2630 kfree(fs_info->dev_root);
2631 kfree(fs_info->csum_root);
2632 kfree(fs_info);
2633 2945
2634 return 0; 2946 return 0;
2635} 2947}
@@ -2735,7 +3047,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2735 return ret; 3047 return ret;
2736} 3048}
2737 3049
2738int btree_lock_page_hook(struct page *page) 3050static int btree_lock_page_hook(struct page *page, void *data,
3051 void (*flush_fn)(void *))
2739{ 3052{
2740 struct inode *inode = page->mapping->host; 3053 struct inode *inode = page->mapping->host;
2741 struct btrfs_root *root = BTRFS_I(inode)->root; 3054 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +3065,10 @@ int btree_lock_page_hook(struct page *page)
2752 if (!eb) 3065 if (!eb)
2753 goto out; 3066 goto out;
2754 3067
2755 btrfs_tree_lock(eb); 3068 if (!btrfs_try_tree_write_lock(eb)) {
3069 flush_fn(data);
3070 btrfs_tree_lock(eb);
3071 }
2756 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3072 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2757 3073
2758 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3074 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +3083,10 @@ int btree_lock_page_hook(struct page *page)
2767 btrfs_tree_unlock(eb); 3083 btrfs_tree_unlock(eb);
2768 free_extent_buffer(eb); 3084 free_extent_buffer(eb);
2769out: 3085out:
2770 lock_page(page); 3086 if (!trylock_page(page)) {
3087 flush_fn(data);
3088 lock_page(page);
3089 }
2771 return 0; 3090 return 0;
2772} 3091}
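Both hunks above follow the same try-then-flush idiom: never block on a lock while the queued bios that may be holding it up sit unsubmitted. Distilled into one place (a sketch using only the calls visible above; flush_fn is whatever callback the caller registered):

    static void lock_eb_with_flush(struct extent_buffer *eb,
    			       void (*flush_fn)(void *), void *data)
    {
    	if (btrfs_try_tree_write_lock(eb))
    		return;		/* fast path: got the lock, nothing queued */
    	flush_fn(data);		/* push out the queued writes first */
    	btrfs_tree_lock(eb);	/* blocking is now safe from self-deadlock */
    }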
2773 3092
@@ -3123,6 +3442,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3123static struct extent_io_ops btree_extent_io_ops = { 3442static struct extent_io_ops btree_extent_io_ops = {
3124 .write_cache_pages_lock_hook = btree_lock_page_hook, 3443 .write_cache_pages_lock_hook = btree_lock_page_hook,
3125 .readpage_end_io_hook = btree_readpage_end_io_hook, 3444 .readpage_end_io_hook = btree_readpage_end_io_hook,
3445 .readpage_io_failed_hook = btree_io_failed_hook,
3126 .submit_bio_hook = btree_submit_bio_hook, 3446 .submit_bio_hook = btree_submit_bio_hook,
3127 /* note we're sharing with inode.c for the merge bio hook */ 3447 /* note we're sharing with inode.c for the merge bio hook */
3128 .merge_bio_hook = btrfs_merge_bio_hook, 3448 .merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67f..c99d0a8f13fa 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid); 40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid); 42 u64 parent_transid);
43int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
44 int mirror_num, struct extent_buffer **eb);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 45struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
83 struct btrfs_fs_info *fs_info); 85 struct btrfs_fs_info *fs_info);
84int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
85 struct btrfs_root *root); 87 struct btrfs_root *root);
86int btree_lock_page_hook(struct page *page);
87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_init_lockdep(void); 90void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 119f842c1d4f..18ea90c8943b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h>
26#include "compat.h" 27#include "compat.h"
27#include "hash.h" 28#include "hash.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -52,6 +53,21 @@ enum {
52 CHUNK_ALLOC_LIMITED = 2, 53 CHUNK_ALLOC_LIMITED = 2,
53}; 54};
54 55
56/*
57 * Control how reservations are dealt with.
58 *
59 * RESERVE_FREE - freeing a reservation.
60 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
61 * ENOSPC accounting
62 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
63 * bytes_may_use as the ENOSPC accounting is done elsewhere
64 */
65enum {
66 RESERVE_FREE = 0,
67 RESERVE_ALLOC = 1,
68 RESERVE_ALLOC_NO_ACCOUNT = 2,
69};
70
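A rough illustration of how the three modes pair up over an allocation's lifetime (btrfs_update_reserved_bytes is the static helper declared just below; cache stands for an already looked-up block group):

    /* allocator reserves space and moves the bytes out of bytes_may_use */
    ret = btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC);

    /* same, but the ENOSPC accounting already happened elsewhere */
    ret = btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC_NO_ACCOUNT);

    /* the reservation is dropped without the space ever hitting disk */
    btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_FREE);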
55static int update_block_group(struct btrfs_trans_handle *trans, 71static int update_block_group(struct btrfs_trans_handle *trans,
56 struct btrfs_root *root, 72 struct btrfs_root *root,
57 u64 bytenr, u64 num_bytes, int alloc); 73 u64 bytenr, u64 num_bytes, int alloc);
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
81 struct btrfs_key *key); 97 struct btrfs_key *key);
82static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 98static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
83 int dump_block_groups); 99 int dump_block_groups);
100static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
101 u64 num_bytes, int reserve);
84 102
85static noinline int 103static noinline int
86block_group_cache_done(struct btrfs_block_group_cache *cache) 104block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104 if (atomic_dec_and_test(&cache->count)) { 122 if (atomic_dec_and_test(&cache->count)) {
105 WARN_ON(cache->pinned > 0); 123 WARN_ON(cache->pinned > 0);
106 WARN_ON(cache->reserved > 0); 124 WARN_ON(cache->reserved > 0);
107 WARN_ON(cache->reserved_pinned > 0);
108 kfree(cache->free_space_ctl); 125 kfree(cache->free_space_ctl);
109 kfree(cache); 126 kfree(cache);
110 } 127 }
@@ -465,7 +482,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
465 * we likely hold important locks. 482 * we likely hold important locks.
466 */ 483 */
467 if (trans && (!trans->transaction->in_commit) && 484 if (trans && (!trans->transaction->in_commit) &&
468 (root && root != root->fs_info->tree_root)) { 485 (root && root != root->fs_info->tree_root) &&
486 btrfs_test_opt(root, SPACE_CACHE)) {
469 spin_lock(&cache->lock); 487 spin_lock(&cache->lock);
470 if (cache->cached != BTRFS_CACHE_NO) { 488 if (cache->cached != BTRFS_CACHE_NO) {
471 spin_unlock(&cache->lock); 489 spin_unlock(&cache->lock);
@@ -2700,6 +2718,13 @@ again:
2700 goto again; 2718 goto again;
2701 } 2719 }
2702 2720
 2721 /* We've already set up this transaction, go ahead and exit */
2722 if (block_group->cache_generation == trans->transid &&
2723 i_size_read(inode)) {
2724 dcs = BTRFS_DC_SETUP;
2725 goto out_put;
2726 }
2727
2703 /* 2728 /*
2704 * We want to set the generation to 0, that way if anything goes wrong 2729 * We want to set the generation to 0, that way if anything goes wrong
2705 * from here on out we know not to trust this cache when we load up next 2730 * from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2774,15 @@ again:
2749 if (!ret) 2774 if (!ret)
2750 dcs = BTRFS_DC_SETUP; 2775 dcs = BTRFS_DC_SETUP;
2751 btrfs_free_reserved_data_space(inode, num_pages); 2776 btrfs_free_reserved_data_space(inode, num_pages);
2777
2752out_put: 2778out_put:
2753 iput(inode); 2779 iput(inode);
2754out_free: 2780out_free:
2755 btrfs_release_path(path); 2781 btrfs_release_path(path);
2756out: 2782out:
2757 spin_lock(&block_group->lock); 2783 spin_lock(&block_group->lock);
2784 if (!ret)
2785 block_group->cache_generation = trans->transid;
2758 block_group->disk_cache_state = dcs; 2786 block_group->disk_cache_state = dcs;
2759 spin_unlock(&block_group->lock); 2787 spin_unlock(&block_group->lock);
2760 2788
@@ -3122,16 +3150,13 @@ commit_trans:
3122 return -ENOSPC; 3150 return -ENOSPC;
3123 } 3151 }
3124 data_sinfo->bytes_may_use += bytes; 3152 data_sinfo->bytes_may_use += bytes;
3125 BTRFS_I(inode)->reserved_bytes += bytes;
3126 spin_unlock(&data_sinfo->lock); 3153 spin_unlock(&data_sinfo->lock);
3127 3154
3128 return 0; 3155 return 0;
3129} 3156}
3130 3157
3131/* 3158/*
3132 * called when we are clearing an delalloc extent from the 3159 * Called if we need to clear a data reservation for this inode.
3133 * inode's io_tree or there was an error for whatever reason
3134 * after calling btrfs_check_data_free_space
3135 */ 3160 */
3136void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3161void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3137{ 3162{
@@ -3144,7 +3169,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3144 data_sinfo = BTRFS_I(inode)->space_info; 3169 data_sinfo = BTRFS_I(inode)->space_info;
3145 spin_lock(&data_sinfo->lock); 3170 spin_lock(&data_sinfo->lock);
3146 data_sinfo->bytes_may_use -= bytes; 3171 data_sinfo->bytes_may_use -= bytes;
3147 BTRFS_I(inode)->reserved_bytes -= bytes;
3148 spin_unlock(&data_sinfo->lock); 3172 spin_unlock(&data_sinfo->lock);
3149} 3173}
3150 3174
@@ -3165,6 +3189,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3165 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3189 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3166 int force) 3190 int force)
3167{ 3191{
3192 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3168 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3193 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3169 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3194 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3170 u64 thresh; 3195 u64 thresh;
@@ -3173,11 +3198,18 @@ static int should_alloc_chunk(struct btrfs_root *root,
3173 return 1; 3198 return 1;
3174 3199
3175 /* 3200 /*
3201 * We need to take into account the global rsv because for all intents
3202 * and purposes it's used space. Don't worry about locking the
3203 * global_rsv, it doesn't change except when the transaction commits.
3204 */
3205 num_allocated += global_rsv->size;
3206
3207 /*
3176 * in limited mode, we want to have some free space up to 3208 * in limited mode, we want to have some free space up to
3177 * about 1% of the FS size. 3209 * about 1% of the FS size.
3178 */ 3210 */
3179 if (force == CHUNK_ALLOC_LIMITED) { 3211 if (force == CHUNK_ALLOC_LIMITED) {
3180 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3212 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3181 thresh = max_t(u64, 64 * 1024 * 1024, 3213 thresh = max_t(u64, 64 * 1024 * 1024,
3182 div_factor_fine(thresh, 1)); 3214 div_factor_fine(thresh, 1));
3183 3215
@@ -3199,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3199 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) 3231 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3200 return 0; 3232 return 0;
3201 3233
3202 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3234 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3203 3235
3204 /* 256MB or 5% of the FS */ 3236 /* 256MB or 5% of the FS */
3205 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3237 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
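For readers unfamiliar with the helpers: div_factor_fine(x, n) is roughly n percent of x and div_factor(x, n) is n tenths of x, so on a hypothetical 1 TiB filesystem the two thresholds above work out as:

    u64 thresh = 1ULL << 40;			/* 1 TiB of total_bytes */
    u64 limited = max_t(u64, 64 * 1024 * 1024,
    		    div_factor_fine(thresh, 1));	/* ~10.2 GiB (1%) */
    u64 normal = max_t(u64, 256 * 1024 * 1024,
    		   div_factor_fine(thresh, 5));	/* ~51.2 GiB (5%) */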
@@ -3302,24 +3334,26 @@ out:
3302/* 3334/*
3303 * shrink metadata reservation for delalloc 3335 * shrink metadata reservation for delalloc
3304 */ 3336 */
3305static int shrink_delalloc(struct btrfs_trans_handle *trans, 3337static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3306 struct btrfs_root *root, u64 to_reclaim, int sync) 3338 bool wait_ordered)
3307{ 3339{
3308 struct btrfs_block_rsv *block_rsv; 3340 struct btrfs_block_rsv *block_rsv;
3309 struct btrfs_space_info *space_info; 3341 struct btrfs_space_info *space_info;
3342 struct btrfs_trans_handle *trans;
3310 u64 reserved; 3343 u64 reserved;
3311 u64 max_reclaim; 3344 u64 max_reclaim;
3312 u64 reclaimed = 0; 3345 u64 reclaimed = 0;
3313 long time_left; 3346 long time_left;
3314 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3347 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3315 int loops = 0; 3348 int loops = 0;
3316 unsigned long progress; 3349 unsigned long progress;
3317 3350
3351 trans = (struct btrfs_trans_handle *)current->journal_info;
3318 block_rsv = &root->fs_info->delalloc_block_rsv; 3352 block_rsv = &root->fs_info->delalloc_block_rsv;
3319 space_info = block_rsv->space_info; 3353 space_info = block_rsv->space_info;
3320 3354
3321 smp_mb(); 3355 smp_mb();
3322 reserved = space_info->bytes_reserved; 3356 reserved = space_info->bytes_may_use;
3323 progress = space_info->reservation_progress; 3357 progress = space_info->reservation_progress;
3324 3358
3325 if (reserved == 0) 3359 if (reserved == 0)
@@ -3334,7 +3368,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3334 } 3368 }
3335 3369
3336 max_reclaim = min(reserved, to_reclaim); 3370 max_reclaim = min(reserved, to_reclaim);
3337 3371 nr_pages = max_t(unsigned long, nr_pages,
3372 max_reclaim >> PAGE_CACHE_SHIFT);
3338 while (loops < 1024) { 3373 while (loops < 1024) {
3339 /* have the flusher threads jump in and do some IO */ 3374 /* have the flusher threads jump in and do some IO */
3340 smp_mb(); 3375 smp_mb();
@@ -3343,9 +3378,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3343 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3378 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3344 3379
3345 spin_lock(&space_info->lock); 3380 spin_lock(&space_info->lock);
3346 if (reserved > space_info->bytes_reserved) 3381 if (reserved > space_info->bytes_may_use)
3347 reclaimed += reserved - space_info->bytes_reserved; 3382 reclaimed += reserved - space_info->bytes_may_use;
3348 reserved = space_info->bytes_reserved; 3383 reserved = space_info->bytes_may_use;
3349 spin_unlock(&space_info->lock); 3384 spin_unlock(&space_info->lock);
3350 3385
3351 loops++; 3386 loops++;
@@ -3356,11 +3391,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 if (trans && trans->transaction->blocked) 3391 if (trans && trans->transaction->blocked)
3357 return -EAGAIN; 3392 return -EAGAIN;
3358 3393
3359 time_left = schedule_timeout_interruptible(1); 3394 if (wait_ordered && !trans) {
3395 btrfs_wait_ordered_extents(root, 0, 0);
3396 } else {
3397 time_left = schedule_timeout_interruptible(1);
3360 3398
3361 /* We were interrupted, exit */ 3399 /* We were interrupted, exit */
3362 if (time_left) 3400 if (time_left)
3363 break; 3401 break;
3402 }
3364 3403
3365 /* we've kicked the IO a few times, if anything has been freed, 3404 /* we've kicked the IO a few times, if anything has been freed,
3366 * exit. There is no sense in looping here for a long time 3405 * exit. There is no sense in looping here for a long time
@@ -3375,34 +3414,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3375 } 3414 }
3376 3415
3377 } 3416 }
3378 if (reclaimed >= to_reclaim && !trans) 3417
3379 btrfs_wait_ordered_extents(root, 0, 0);
3380 return reclaimed >= to_reclaim; 3418 return reclaimed >= to_reclaim;
3381} 3419}
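The nr_pages change above means the writeback kick now scales with the reclaim target instead of stopping at a fixed 2 MiB worth of pages. A worked example assuming 4 KiB pages (PAGE_CACHE_SHIFT == 12):

    unsigned long nr_pages = (2 * 1024 * 1024) >> 12;	/* floor: 512 pages */
    u64 max_reclaim = 8 * 1024 * 1024;			/* want 8 MiB back */
    nr_pages = max_t(unsigned long, nr_pages,
    		 max_reclaim >> 12);			/* -> 2048 pages */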
3382 3420
3383/* 3421/**
3384 * Retries tells us how many times we've called reserve_metadata_bytes. The 3422 * may_commit_transaction - possibly commit the transaction if it is safe to
3385 * idea is if this is the first call (retries == 0) then we will add to our 3423 * @root - the root we're allocating for
3386 * reserved count if we can't make the allocation in order to hold our place 3424 * @bytes - the number of bytes we want to reserve
3387 * while we go and try and free up space. That way for retries > 1 we don't try 3425 * @force - force the commit
3388 * and add space, we just check to see if the amount of unused space is >= the
3389 * total space, meaning that our reservation is valid.
3390 * 3426 *
3391 * However if we don't intend to retry this reservation, pass -1 as retries so 3427 * This will check to make sure that committing the transaction will actually
3392 * that it short circuits this logic. 3428 * get us somewhere and then commit the transaction if it does. Otherwise it
3429 * will return -ENOSPC.
3393 */ 3430 */
3394static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, 3431static int may_commit_transaction(struct btrfs_root *root,
3395 struct btrfs_root *root, 3432 struct btrfs_space_info *space_info,
3433 u64 bytes, int force)
3434{
3435 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3436 struct btrfs_trans_handle *trans;
3437
3438 trans = (struct btrfs_trans_handle *)current->journal_info;
3439 if (trans)
3440 return -EAGAIN;
3441
3442 if (force)
3443 goto commit;
3444
3445 /* See if there is enough pinned space to make this reservation */
3446 spin_lock(&space_info->lock);
3447 if (space_info->bytes_pinned >= bytes) {
3448 spin_unlock(&space_info->lock);
3449 goto commit;
3450 }
3451 spin_unlock(&space_info->lock);
3452
3453 /*
3454 * See if there is some space in the delayed insertion reservation for
3455 * this reservation.
3456 */
3457 if (space_info != delayed_rsv->space_info)
3458 return -ENOSPC;
3459
3460 spin_lock(&delayed_rsv->lock);
3461 if (delayed_rsv->size < bytes) {
3462 spin_unlock(&delayed_rsv->lock);
3463 return -ENOSPC;
3464 }
3465 spin_unlock(&delayed_rsv->lock);
3466
3467commit:
3468 trans = btrfs_join_transaction(root);
3469 if (IS_ERR(trans))
3470 return -ENOSPC;
3471
3472 return btrfs_commit_transaction(trans, root);
3473}
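Both call sites this patch adds use the helper the same way, condensed here for reference (space_info and orig_bytes as in reserve_metadata_bytes below):

    /* plenty of pinned space: force the commit to reclaim it */
    ret = may_commit_transaction(root, space_info, orig_bytes, 1);

    /* last resort after flushing failed; only ever tried once */
    ret = may_commit_transaction(root, space_info, orig_bytes, 0);
    if (!ret)
    	committed = true;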
3474
3475/**
3476 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3477 * @root - the root we're allocating for
3478 * @block_rsv - the block_rsv we're allocating for
3479 * @orig_bytes - the number of bytes we want
 3480 * @flush - whether or not we can flush to make our reservation
3481 *
 3482 * This will reserve orig_bytes bytes from the space info associated
3483 * with the block_rsv. If there is not enough space it will make an attempt to
3484 * flush out space to make room. It will do this by flushing delalloc if
3485 * possible or committing the transaction. If flush is 0 then no attempts to
3486 * regain reservations will be made and this will fail if there is not enough
3487 * space already.
3488 */
3489static int reserve_metadata_bytes(struct btrfs_root *root,
3396 struct btrfs_block_rsv *block_rsv, 3490 struct btrfs_block_rsv *block_rsv,
3397 u64 orig_bytes, int flush) 3491 u64 orig_bytes, int flush)
3398{ 3492{
3399 struct btrfs_space_info *space_info = block_rsv->space_info; 3493 struct btrfs_space_info *space_info = block_rsv->space_info;
3400 u64 unused; 3494 u64 used;
3401 u64 num_bytes = orig_bytes; 3495 u64 num_bytes = orig_bytes;
3402 int retries = 0; 3496 int retries = 0;
3403 int ret = 0; 3497 int ret = 0;
3404 bool committed = false; 3498 bool committed = false;
3405 bool flushing = false; 3499 bool flushing = false;
3500 bool wait_ordered = false;
3406 3501
3407again: 3502again:
3408 ret = 0; 3503 ret = 0;
@@ -3419,7 +3514,7 @@ again:
3419 * deadlock since we are waiting for the flusher to finish, but 3514 * deadlock since we are waiting for the flusher to finish, but
3420 * hold the current transaction open. 3515 * hold the current transaction open.
3421 */ 3516 */
3422 if (trans) 3517 if (current->journal_info)
3423 return -EAGAIN; 3518 return -EAGAIN;
3424 ret = wait_event_interruptible(space_info->wait, 3519 ret = wait_event_interruptible(space_info->wait,
3425 !space_info->flush); 3520 !space_info->flush);
@@ -3431,9 +3526,9 @@ again:
3431 } 3526 }
3432 3527
3433 ret = -ENOSPC; 3528 ret = -ENOSPC;
3434 unused = space_info->bytes_used + space_info->bytes_reserved + 3529 used = space_info->bytes_used + space_info->bytes_reserved +
3435 space_info->bytes_pinned + space_info->bytes_readonly + 3530 space_info->bytes_pinned + space_info->bytes_readonly +
3436 space_info->bytes_may_use; 3531 space_info->bytes_may_use;
3437 3532
3438 /* 3533 /*
3439 * The idea here is that we've not already over-reserved the block group 3534 * The idea here is that we've not already over-reserved the block group
@@ -3442,10 +3537,9 @@ again:
3442 * lets start flushing stuff first and then come back and try to make 3537 * lets start flushing stuff first and then come back and try to make
3443 * our reservation. 3538 * our reservation.
3444 */ 3539 */
3445 if (unused <= space_info->total_bytes) { 3540 if (used <= space_info->total_bytes) {
3446 unused = space_info->total_bytes - unused; 3541 if (used + orig_bytes <= space_info->total_bytes) {
3447 if (unused >= num_bytes) { 3542 space_info->bytes_may_use += orig_bytes;
3448 space_info->bytes_reserved += orig_bytes;
3449 ret = 0; 3543 ret = 0;
3450 } else { 3544 } else {
3451 /* 3545 /*
@@ -3461,10 +3555,64 @@ again:
3461 * amount plus the amount of bytes that we need for this 3555 * amount plus the amount of bytes that we need for this
3462 * reservation. 3556 * reservation.
3463 */ 3557 */
3464 num_bytes = unused - space_info->total_bytes + 3558 wait_ordered = true;
3559 num_bytes = used - space_info->total_bytes +
3465 (orig_bytes * (retries + 1)); 3560 (orig_bytes * (retries + 1));
3466 } 3561 }
3467 3562
3563 if (ret) {
3564 u64 profile = btrfs_get_alloc_profile(root, 0);
3565 u64 avail;
3566
3567 /*
3568 * If we have a lot of space that's pinned, don't bother doing
3569 * the overcommit dance yet and just commit the transaction.
3570 */
3571 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3572 do_div(avail, 10);
3573 if (space_info->bytes_pinned >= avail && flush && !committed) {
3574 space_info->flush = 1;
3575 flushing = true;
3576 spin_unlock(&space_info->lock);
3577 ret = may_commit_transaction(root, space_info,
3578 orig_bytes, 1);
3579 if (ret)
3580 goto out;
3581 committed = true;
3582 goto again;
3583 }
3584
3585 spin_lock(&root->fs_info->free_chunk_lock);
3586 avail = root->fs_info->free_chunk_space;
3587
3588 /*
3589 * If we have dup, raid1 or raid10 then only half of the free
3590 * space is actually useable.
 3591 * space is actually usable.
3592 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3593 BTRFS_BLOCK_GROUP_RAID1 |
3594 BTRFS_BLOCK_GROUP_RAID10))
3595 avail >>= 1;
3596
3597 /*
3598 * If we aren't flushing don't let us overcommit too much, say
3599 * 1/8th of the space. If we can flush, let it overcommit up to
3600 * 1/2 of the space.
3601 */
3602 if (flush)
3603 avail >>= 3;
3604 else
3605 avail >>= 1;
3606 spin_unlock(&root->fs_info->free_chunk_lock);
3607
3608 if (used + num_bytes < space_info->total_bytes + avail) {
3609 space_info->bytes_may_use += orig_bytes;
3610 ret = 0;
3611 } else {
3612 wait_ordered = true;
3613 }
3614 }
3615
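To make the shifts concrete, a worked example with hypothetical numbers:

    avail = 100ULL << 30;	/* 100 GiB of free chunk space */
    avail >>= 1;		/* 50 GiB: RAID1 mirrors every byte */
    /* the flush-dependent shift then leaves either          */
    /*   50 GiB >> 3 = 6.25 GiB or 50 GiB >> 1 = 25 GiB      */
    /* of overcommit allowance                               */

The reservation then overcommits only while used + num_bytes stays below total_bytes + avail.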
3468 /* 3616 /*
3469 * Couldn't make our reservation, save our place so while we're trying 3617 * Couldn't make our reservation, save our place so while we're trying
3470 * to reclaim space we can actually use it instead of somebody else 3618 * to reclaim space we can actually use it instead of somebody else
@@ -3484,7 +3632,7 @@ again:
3484 * We do synchronous shrinking since we don't actually unreserve 3632 * We do synchronous shrinking since we don't actually unreserve
3485 * metadata until after the IO is completed. 3633 * metadata until after the IO is completed.
3486 */ 3634 */
3487 ret = shrink_delalloc(trans, root, num_bytes, 1); 3635 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3488 if (ret < 0) 3636 if (ret < 0)
3489 goto out; 3637 goto out;
3490 3638
@@ -3496,35 +3644,17 @@ again:
3496 * so go back around and try again. 3644 * so go back around and try again.
3497 */ 3645 */
3498 if (retries < 2) { 3646 if (retries < 2) {
3647 wait_ordered = true;
3499 retries++; 3648 retries++;
3500 goto again; 3649 goto again;
3501 } 3650 }
3502 3651
3503 /*
3504 * Not enough space to be reclaimed, don't bother committing the
3505 * transaction.
3506 */
3507 spin_lock(&space_info->lock);
3508 if (space_info->bytes_pinned < orig_bytes)
3509 ret = -ENOSPC;
3510 spin_unlock(&space_info->lock);
3511 if (ret)
3512 goto out;
3513
3514 ret = -EAGAIN;
3515 if (trans)
3516 goto out;
3517
3518 ret = -ENOSPC; 3652 ret = -ENOSPC;
3519 if (committed) 3653 if (committed)
3520 goto out; 3654 goto out;
3521 3655
3522 trans = btrfs_join_transaction(root); 3656 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3523 if (IS_ERR(trans))
3524 goto out;
3525 ret = btrfs_commit_transaction(trans, root);
3526 if (!ret) { 3657 if (!ret) {
3527 trans = NULL;
3528 committed = true; 3658 committed = true;
3529 goto again; 3659 goto again;
3530 } 3660 }
@@ -3542,10 +3672,12 @@ out:
3542static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, 3672static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3543 struct btrfs_root *root) 3673 struct btrfs_root *root)
3544{ 3674{
3545 struct btrfs_block_rsv *block_rsv; 3675 struct btrfs_block_rsv *block_rsv = NULL;
3546 if (root->ref_cows) 3676
3677 if (root->ref_cows || root == root->fs_info->csum_root)
3547 block_rsv = trans->block_rsv; 3678 block_rsv = trans->block_rsv;
3548 else 3679
3680 if (!block_rsv)
3549 block_rsv = root->block_rsv; 3681 block_rsv = root->block_rsv;
3550 3682
3551 if (!block_rsv) 3683 if (!block_rsv)
@@ -3616,7 +3748,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3616 } 3748 }
3617 if (num_bytes) { 3749 if (num_bytes) {
3618 spin_lock(&space_info->lock); 3750 spin_lock(&space_info->lock);
3619 space_info->bytes_reserved -= num_bytes; 3751 space_info->bytes_may_use -= num_bytes;
3620 space_info->reservation_progress++; 3752 space_info->reservation_progress++;
3621 spin_unlock(&space_info->lock); 3753 spin_unlock(&space_info->lock);
3622 } 3754 }
@@ -3640,9 +3772,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3640{ 3772{
3641 memset(rsv, 0, sizeof(*rsv)); 3773 memset(rsv, 0, sizeof(*rsv));
3642 spin_lock_init(&rsv->lock); 3774 spin_lock_init(&rsv->lock);
3643 atomic_set(&rsv->usage, 1);
3644 rsv->priority = 6;
3645 INIT_LIST_HEAD(&rsv->list);
3646} 3775}
3647 3776
3648struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 3777struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3663,38 +3792,38 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3663void btrfs_free_block_rsv(struct btrfs_root *root, 3792void btrfs_free_block_rsv(struct btrfs_root *root,
3664 struct btrfs_block_rsv *rsv) 3793 struct btrfs_block_rsv *rsv)
3665{ 3794{
3666 if (rsv && atomic_dec_and_test(&rsv->usage)) { 3795 btrfs_block_rsv_release(root, rsv, (u64)-1);
3667 btrfs_block_rsv_release(root, rsv, (u64)-1); 3796 kfree(rsv);
3668 if (!rsv->durable)
3669 kfree(rsv);
3670 }
3671} 3797}
3672 3798
3673/* 3799int btrfs_block_rsv_add(struct btrfs_root *root,
3674 * make the block_rsv struct be able to capture freed space. 3800 struct btrfs_block_rsv *block_rsv,
3675 * the captured space will re-add to the the block_rsv struct 3801 u64 num_bytes)
3676 * after transaction commit
3677 */
3678void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3679 struct btrfs_block_rsv *block_rsv)
3680{ 3802{
3681 block_rsv->durable = 1; 3803 int ret;
3682 mutex_lock(&fs_info->durable_block_rsv_mutex); 3804
3683 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); 3805 if (num_bytes == 0)
3684 mutex_unlock(&fs_info->durable_block_rsv_mutex); 3806 return 0;
3807
3808 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
3809 if (!ret) {
3810 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3811 return 0;
3812 }
3813
3814 return ret;
3685} 3815}
3686 3816
3687int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3817int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3688 struct btrfs_root *root, 3818 struct btrfs_block_rsv *block_rsv,
3689 struct btrfs_block_rsv *block_rsv, 3819 u64 num_bytes)
3690 u64 num_bytes)
3691{ 3820{
3692 int ret; 3821 int ret;
3693 3822
3694 if (num_bytes == 0) 3823 if (num_bytes == 0)
3695 return 0; 3824 return 0;
3696 3825
3697 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); 3826 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
3698 if (!ret) { 3827 if (!ret) {
3699 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3828 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3700 return 0; 3829 return 0;
@@ -3703,55 +3832,52 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3703 return ret; 3832 return ret;
3704} 3833}
3705 3834
3706int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 3835int btrfs_block_rsv_check(struct btrfs_root *root,
3707 struct btrfs_root *root, 3836 struct btrfs_block_rsv *block_rsv, int min_factor)
3708 struct btrfs_block_rsv *block_rsv,
3709 u64 min_reserved, int min_factor)
3710{ 3837{
3711 u64 num_bytes = 0; 3838 u64 num_bytes = 0;
3712 int commit_trans = 0;
3713 int ret = -ENOSPC; 3839 int ret = -ENOSPC;
3714 3840
3715 if (!block_rsv) 3841 if (!block_rsv)
3716 return 0; 3842 return 0;
3717 3843
3718 spin_lock(&block_rsv->lock); 3844 spin_lock(&block_rsv->lock);
3719 if (min_factor > 0) 3845 num_bytes = div_factor(block_rsv->size, min_factor);
3720 num_bytes = div_factor(block_rsv->size, min_factor); 3846 if (block_rsv->reserved >= num_bytes)
3721 if (min_reserved > num_bytes) 3847 ret = 0;
3722 num_bytes = min_reserved; 3848 spin_unlock(&block_rsv->lock);
3723 3849
3724 if (block_rsv->reserved >= num_bytes) { 3850 return ret;
3851}
3852
3853int btrfs_block_rsv_refill(struct btrfs_root *root,
3854 struct btrfs_block_rsv *block_rsv,
3855 u64 min_reserved)
3856{
3857 u64 num_bytes = 0;
3858 int ret = -ENOSPC;
3859
3860 if (!block_rsv)
3861 return 0;
3862
3863 spin_lock(&block_rsv->lock);
3864 num_bytes = min_reserved;
3865 if (block_rsv->reserved >= num_bytes)
3725 ret = 0; 3866 ret = 0;
3726 } else { 3867 else
3727 num_bytes -= block_rsv->reserved; 3868 num_bytes -= block_rsv->reserved;
3728 if (block_rsv->durable &&
3729 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3730 commit_trans = 1;
3731 }
3732 spin_unlock(&block_rsv->lock); 3869 spin_unlock(&block_rsv->lock);
3870
3733 if (!ret) 3871 if (!ret)
3734 return 0; 3872 return 0;
3735 3873
3736 if (block_rsv->refill_used) { 3874 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
3737 ret = reserve_metadata_bytes(trans, root, block_rsv, 3875 if (!ret) {
3738 num_bytes, 0); 3876 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3739 if (!ret) {
3740 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3741 return 0;
3742 }
3743 }
3744
3745 if (commit_trans) {
3746 if (trans)
3747 return -EAGAIN;
3748 trans = btrfs_join_transaction(root);
3749 BUG_ON(IS_ERR(trans));
3750 ret = btrfs_commit_transaction(trans, root);
3751 return 0; 3877 return 0;
3752 } 3878 }
3753 3879
3754 return -ENOSPC; 3880 return ret;
3755} 3881}
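After this split the two entry points have distinct jobs: btrfs_block_rsv_check is a passive watermark test, while btrfs_block_rsv_refill actively reserves. Roughly (min_reserved is a caller-chosen target):

    /* passive: returns 0 while the rsv is still at least 80% full */
    if (btrfs_block_rsv_check(root, block_rsv, 8))
    	/* below the watermark; the caller decides how to react */;

    /* active: tops the rsv back up to min_reserved, flushing if needed */
    ret = btrfs_block_rsv_refill(root, block_rsv, min_reserved);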
3756 3882
3757int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3883int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
@@ -3783,7 +3909,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3783 u64 num_bytes; 3909 u64 num_bytes;
3784 u64 meta_used; 3910 u64 meta_used;
3785 u64 data_used; 3911 u64 data_used;
3786 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3912 int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3787 3913
3788 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3914 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3789 spin_lock(&sinfo->lock); 3915 spin_lock(&sinfo->lock);
@@ -3827,12 +3953,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3827 if (sinfo->total_bytes > num_bytes) { 3953 if (sinfo->total_bytes > num_bytes) {
3828 num_bytes = sinfo->total_bytes - num_bytes; 3954 num_bytes = sinfo->total_bytes - num_bytes;
3829 block_rsv->reserved += num_bytes; 3955 block_rsv->reserved += num_bytes;
3830 sinfo->bytes_reserved += num_bytes; 3956 sinfo->bytes_may_use += num_bytes;
3831 } 3957 }
3832 3958
3833 if (block_rsv->reserved >= block_rsv->size) { 3959 if (block_rsv->reserved >= block_rsv->size) {
3834 num_bytes = block_rsv->reserved - block_rsv->size; 3960 num_bytes = block_rsv->reserved - block_rsv->size;
3835 sinfo->bytes_reserved -= num_bytes; 3961 sinfo->bytes_may_use -= num_bytes;
3836 sinfo->reservation_progress++; 3962 sinfo->reservation_progress++;
3837 block_rsv->reserved = block_rsv->size; 3963 block_rsv->reserved = block_rsv->size;
3838 block_rsv->full = 1; 3964 block_rsv->full = 1;
@@ -3848,16 +3974,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3848 3974
3849 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3975 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3850 fs_info->chunk_block_rsv.space_info = space_info; 3976 fs_info->chunk_block_rsv.space_info = space_info;
3851 fs_info->chunk_block_rsv.priority = 10;
3852 3977
3853 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 3978 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3854 fs_info->global_block_rsv.space_info = space_info; 3979 fs_info->global_block_rsv.space_info = space_info;
3855 fs_info->global_block_rsv.priority = 10;
3856 fs_info->global_block_rsv.refill_used = 1;
3857 fs_info->delalloc_block_rsv.space_info = space_info; 3980 fs_info->delalloc_block_rsv.space_info = space_info;
3858 fs_info->trans_block_rsv.space_info = space_info; 3981 fs_info->trans_block_rsv.space_info = space_info;
3859 fs_info->empty_block_rsv.space_info = space_info; 3982 fs_info->empty_block_rsv.space_info = space_info;
3860 fs_info->empty_block_rsv.priority = 10; 3983 fs_info->delayed_block_rsv.space_info = space_info;
3861 3984
3862 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 3985 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3863 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 3986 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3865,10 +3988,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3865 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 3988 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3866 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 3989 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3867 3990
3868 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3869
3870 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3871
3872 update_global_block_rsv(fs_info); 3991 update_global_block_rsv(fs_info);
3873} 3992}
3874 3993
@@ -3881,37 +4000,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3881 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4000 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3882 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4001 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3883 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4002 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3884} 4003 WARN_ON(fs_info->delayed_block_rsv.size > 0);
3885 4004 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
3886int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3887 struct btrfs_root *root,
3888 struct btrfs_block_rsv *rsv)
3889{
3890 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3891 u64 num_bytes;
3892 int ret;
3893
3894 /*
3895 * Truncate should be freeing data, but give us 2 items just in case it
3896 * needs to use some space. We may want to be smarter about this in the
3897 * future.
3898 */
3899 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3900
3901 /* We already have enough bytes, just return */
3902 if (rsv->reserved >= num_bytes)
3903 return 0;
3904
3905 num_bytes -= rsv->reserved;
3906
3907 /*
3908 * You should have reserved enough space before hand to do this, so this
3909 * should not fail.
3910 */
3911 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3912 BUG_ON(ret);
3913
3914 return 0;
3915} 4005}
3916 4006
3917void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4007void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3920,9 +4010,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3920 if (!trans->bytes_reserved) 4010 if (!trans->bytes_reserved)
3921 return; 4011 return;
3922 4012
3923 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); 4013 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
3924 btrfs_block_rsv_release(root, trans->block_rsv,
3925 trans->bytes_reserved);
3926 trans->bytes_reserved = 0; 4014 trans->bytes_reserved = 0;
3927} 4015}
3928 4016
@@ -3964,11 +4052,19 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3964 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4052 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3965} 4053}
3966 4054
4055/**
4056 * drop_outstanding_extent - drop an outstanding extent
4057 * @inode: the inode we're dropping the extent for
4058 *
 4059 * This is called when we are freeing up an outstanding extent, either
 4060 * after an error or after an extent is written. This will return the number of
4061 * reserved extents that need to be freed. This must be called with
4062 * BTRFS_I(inode)->lock held.
4063 */
3967static unsigned drop_outstanding_extent(struct inode *inode) 4064static unsigned drop_outstanding_extent(struct inode *inode)
3968{ 4065{
3969 unsigned dropped_extents = 0; 4066 unsigned dropped_extents = 0;
3970 4067
3971 spin_lock(&BTRFS_I(inode)->lock);
3972 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4068 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3973 BTRFS_I(inode)->outstanding_extents--; 4069 BTRFS_I(inode)->outstanding_extents--;
3974 4070
@@ -3978,19 +4074,70 @@ static unsigned drop_outstanding_extent(struct inode *inode)
3978 */ 4074 */
3979 if (BTRFS_I(inode)->outstanding_extents >= 4075 if (BTRFS_I(inode)->outstanding_extents >=
3980 BTRFS_I(inode)->reserved_extents) 4076 BTRFS_I(inode)->reserved_extents)
3981 goto out; 4077 return 0;
3982 4078
3983 dropped_extents = BTRFS_I(inode)->reserved_extents - 4079 dropped_extents = BTRFS_I(inode)->reserved_extents -
3984 BTRFS_I(inode)->outstanding_extents; 4080 BTRFS_I(inode)->outstanding_extents;
3985 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4081 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3986out:
3987 spin_unlock(&BTRFS_I(inode)->lock);
3988 return dropped_extents; 4082 return dropped_extents;
3989} 4083}
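A quick numeric trace of the helper: with outstanding_extents = 5 and reserved_extents = 5, completing one extent drops outstanding to 4; since 4 < 5 the function returns 5 - 4 = 1 and reserved_extents becomes 4, so the caller frees the metadata reserved for exactly one extent. Had reserved_extents still been 4 or less, it would return 0 and nothing would be freed.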
3990 4084
3991static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 4085/**
 4086 * calc_csum_metadata_size - return the amount of metadata space that must be
 4087 * reserved/freed for the given bytes.
4088 * @inode: the inode we're manipulating
4089 * @num_bytes: the number of bytes in question
4090 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4091 *
4092 * This adjusts the number of csum_bytes in the inode and then returns the
4093 * correct amount of metadata that must either be reserved or freed. We
4094 * calculate how many checksums we can fit into one leaf and then divide the
 4095 * number of bytes that will need to be checksummed by this value to figure out
4096 * how many checksums will be required. If we are adding bytes then the number
4097 * may go up and we will return the number of additional bytes that must be
4098 * reserved. If it is going down we will return the number of bytes that must
4099 * be freed.
4100 *
4101 * This must be called with BTRFS_I(inode)->lock held.
4102 */
4103static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4104 int reserve)
3992{ 4105{
3993 return num_bytes >>= 3; 4106 struct btrfs_root *root = BTRFS_I(inode)->root;
4107 u64 csum_size;
4108 int num_csums_per_leaf;
4109 int num_csums;
4110 int old_csums;
4111
4112 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4113 BTRFS_I(inode)->csum_bytes == 0)
4114 return 0;
4115
4116 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4117 if (reserve)
4118 BTRFS_I(inode)->csum_bytes += num_bytes;
4119 else
4120 BTRFS_I(inode)->csum_bytes -= num_bytes;
4121 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4122 num_csums_per_leaf = (int)div64_u64(csum_size,
4123 sizeof(struct btrfs_csum_item) +
4124 sizeof(struct btrfs_disk_key));
4125 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4126 num_csums = num_csums + num_csums_per_leaf - 1;
4127 num_csums = num_csums / num_csums_per_leaf;
4128
4129 old_csums = old_csums + num_csums_per_leaf - 1;
4130 old_csums = old_csums / num_csums_per_leaf;
4131
4132 /* No change, no need to reserve more */
4133 if (old_csums == num_csums)
4134 return 0;
4135
4136 if (reserve)
4137 return btrfs_calc_trans_metadata_size(root,
4138 num_csums - old_csums);
4139
4140 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
3994} 4141}
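A worked example of the csum math (the per-leaf capacity is hypothetical; the real figure depends on the leaf size): assume a 4 KiB sectorsize and 100 csum items per leaf. Reserving for 1 MiB of fresh data moves csum_bytes from 0 to 1 MiB, i.e. from 0 checksums to 256; rounding up per leaf, that is 0 leaves before and ceil(256/100) = 3 leaves after, so the function returns btrfs_calc_trans_metadata_size(root, 3). Freeing the same range later runs the computation in reverse and returns the matching amount to release.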
3995 4142
3996int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4143int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
@@ -3999,9 +4146,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3999 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4146 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4000 u64 to_reserve = 0; 4147 u64 to_reserve = 0;
4001 unsigned nr_extents = 0; 4148 unsigned nr_extents = 0;
4149 int flush = 1;
4002 int ret; 4150 int ret;
4003 4151
4004 if (btrfs_transaction_in_commit(root->fs_info)) 4152 if (btrfs_is_free_space_inode(root, inode))
4153 flush = 0;
4154
4155 if (flush && btrfs_transaction_in_commit(root->fs_info))
4005 schedule_timeout(1); 4156 schedule_timeout(1);
4006 4157
4007 num_bytes = ALIGN(num_bytes, root->sectorsize); 4158 num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4017,18 +4168,29 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4017 4168
4018 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4169 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4019 } 4170 }
4171 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4020 spin_unlock(&BTRFS_I(inode)->lock); 4172 spin_unlock(&BTRFS_I(inode)->lock);
4021 4173
4022 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4174 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4023 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4024 if (ret) { 4175 if (ret) {
4176 u64 to_free = 0;
4025 unsigned dropped; 4177 unsigned dropped;
4178
4179 spin_lock(&BTRFS_I(inode)->lock);
4180 dropped = drop_outstanding_extent(inode);
4181 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4182 spin_unlock(&BTRFS_I(inode)->lock);
4183 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4184
4026 /* 4185 /*
4027 * We don't need the return value since our reservation failed, 4186 * Somebody could have come in and twiddled with the
4028 * we just need to clean up our counter. 4187 * reservation, so if we have to free more than we would have
4188 * reserved from this reservation go ahead and release those
4189 * bytes.
4029 */ 4190 */
4030 dropped = drop_outstanding_extent(inode); 4191 to_free -= to_reserve;
4031 WARN_ON(dropped > 1); 4192 if (to_free)
4193 btrfs_block_rsv_release(root, block_rsv, to_free);
4032 return ret; 4194 return ret;
4033 } 4195 }
4034 4196
@@ -4037,6 +4199,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4037 return 0; 4199 return 0;
4038} 4200}
4039 4201
4202/**
4203 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4204 * @inode: the inode to release the reservation for
4205 * @num_bytes: the number of bytes we're releasing
4206 *
4207 * This will release the metadata reservation for an inode. This can be called
4208 * once we complete IO for a given set of bytes to release their metadata
4209 * reservations.
4210 */
4040void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4211void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4041{ 4212{
4042 struct btrfs_root *root = BTRFS_I(inode)->root; 4213 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4044,9 +4215,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4044 unsigned dropped; 4215 unsigned dropped;
4045 4216
4046 num_bytes = ALIGN(num_bytes, root->sectorsize); 4217 num_bytes = ALIGN(num_bytes, root->sectorsize);
4218 spin_lock(&BTRFS_I(inode)->lock);
4047 dropped = drop_outstanding_extent(inode); 4219 dropped = drop_outstanding_extent(inode);
4048 4220
4049 to_free = calc_csum_metadata_size(inode, num_bytes); 4221 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4222 spin_unlock(&BTRFS_I(inode)->lock);
4050 if (dropped > 0) 4223 if (dropped > 0)
4051 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4224 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4052 4225
@@ -4054,6 +4227,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4054 to_free); 4227 to_free);
4055} 4228}
4056 4229
4230/**
4231 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4232 * @inode: inode we're writing to
4233 * @num_bytes: the number of bytes we want to allocate
4234 *
 4235 * This will do the following things:
4236 *
4237 * o reserve space in the data space info for num_bytes
4238 * o reserve space in the metadata space info based on number of outstanding
4239 * extents and how much csums will be needed
 4240 * o add to the inode's ->delalloc_bytes
4241 * o add it to the fs_info's delalloc inodes list.
4242 *
4243 * This will return 0 for success and -ENOSPC if there is no space left.
4244 */
4057int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4245int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4058{ 4246{
4059 int ret; 4247 int ret;
@@ -4071,6 +4259,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4071 return 0; 4259 return 0;
4072} 4260}
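A sketch of how a write path is expected to pair the two calls (the copy step is hypothetical; btrfs_delalloc_release_space is documented just below):

    ret = btrfs_delalloc_reserve_space(inode, num_bytes);
    if (ret)
    	return ret;	/* -ENOSPC: nothing was reserved */

    ret = copy_into_page_cache(inode, buf, num_bytes);	/* hypothetical */
    if (ret)	/* error before the data became delalloc: give it all back */
    	btrfs_delalloc_release_space(inode, num_bytes);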
4073 4261
4262/**
4263 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4264 * @inode: inode we're releasing space for
4265 * @num_bytes: the number of bytes we want to free up
4266 *
4267 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
4268 * called in the case that we don't need the metadata AND data reservations
 4269 * anymore, for example if there is an error or we insert an inline extent.
4270 *
4271 * This function will release the metadata space that was not used and will
4272 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4273 * list if there are no delalloc bytes left.
4274 */
4074void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 4275void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4075{ 4276{
4076 btrfs_delalloc_release_metadata(inode, num_bytes); 4277 btrfs_delalloc_release_metadata(inode, num_bytes);
@@ -4090,12 +4291,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4090 4291
4091 /* block accounting for super block */ 4292 /* block accounting for super block */
4092 spin_lock(&info->delalloc_lock); 4293 spin_lock(&info->delalloc_lock);
4093 old_val = btrfs_super_bytes_used(&info->super_copy); 4294 old_val = btrfs_super_bytes_used(info->super_copy);
4094 if (alloc) 4295 if (alloc)
4095 old_val += num_bytes; 4296 old_val += num_bytes;
4096 else 4297 else
4097 old_val -= num_bytes; 4298 old_val -= num_bytes;
4098 btrfs_set_super_bytes_used(&info->super_copy, old_val); 4299 btrfs_set_super_bytes_used(info->super_copy, old_val);
4099 spin_unlock(&info->delalloc_lock); 4300 spin_unlock(&info->delalloc_lock);
4100 4301
4101 while (total) { 4302 while (total) {
@@ -4123,7 +4324,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4123 spin_lock(&cache->space_info->lock); 4324 spin_lock(&cache->space_info->lock);
4124 spin_lock(&cache->lock); 4325 spin_lock(&cache->lock);
4125 4326
4126 if (btrfs_super_cache_generation(&info->super_copy) != 0 && 4327 if (btrfs_test_opt(root, SPACE_CACHE) &&
4127 cache->disk_cache_state < BTRFS_DC_CLEAR) 4328 cache->disk_cache_state < BTRFS_DC_CLEAR)
4128 cache->disk_cache_state = BTRFS_DC_CLEAR; 4329 cache->disk_cache_state = BTRFS_DC_CLEAR;
4129 4330
@@ -4135,7 +4336,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4135 btrfs_set_block_group_used(&cache->item, old_val); 4336 btrfs_set_block_group_used(&cache->item, old_val);
4136 cache->reserved -= num_bytes; 4337 cache->reserved -= num_bytes;
4137 cache->space_info->bytes_reserved -= num_bytes; 4338 cache->space_info->bytes_reserved -= num_bytes;
4138 cache->space_info->reservation_progress++;
4139 cache->space_info->bytes_used += num_bytes; 4339 cache->space_info->bytes_used += num_bytes;
4140 cache->space_info->disk_used += num_bytes * factor; 4340 cache->space_info->disk_used += num_bytes * factor;
4141 spin_unlock(&cache->lock); 4341 spin_unlock(&cache->lock);
@@ -4187,7 +4387,6 @@ static int pin_down_extent(struct btrfs_root *root,
4187 if (reserved) { 4387 if (reserved) {
4188 cache->reserved -= num_bytes; 4388 cache->reserved -= num_bytes;
4189 cache->space_info->bytes_reserved -= num_bytes; 4389 cache->space_info->bytes_reserved -= num_bytes;
4190 cache->space_info->reservation_progress++;
4191 } 4390 }
4192 spin_unlock(&cache->lock); 4391 spin_unlock(&cache->lock);
4193 spin_unlock(&cache->space_info->lock); 4392 spin_unlock(&cache->space_info->lock);
@@ -4215,45 +4414,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
4215} 4414}
4216 4415
4217/* 4416/*
4218 * update size of reserved extents. this function may return -EAGAIN 4417 * this function must be called within a transaction
4219 * if 'reserve' is true or 'sinfo' is false. 4418 */
4419int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4420 struct btrfs_root *root,
4421 u64 bytenr, u64 num_bytes)
4422{
4423 struct btrfs_block_group_cache *cache;
4424
4425 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4426 BUG_ON(!cache);
4427
4428 /*
4429 * pull in the free space cache (if any) so that our pin
4430 * removes the free space from the cache. We have load_only set
4431 * to one because the slow code to read in the free extents does check
4432 * the pinned extents.
4433 */
4434 cache_block_group(cache, trans, root, 1);
4435
4436 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4437
4438 /* remove us from the free space cache (if we're there at all) */
4439 btrfs_remove_free_space(cache, bytenr, num_bytes);
4440 btrfs_put_block_group(cache);
4441 return 0;
4442}
4443
4444/**
4445 * btrfs_update_reserved_bytes - update the block_group and space info counters
4446 * @cache: The cache we are manipulating
4447 * @num_bytes: The number of bytes in question
4448 * @reserve: One of the reservation enums
4449 *
4450 * This is called by the allocator when it reserves space, or by somebody who is
4451 * freeing space that was never actually used on disk. For example if you
4452 * reserve some space for a new leaf in transaction A and before transaction A
4453 * commits you free that leaf, you call this with reserve set to 0 in order to
4454 * clear the reservation.
4455 *
4456 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4457 * ENOSPC accounting. For data we handle the reservation through clearing the
4458 * delalloc bits in the io_tree. We have to do this since we could end up
4459 * allocating less disk space for the amount of data we have reserved in the
4460 * case of compression.
4461 *
4462 * If this is a reservation and the block group has become read only we cannot
4463 * make the reservation and return -EAGAIN, otherwise this function always
4464 * succeeds.
4220 */ 4465 */
4221int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 4466static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4222 u64 num_bytes, int reserve, int sinfo) 4467 u64 num_bytes, int reserve)
4223{ 4468{
4469 struct btrfs_space_info *space_info = cache->space_info;
4224 int ret = 0; 4470 int ret = 0;
4225 if (sinfo) { 4471 spin_lock(&space_info->lock);
4226 struct btrfs_space_info *space_info = cache->space_info; 4472 spin_lock(&cache->lock);
4227 spin_lock(&space_info->lock); 4473 if (reserve != RESERVE_FREE) {
4228 spin_lock(&cache->lock);
4229 if (reserve) {
4230 if (cache->ro) {
4231 ret = -EAGAIN;
4232 } else {
4233 cache->reserved += num_bytes;
4234 space_info->bytes_reserved += num_bytes;
4235 }
4236 } else {
4237 if (cache->ro)
4238 space_info->bytes_readonly += num_bytes;
4239 cache->reserved -= num_bytes;
4240 space_info->bytes_reserved -= num_bytes;
4241 space_info->reservation_progress++;
4242 }
4243 spin_unlock(&cache->lock);
4244 spin_unlock(&space_info->lock);
4245 } else {
4246 spin_lock(&cache->lock);
4247 if (cache->ro) { 4474 if (cache->ro) {
4248 ret = -EAGAIN; 4475 ret = -EAGAIN;
4249 } else { 4476 } else {
4250 if (reserve) 4477 cache->reserved += num_bytes;
4251 cache->reserved += num_bytes; 4478 space_info->bytes_reserved += num_bytes;
4252 else 4479 if (reserve == RESERVE_ALLOC) {
4253 cache->reserved -= num_bytes; 4480 BUG_ON(space_info->bytes_may_use < num_bytes);
4481 space_info->bytes_may_use -= num_bytes;
4482 }
4254 } 4483 }
4255 spin_unlock(&cache->lock); 4484 } else {
4485 if (cache->ro)
4486 space_info->bytes_readonly += num_bytes;
4487 cache->reserved -= num_bytes;
4488 space_info->bytes_reserved -= num_bytes;
4489 space_info->reservation_progress++;
4256 } 4490 }
4491 spin_unlock(&cache->lock);
4492 spin_unlock(&space_info->lock);
4257 return ret; 4493 return ret;
4258} 4494}
4259 4495
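The kernel-doc above is dense, so as a reading aid here is a minimal userspace sketch of the same accounting. Everything except the RESERVE_* names and the branch structure is invented for illustration:

/* reserve_model.c - toy model of the btrfs_update_reserved_bytes() logic */
#include <stdio.h>
#include <stdint.h>

enum { RESERVE_FREE = 0, RESERVE_ALLOC = 1, RESERVE_ALLOC_NO_ACCOUNT = 2 };

struct space {
	uint64_t reserved;   /* bytes handed out by the allocator */
	uint64_t may_use;    /* bytes promised but not yet placed on disk */
	uint64_t readonly;   /* bytes stranded in read-only block groups */
	int ro;              /* block group went read-only */
};

static int update_reserved(struct space *s, uint64_t n, int reserve)
{
	if (reserve != RESERVE_FREE) {
		if (s->ro)
			return -1;          /* -EAGAIN in the kernel */
		s->reserved += n;
		if (reserve == RESERVE_ALLOC)
			s->may_use -= n;    /* promise becomes a reservation */
	} else {
		if (s->ro)
			s->readonly += n;
		s->reserved -= n;
	}
	return 0;
}

int main(void)
{
	struct space s = { .may_use = 4096 };

	update_reserved(&s, 4096, RESERVE_ALLOC);   /* allocator claims space */
	update_reserved(&s, 4096, RESERVE_FREE);    /* freed before commit */
	printf("reserved=%llu may_use=%llu\n",
	       (unsigned long long)s.reserved, (unsigned long long)s.may_use);
	return 0;
}

Note that RESERVE_ALLOC_NO_ACCOUNT takes the reserve branch but skips the bytes_may_use adjustment, which is why the data path (accounted through delalloc bits instead) selects it in find_free_extent() below.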
@@ -4319,13 +4555,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4319 spin_lock(&cache->lock); 4555 spin_lock(&cache->lock);
4320 cache->pinned -= len; 4556 cache->pinned -= len;
4321 cache->space_info->bytes_pinned -= len; 4557 cache->space_info->bytes_pinned -= len;
4322 if (cache->ro) { 4558 if (cache->ro)
4323 cache->space_info->bytes_readonly += len; 4559 cache->space_info->bytes_readonly += len;
4324 } else if (cache->reserved_pinned > 0) {
4325 len = min(len, cache->reserved_pinned);
4326 cache->reserved_pinned -= len;
4327 cache->space_info->bytes_reserved += len;
4328 }
4329 spin_unlock(&cache->lock); 4560 spin_unlock(&cache->lock);
4330 spin_unlock(&cache->space_info->lock); 4561 spin_unlock(&cache->space_info->lock);
4331 } 4562 }
@@ -4340,11 +4571,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4340{ 4571{
4341 struct btrfs_fs_info *fs_info = root->fs_info; 4572 struct btrfs_fs_info *fs_info = root->fs_info;
4342 struct extent_io_tree *unpin; 4573 struct extent_io_tree *unpin;
4343 struct btrfs_block_rsv *block_rsv;
4344 struct btrfs_block_rsv *next_rsv;
4345 u64 start; 4574 u64 start;
4346 u64 end; 4575 u64 end;
4347 int idx;
4348 int ret; 4576 int ret;
4349 4577
4350 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4578 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4367,30 +4595,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4367 cond_resched(); 4595 cond_resched();
4368 } 4596 }
4369 4597
4370 mutex_lock(&fs_info->durable_block_rsv_mutex);
4371 list_for_each_entry_safe(block_rsv, next_rsv,
4372 &fs_info->durable_block_rsv_list, list) {
4373
4374 idx = trans->transid & 0x1;
4375 if (block_rsv->freed[idx] > 0) {
4376 block_rsv_add_bytes(block_rsv,
4377 block_rsv->freed[idx], 0);
4378 block_rsv->freed[idx] = 0;
4379 }
4380 if (atomic_read(&block_rsv->usage) == 0) {
4381 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4382
4383 if (block_rsv->freed[0] == 0 &&
4384 block_rsv->freed[1] == 0) {
4385 list_del_init(&block_rsv->list);
4386 kfree(block_rsv);
4387 }
4388 } else {
4389 btrfs_block_rsv_release(root, block_rsv, 0);
4390 }
4391 }
4392 mutex_unlock(&fs_info->durable_block_rsv_mutex);
4393
4394 return 0; 4598 return 0;
4395} 4599}
4396 4600
@@ -4668,7 +4872,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4668 struct extent_buffer *buf, 4872 struct extent_buffer *buf,
4669 u64 parent, int last_ref) 4873 u64 parent, int last_ref)
4670{ 4874{
4671 struct btrfs_block_rsv *block_rsv;
4672 struct btrfs_block_group_cache *cache = NULL; 4875 struct btrfs_block_group_cache *cache = NULL;
4673 int ret; 4876 int ret;
4674 4877
@@ -4683,64 +4886,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4683 if (!last_ref) 4886 if (!last_ref)
4684 return; 4887 return;
4685 4888
4686 block_rsv = get_block_rsv(trans, root);
4687 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 4889 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4688 if (block_rsv->space_info != cache->space_info)
4689 goto out;
4690 4890
4691 if (btrfs_header_generation(buf) == trans->transid) { 4891 if (btrfs_header_generation(buf) == trans->transid) {
4692 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4892 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4693 ret = check_ref_cleanup(trans, root, buf->start); 4893 ret = check_ref_cleanup(trans, root, buf->start);
4694 if (!ret) 4894 if (!ret)
4695 goto pin; 4895 goto out;
4696 } 4896 }
4697 4897
4698 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 4898 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4699 pin_down_extent(root, cache, buf->start, buf->len, 1); 4899 pin_down_extent(root, cache, buf->start, buf->len, 1);
4700 goto pin; 4900 goto out;
4701 } 4901 }
4702 4902
4703 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4903 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4704 4904
4705 btrfs_add_free_space(cache, buf->start, buf->len); 4905 btrfs_add_free_space(cache, buf->start, buf->len);
4706 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); 4906 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4707 if (ret == -EAGAIN) {
4708 /* block group became read-only */
4709 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4710 goto out;
4711 }
4712
4713 ret = 1;
4714 spin_lock(&block_rsv->lock);
4715 if (block_rsv->reserved < block_rsv->size) {
4716 block_rsv->reserved += buf->len;
4717 ret = 0;
4718 }
4719 spin_unlock(&block_rsv->lock);
4720
4721 if (ret) {
4722 spin_lock(&cache->space_info->lock);
4723 cache->space_info->bytes_reserved -= buf->len;
4724 cache->space_info->reservation_progress++;
4725 spin_unlock(&cache->space_info->lock);
4726 }
4727 goto out;
4728 }
4729pin:
4730 if (block_rsv->durable && !cache->ro) {
4731 ret = 0;
4732 spin_lock(&cache->lock);
4733 if (!cache->ro) {
4734 cache->reserved_pinned += buf->len;
4735 ret = 1;
4736 }
4737 spin_unlock(&cache->lock);
4738
4739 if (ret) {
4740 spin_lock(&block_rsv->lock);
4741 block_rsv->freed[trans->transid & 0x1] += buf->len;
4742 spin_unlock(&block_rsv->lock);
4743 }
4744 } 4907 }
4745out: 4908out:
4746 /* 4909 /*
@@ -4883,10 +5046,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4883 int last_ptr_loop = 0; 5046 int last_ptr_loop = 0;
4884 int loop = 0; 5047 int loop = 0;
4885 int index = 0; 5048 int index = 0;
5049 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5050 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
4886 bool found_uncached_bg = false; 5051 bool found_uncached_bg = false;
4887 bool failed_cluster_refill = false; 5052 bool failed_cluster_refill = false;
4888 bool failed_alloc = false; 5053 bool failed_alloc = false;
4889 bool use_cluster = true; 5054 bool use_cluster = true;
5055 bool have_caching_bg = false;
4890 u64 ideal_cache_percent = 0; 5056 u64 ideal_cache_percent = 0;
4891 u64 ideal_cache_offset = 0; 5057 u64 ideal_cache_offset = 0;
4892 5058
@@ -4969,6 +5135,7 @@ ideal_cache:
4969 } 5135 }
4970 } 5136 }
4971search: 5137search:
5138 have_caching_bg = false;
4972 down_read(&space_info->groups_sem); 5139 down_read(&space_info->groups_sem);
4973 list_for_each_entry(block_group, &space_info->block_groups[index], 5140 list_for_each_entry(block_group, &space_info->block_groups[index],
4974 list) { 5141 list) {
@@ -5177,6 +5344,8 @@ refill_cluster:
5177 failed_alloc = true; 5344 failed_alloc = true;
5178 goto have_block_group; 5345 goto have_block_group;
5179 } else if (!offset) { 5346 } else if (!offset) {
5347 if (!cached)
5348 have_caching_bg = true;
5180 goto loop; 5349 goto loop;
5181 } 5350 }
5182checks: 5351checks:
@@ -5202,8 +5371,8 @@ checks:
5202 search_start - offset); 5371 search_start - offset);
5203 BUG_ON(offset > search_start); 5372 BUG_ON(offset > search_start);
5204 5373
5205 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, 5374 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
5206 (data & BTRFS_BLOCK_GROUP_DATA)); 5375 alloc_type);
5207 if (ret == -EAGAIN) { 5376 if (ret == -EAGAIN) {
5208 btrfs_add_free_space(block_group, offset, num_bytes); 5377 btrfs_add_free_space(block_group, offset, num_bytes);
5209 goto loop; 5378 goto loop;
@@ -5227,6 +5396,9 @@ loop:
5227 } 5396 }
5228 up_read(&space_info->groups_sem); 5397 up_read(&space_info->groups_sem);
5229 5398
5399 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5400 goto search;
5401
5230 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 5402 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5231 goto search; 5403 goto search;
5232 5404
@@ -5325,7 +5497,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5325 int index = 0; 5497 int index = 0;
5326 5498
5327 spin_lock(&info->lock); 5499 spin_lock(&info->lock);
5328 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5500 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5501 (unsigned long long)info->flags,
5329 (unsigned long long)(info->total_bytes - info->bytes_used - 5502 (unsigned long long)(info->total_bytes - info->bytes_used -
5330 info->bytes_pinned - info->bytes_reserved - 5503 info->bytes_pinned - info->bytes_reserved -
5331 info->bytes_readonly), 5504 info->bytes_readonly),
@@ -5411,7 +5584,8 @@ again:
5411 return ret; 5584 return ret;
5412} 5585}
5413 5586
5414int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 5587static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5588 u64 start, u64 len, int pin)
5415{ 5589{
5416 struct btrfs_block_group_cache *cache; 5590 struct btrfs_block_group_cache *cache;
5417 int ret = 0; 5591 int ret = 0;
@@ -5426,8 +5600,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5426 if (btrfs_test_opt(root, DISCARD)) 5600 if (btrfs_test_opt(root, DISCARD))
5427 ret = btrfs_discard_extent(root, start, len, NULL); 5601 ret = btrfs_discard_extent(root, start, len, NULL);
5428 5602
5429 btrfs_add_free_space(cache, start, len); 5603 if (pin)
5430 btrfs_update_reserved_bytes(cache, len, 0, 1); 5604 pin_down_extent(root, cache, start, len, 1);
5605 else {
5606 btrfs_add_free_space(cache, start, len);
5607 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5608 }
5431 btrfs_put_block_group(cache); 5609 btrfs_put_block_group(cache);
5432 5610
5433 trace_btrfs_reserved_extent_free(root, start, len); 5611 trace_btrfs_reserved_extent_free(root, start, len);
@@ -5435,6 +5613,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5435 return ret; 5613 return ret;
5436} 5614}
5437 5615
5616int btrfs_free_reserved_extent(struct btrfs_root *root,
5617 u64 start, u64 len)
5618{
5619 return __btrfs_free_reserved_extent(root, start, len, 0);
5620}
5621
5622int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5623 u64 start, u64 len)
5624{
5625 return __btrfs_free_reserved_extent(root, start, len, 1);
5626}
5627
5438static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5628static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5439 struct btrfs_root *root, 5629 struct btrfs_root *root,
5440 u64 parent, u64 root_objectid, 5630 u64 parent, u64 root_objectid,
@@ -5630,7 +5820,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5630 put_caching_control(caching_ctl); 5820 put_caching_control(caching_ctl);
5631 } 5821 }
5632 5822
5633 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); 5823 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5824 RESERVE_ALLOC_NO_ACCOUNT);
5634 BUG_ON(ret); 5825 BUG_ON(ret);
5635 btrfs_put_block_group(block_group); 5826 btrfs_put_block_group(block_group);
5636 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5827 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5687,8 +5878,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5687 block_rsv = get_block_rsv(trans, root); 5878 block_rsv = get_block_rsv(trans, root);
5688 5879
5689 if (block_rsv->size == 0) { 5880 if (block_rsv->size == 0) {
5690 ret = reserve_metadata_bytes(trans, root, block_rsv, 5881 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5691 blocksize, 0);
5692 /* 5882 /*
5693 * If we couldn't reserve metadata bytes try and use some from 5883 * If we couldn't reserve metadata bytes try and use some from
5694 * the global reserve. 5884 * the global reserve.
@@ -5708,13 +5898,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5708 if (!ret) 5898 if (!ret)
5709 return block_rsv; 5899 return block_rsv;
5710 if (ret) { 5900 if (ret) {
5711 WARN_ON(1); 5901 static DEFINE_RATELIMIT_STATE(_rs,
5712 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 5902 DEFAULT_RATELIMIT_INTERVAL,
5713 0); 5903 /*DEFAULT_RATELIMIT_BURST*/ 2);
5904 if (__ratelimit(&_rs)) {
5905 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5906 WARN_ON(1);
5907 }
5908 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5714 if (!ret) { 5909 if (!ret) {
5715 spin_lock(&block_rsv->lock);
5716 block_rsv->size += blocksize;
5717 spin_unlock(&block_rsv->lock);
5718 return block_rsv; 5910 return block_rsv;
5719 } else if (ret && block_rsv != global_rsv) { 5911 } else if (ret && block_rsv != global_rsv) {
5720 ret = block_rsv_use_bytes(global_rsv, blocksize); 5912 ret = block_rsv_use_bytes(global_rsv, blocksize);
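The replacement warning path above throttles its printk with a ratelimit state allowing a burst of 2 per interval. A standalone model of that throttle, using a fake integer clock rather than the kernel's __ratelimit():

#include <stdio.h>

struct rs { long begin; int printed; long interval; int burst; };

static int ratelimit(struct rs *r, long now)
{
	if (now - r->begin >= r->interval) {  /* new window: reset budget */
		r->begin = now;
		r->printed = 0;
	}
	if (r->printed >= r->burst)
		return 0;                     /* suppressed */
	r->printed++;
	return 1;                             /* allowed */
}

int main(void)
{
	struct rs r = { .interval = 5, .burst = 2 };

	for (long t = 0; t < 12; t++)
		if (ratelimit(&r, t))
			printf("t=%ld: block rsv warning emitted\n", t);
	return 0;
}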
@@ -6592,12 +6784,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6592 cache->bytes_super - btrfs_block_group_used(&cache->item); 6784 cache->bytes_super - btrfs_block_group_used(&cache->item);
6593 6785
6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6786 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6595 sinfo->bytes_may_use + sinfo->bytes_readonly + 6787 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <= 6788 min_allocable_bytes <= sinfo->total_bytes) {
6597 sinfo->total_bytes) {
6598 sinfo->bytes_readonly += num_bytes; 6789 sinfo->bytes_readonly += num_bytes;
6599 sinfo->bytes_reserved += cache->reserved_pinned;
6600 cache->reserved_pinned = 0;
6601 cache->ro = 1; 6790 cache->ro = 1;
6602 ret = 0; 6791 ret = 0;
6603 } 6792 }
@@ -6964,7 +7153,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6964 struct btrfs_space_info, 7153 struct btrfs_space_info,
6965 list); 7154 list);
6966 if (space_info->bytes_pinned > 0 || 7155 if (space_info->bytes_pinned > 0 ||
6967 space_info->bytes_reserved > 0) { 7156 space_info->bytes_reserved > 0 ||
7157 space_info->bytes_may_use > 0) {
6968 WARN_ON(1); 7158 WARN_ON(1);
6969 dump_space_info(space_info, 0, 0); 7159 dump_space_info(space_info, 0, 0);
6970 } 7160 }
@@ -7006,14 +7196,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7006 return -ENOMEM; 7196 return -ENOMEM;
7007 path->reada = 1; 7197 path->reada = 1;
7008 7198
7009 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 7199 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7010 if (cache_gen != 0 && 7200 if (btrfs_test_opt(root, SPACE_CACHE) &&
7011 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) 7201 btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7012 need_clear = 1; 7202 need_clear = 1;
7013 if (btrfs_test_opt(root, CLEAR_CACHE)) 7203 if (btrfs_test_opt(root, CLEAR_CACHE))
7014 need_clear = 1; 7204 need_clear = 1;
7015 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
7016 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7017 7205
7018 while (1) { 7206 while (1) {
7019 ret = find_first_block_group(root, path, &key); 7207 ret = find_first_block_group(root, path, &key);
@@ -7252,7 +7440,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7252 goto out; 7440 goto out;
7253 } 7441 }
7254 7442
7255 inode = lookup_free_space_inode(root, block_group, path); 7443 inode = lookup_free_space_inode(tree_root, block_group, path);
7256 if (!IS_ERR(inode)) { 7444 if (!IS_ERR(inode)) {
7257 ret = btrfs_orphan_add(trans, inode); 7445 ret = btrfs_orphan_add(trans, inode);
7258 BUG_ON(ret); 7446 BUG_ON(ret);
@@ -7268,7 +7456,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7268 spin_unlock(&block_group->lock); 7456 spin_unlock(&block_group->lock);
7269 } 7457 }
7270 /* One for our lookup ref */ 7458 /* One for our lookup ref */
7271 iput(inode); 7459 btrfs_add_delayed_iput(inode);
7272 } 7460 }
7273 7461
7274 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 7462 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7339,7 +7527,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7339 int mixed = 0; 7527 int mixed = 0;
7340 int ret; 7528 int ret;
7341 7529
7342 disk_super = &fs_info->super_copy; 7530 disk_super = fs_info->super_copy;
7343 if (!btrfs_super_root(disk_super)) 7531 if (!btrfs_super_root(disk_super))
7344 return 1; 7532 return 1;
7345 7533
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 624ef10d36cc..1f87c4d0e7a0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -895,6 +895,194 @@ search_again:
895 goto again; 895 goto again;
896} 896}
897 897
898/**
 899 * convert_extent_bit - convert all bits in a given range from one bit to another
900 * @tree: the io tree to search
901 * @start: the start offset in bytes
902 * @end: the end offset in bytes (inclusive)
903 * @bits: the bits to set in this range
904 * @clear_bits: the bits to clear in this range
905 * @mask: the allocation mask
906 *
907 * This will go through and set bits for the given range. If any states exist
908 * already in this range they are set with the given bit and cleared of the
909 * clear_bits. This is only meant to be used by things that are mergeable, ie
910 * converting from say DELALLOC to DIRTY. This is not meant to be used with
911 * boundary bits like LOCK.
912 */
913int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914 int bits, int clear_bits, gfp_t mask)
915{
916 struct extent_state *state;
917 struct extent_state *prealloc = NULL;
918 struct rb_node *node;
919 int err = 0;
920 u64 last_start;
921 u64 last_end;
922
923again:
924 if (!prealloc && (mask & __GFP_WAIT)) {
925 prealloc = alloc_extent_state(mask);
926 if (!prealloc)
927 return -ENOMEM;
928 }
929
930 spin_lock(&tree->lock);
931 /*
932 * this search will find all the extents that end after
933 * our range starts.
934 */
935 node = tree_search(tree, start);
936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc);
 938 if (!prealloc) { err = -ENOMEM; goto out; }
940 err = insert_state(tree, prealloc, start, end, &bits);
941 prealloc = NULL;
942 BUG_ON(err == -EEXIST);
943 goto out;
944 }
945 state = rb_entry(node, struct extent_state, rb_node);
946hit_next:
947 last_start = state->start;
948 last_end = state->end;
949
950 /*
951 * | ---- desired range ---- |
952 * | state |
953 *
954 * Just lock what we found and keep going
955 */
956 if (state->start == start && state->end <= end) {
957 struct rb_node *next_node;
958
959 set_state_bits(tree, state, &bits);
960 clear_state_bit(tree, state, &clear_bits, 0);
961
962 merge_state(tree, state);
963 if (last_end == (u64)-1)
964 goto out;
965
966 start = last_end + 1;
967 next_node = rb_next(&state->rb_node);
968 if (next_node && start < end && prealloc && !need_resched()) {
969 state = rb_entry(next_node, struct extent_state,
970 rb_node);
971 if (state->start == start)
972 goto hit_next;
973 }
974 goto search_again;
975 }
976
977 /*
978 * | ---- desired range ---- |
979 * | state |
980 * or
981 * | ------------- state -------------- |
982 *
983 * We need to split the extent we found, and may flip bits on
984 * second half.
985 *
986 * If the extent we found extends past our
987 * range, we just split and search again. It'll get split
988 * again the next time though.
989 *
990 * If the extent we found is inside our range, we set the
991 * desired bit on it.
992 */
993 if (state->start < start) {
994 prealloc = alloc_extent_state_atomic(prealloc);
 995 if (!prealloc) { err = -ENOMEM; goto out; }
997 err = split_state(tree, state, prealloc, start);
998 BUG_ON(err == -EEXIST);
999 prealloc = NULL;
1000 if (err)
1001 goto out;
1002 if (state->end <= end) {
1003 set_state_bits(tree, state, &bits);
1004 clear_state_bit(tree, state, &clear_bits, 0);
1005 merge_state(tree, state);
1006 if (last_end == (u64)-1)
1007 goto out;
1008 start = last_end + 1;
1009 }
1010 goto search_again;
1011 }
1012 /*
1013 * | ---- desired range ---- |
1014 * | state | or | state |
1015 *
1016 * There's a hole, we need to insert something in it and
1017 * ignore the extent we found.
1018 */
1019 if (state->start > start) {
1020 u64 this_end;
1021 if (end < last_start)
1022 this_end = end;
1023 else
1024 this_end = last_start - 1;
1025
1026 prealloc = alloc_extent_state_atomic(prealloc);
1027 if (!prealloc) { err = -ENOMEM; goto out; }
1029
1030 /*
1031 * Avoid freeing 'prealloc' if it can be merged with
1032 * the later extent.
1033 */
1034 err = insert_state(tree, prealloc, start, this_end,
1035 &bits);
1036 BUG_ON(err == -EEXIST);
1037 if (err) {
1038 free_extent_state(prealloc);
1039 prealloc = NULL;
1040 goto out;
1041 }
1042 prealloc = NULL;
1043 start = this_end + 1;
1044 goto search_again;
1045 }
1046 /*
1047 * | ---- desired range ---- |
1048 * | state |
1049 * We need to split the extent, and set the bit
1050 * on the first half
1051 */
1052 if (state->start <= end && state->end > end) {
1053 prealloc = alloc_extent_state_atomic(prealloc);
1054 if (!prealloc) { err = -ENOMEM; goto out; }
1056
1057 err = split_state(tree, state, prealloc, end + 1);
1058 BUG_ON(err == -EEXIST);
1059
1060 set_state_bits(tree, prealloc, &bits);
1061 clear_state_bit(tree, prealloc, &clear_bits, 0);
1062
1063 merge_state(tree, prealloc);
1064 prealloc = NULL;
1065 goto out;
1066 }
1067
1068 goto search_again;
1069
1070out:
1071 spin_unlock(&tree->lock);
1072 if (prealloc)
1073 free_extent_state(prealloc);
1074
1075 return err;
1076
1077search_again:
1078 if (start > end)
1079 goto out;
1080 spin_unlock(&tree->lock);
1081 if (mask & __GFP_WAIT)
1082 cond_resched();
1083 goto again;
1084}
1085
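As a toy of what convert_extent_bit() does to states that fall fully inside the range: it deliberately omits the split, merge and preallocation machinery that makes up most of the real function, and the bit names here are illustrative only.

#include <stdio.h>
#include <stdint.h>

struct state { uint64_t start, end; unsigned bits; };

/* set set_bits and drop clear_bits on every state touching [start, end] */
static void convert_range(struct state *s, int n, uint64_t start, uint64_t end,
			  unsigned set_bits, unsigned clear_bits)
{
	for (int i = 0; i < n; i++) {
		if (s[i].end < start || s[i].start > end)
			continue;               /* outside the range */
		s[i].bits |= set_bits;          /* like set_state_bits() */
		s[i].bits &= ~clear_bits;       /* like clear_state_bit() */
	}
}

int main(void)
{
	enum { DELALLOC = 1 << 0, DIRTY = 1 << 1 };
	struct state st[] = { { 0, 4095, DELALLOC }, { 4096, 8191, DELALLOC } };

	/* the mergeable case the comment above describes: DELALLOC -> DIRTY */
	convert_range(st, 2, 0, 4095, DIRTY, DELALLOC);
	printf("st[0].bits=%u st[1].bits=%u\n", st[0].bits, st[1].bits);
	return 0;
}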
898/* wrappers around set/clear extent bit */ 1086/* wrappers around set/clear extent bit */
899int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1087int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
900 gfp_t mask) 1088 gfp_t mask)
@@ -920,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
920 struct extent_state **cached_state, gfp_t mask) 1108 struct extent_state **cached_state, gfp_t mask)
921{ 1109{
922 return set_extent_bit(tree, start, end, 1110 return set_extent_bit(tree, start, end,
923 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 1111 EXTENT_DELALLOC | EXTENT_UPTODATE,
924 0, NULL, cached_state, mask); 1112 0, NULL, cached_state, mask);
925} 1113}
926 1114
@@ -2102,7 +2290,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2102 if (tree->ops && tree->ops->readpage_io_failed_hook) 2290 if (tree->ops && tree->ops->readpage_io_failed_hook)
2103 ret = tree->ops->readpage_io_failed_hook( 2291 ret = tree->ops->readpage_io_failed_hook(
2104 bio, page, start, end, 2292 bio, page, start, end,
2105 failed_mirror, NULL); 2293 failed_mirror, state);
2106 else 2294 else
2107 ret = bio_readpage_error(bio, page, start, end, 2295 ret = bio_readpage_error(bio, page, start, end,
2108 failed_mirror, NULL); 2296 failed_mirror, NULL);
@@ -2511,6 +2699,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2511 int compressed; 2699 int compressed;
2512 int write_flags; 2700 int write_flags;
2513 unsigned long nr_written = 0; 2701 unsigned long nr_written = 0;
2702 bool fill_delalloc = true;
2514 2703
2515 if (wbc->sync_mode == WB_SYNC_ALL) 2704 if (wbc->sync_mode == WB_SYNC_ALL)
2516 write_flags = WRITE_SYNC; 2705 write_flags = WRITE_SYNC;
@@ -2520,6 +2709,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2520 trace___extent_writepage(page, inode, wbc); 2709 trace___extent_writepage(page, inode, wbc);
2521 2710
2522 WARN_ON(!PageLocked(page)); 2711 WARN_ON(!PageLocked(page));
2712
2713 ClearPageError(page);
2714
2523 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2715 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2524 if (page->index > end_index || 2716 if (page->index > end_index ||
2525 (page->index == end_index && !pg_offset)) { 2717 (page->index == end_index && !pg_offset)) {
@@ -2541,10 +2733,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2541 2733
2542 set_page_extent_mapped(page); 2734 set_page_extent_mapped(page);
2543 2735
2736 if (!tree->ops || !tree->ops->fill_delalloc)
2737 fill_delalloc = false;
2738
2544 delalloc_start = start; 2739 delalloc_start = start;
2545 delalloc_end = 0; 2740 delalloc_end = 0;
2546 page_started = 0; 2741 page_started = 0;
2547 if (!epd->extent_locked) { 2742 if (!epd->extent_locked && fill_delalloc) {
2548 u64 delalloc_to_write = 0; 2743 u64 delalloc_to_write = 0;
2549 /* 2744 /*
2550 * make sure the wbc mapping index is at least updated 2745 * make sure the wbc mapping index is at least updated
@@ -2796,10 +2991,16 @@ retry:
2796 * swizzled back from swapper_space to tmpfs file 2991 * swizzled back from swapper_space to tmpfs file
2797 * mapping 2992 * mapping
2798 */ 2993 */
2799 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2994 if (tree->ops &&
2800 tree->ops->write_cache_pages_lock_hook(page); 2995 tree->ops->write_cache_pages_lock_hook) {
2801 else 2996 tree->ops->write_cache_pages_lock_hook(page,
2802 lock_page(page); 2997 data, flush_fn);
2998 } else {
2999 if (!trylock_page(page)) {
3000 flush_fn(data);
3001 lock_page(page);
3002 }
3003 }
2803 3004
2804 if (unlikely(page->mapping != mapping)) { 3005 if (unlikely(page->mapping != mapping)) {
2805 unlock_page(page); 3006 unlock_page(page);
@@ -3579,6 +3780,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3579 PAGECACHE_TAG_DIRTY); 3780 PAGECACHE_TAG_DIRTY);
3580 } 3781 }
3581 spin_unlock_irq(&page->mapping->tree_lock); 3782 spin_unlock_irq(&page->mapping->tree_lock);
3783 ClearPageError(page);
3582 unlock_page(page); 3784 unlock_page(page);
3583 } 3785 }
3584 return 0; 3786 return 0;
@@ -3724,8 +3926,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3724} 3926}
3725 3927
3726int read_extent_buffer_pages(struct extent_io_tree *tree, 3928int read_extent_buffer_pages(struct extent_io_tree *tree,
3727 struct extent_buffer *eb, 3929 struct extent_buffer *eb, u64 start, int wait,
3728 u64 start, int wait,
3729 get_extent_t *get_extent, int mirror_num) 3930 get_extent_t *get_extent, int mirror_num)
3730{ 3931{
3731 unsigned long i; 3932 unsigned long i;
@@ -3761,7 +3962,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3761 num_pages = num_extent_pages(eb->start, eb->len); 3962 num_pages = num_extent_pages(eb->start, eb->len);
3762 for (i = start_i; i < num_pages; i++) { 3963 for (i = start_i; i < num_pages; i++) {
3763 page = extent_buffer_page(eb, i); 3964 page = extent_buffer_page(eb, i);
3764 if (!wait) { 3965 if (wait == WAIT_NONE) {
3765 if (!trylock_page(page)) 3966 if (!trylock_page(page))
3766 goto unlock_exit; 3967 goto unlock_exit;
3767 } else { 3968 } else {
@@ -3805,7 +4006,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3805 if (bio) 4006 if (bio)
3806 submit_one_bio(READ, bio, mirror_num, bio_flags); 4007 submit_one_bio(READ, bio, mirror_num, bio_flags);
3807 4008
3808 if (ret || !wait) 4009 if (ret || wait != WAIT_COMPLETE)
3809 return ret; 4010 return ret;
3810 4011
3811 for (i = start_i; i < num_pages; i++) { 4012 for (i = start_i; i < num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a8e20b672922..feb9be0e23bc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,7 +17,8 @@
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_DAMAGED (1 << 13) 20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14)
21#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
22#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
23 24
@@ -33,6 +34,7 @@
33#define EXTENT_BUFFER_BLOCKING 1 34#define EXTENT_BUFFER_BLOCKING 1
34#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
35#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
36 38
37/* these are flags for extent_clear_unlock_delalloc */ 39/* these are flags for extent_clear_unlock_delalloc */
38#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 40#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -86,7 +88,8 @@ struct extent_io_ops {
86 struct extent_state *other); 88 struct extent_state *other);
87 void (*split_extent_hook)(struct inode *inode, 89 void (*split_extent_hook)(struct inode *inode,
88 struct extent_state *orig, u64 split); 90 struct extent_state *orig, u64 split);
89 int (*write_cache_pages_lock_hook)(struct page *page); 91 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
92 void (*flush_fn)(void *));
90}; 93};
91 94
92struct extent_io_tree { 95struct extent_io_tree {
@@ -215,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
215 gfp_t mask); 218 gfp_t mask);
216int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 219int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
217 gfp_t mask); 220 gfp_t mask);
221int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
222 int bits, int clear_bits, gfp_t mask);
218int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 223int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
219 struct extent_state **cached_state, gfp_t mask); 224 struct extent_state **cached_state, gfp_t mask);
220int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 225int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -249,6 +254,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
249struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 254struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
250 u64 start, unsigned long len); 255 u64 start, unsigned long len);
251void free_extent_buffer(struct extent_buffer *eb); 256void free_extent_buffer(struct extent_buffer *eb);
257#define WAIT_NONE 0
258#define WAIT_COMPLETE 1
259#define WAIT_PAGE_LOCK 2
252int read_extent_buffer_pages(struct extent_io_tree *tree, 260int read_extent_buffer_pages(struct extent_io_tree *tree,
253 struct extent_buffer *eb, u64 start, int wait, 261 struct extent_buffer *eb, u64 start, int wait,
254 get_extent_t *get_extent, int mirror_num); 262 get_extent_t *get_extent, int mirror_num);
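The three WAIT_* constants replace the old boolean wait flag: WAIT_NONE trylocks pages and gives up on contention, WAIT_COMPLETE locks and waits for the read to finish, and WAIT_PAGE_LOCK locks the pages without waiting for I/O completion, which is what the new readahead code wants. A trivial sketch of the dispatch, with the helper invented purely for illustration:

#include <stdio.h>

#define WAIT_NONE      0	/* trylock pages, give up if contended */
#define WAIT_COMPLETE  1	/* lock pages and wait for the read to finish */
#define WAIT_PAGE_LOCK 2	/* lock pages, completion checked later */

/* hypothetical helper naming the behaviour each constant selects */
static const char *wait_mode(int wait)
{
	switch (wait) {
	case WAIT_NONE:		return "opportunistic, may skip busy pages";
	case WAIT_COMPLETE:	return "synchronous read";
	case WAIT_PAGE_LOCK:	return "asynchronous, pages locked only";
	default:		return "invalid";
	}
}

int main(void)
{
	for (int w = WAIT_NONE; w <= WAIT_PAGE_LOCK; w++)
		printf("%d: %s\n", w, wait_mode(w));
	return 0;
}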
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821becd..c7fb3a4247d3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
91 struct btrfs_csum_item *item; 91 struct btrfs_csum_item *item;
92 struct extent_buffer *leaf; 92 struct extent_buffer *leaf;
93 u64 csum_offset = 0; 93 u64 csum_offset = 0;
94 u16 csum_size = 94 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
95 btrfs_super_csum_size(&root->fs_info->super_copy);
96 int csums_in_item; 95 int csums_in_item;
97 96
98 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 97 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
162 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
163 u64 disk_bytenr; 162 u64 disk_bytenr;
164 u32 diff; 163 u32 diff;
165 u16 csum_size = 164 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
166 btrfs_super_csum_size(&root->fs_info->super_copy);
167 int ret; 165 int ret;
168 struct btrfs_path *path; 166 struct btrfs_path *path;
169 struct btrfs_csum_item *item = NULL; 167 struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
290 int ret; 288 int ret;
291 size_t size; 289 size_t size;
292 u64 csum_end; 290 u64 csum_end;
293 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 291 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
294 292
295 path = btrfs_alloc_path(); 293 path = btrfs_alloc_path();
296 if (!path) 294 if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
492 u64 bytenr, u64 len) 490 u64 bytenr, u64 len)
493{ 491{
494 struct extent_buffer *leaf; 492 struct extent_buffer *leaf;
495 u16 csum_size = 493 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
496 btrfs_super_csum_size(&root->fs_info->super_copy);
497 u64 csum_end; 494 u64 csum_end;
498 u64 end_byte = bytenr + len; 495 u64 end_byte = bytenr + len;
499 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; 496 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
549 u64 csum_end; 546 u64 csum_end;
550 struct extent_buffer *leaf; 547 struct extent_buffer *leaf;
551 int ret; 548 int ret;
552 u16 csum_size = 549 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
553 btrfs_super_csum_size(&root->fs_info->super_copy);
554 int blocksize_bits = root->fs_info->sb->s_blocksize_bits; 550 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
555 551
556 root = root->fs_info->csum_root; 552 root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
676 struct btrfs_sector_sum *sector_sum; 672 struct btrfs_sector_sum *sector_sum;
677 u32 nritems; 673 u32 nritems;
678 u32 ins_size; 674 u32 ins_size;
679 u16 csum_size = 675 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
680 btrfs_super_csum_size(&root->fs_info->super_copy);
681 676
682 path = btrfs_alloc_path(); 677 path = btrfs_alloc_path();
683 if (!path) 678 if (!path)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a381cd22f518..f2e928289600 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1036,11 +1036,13 @@ out:
1036 * on error we return an unlocked page and the error value 1036 * on error we return an unlocked page and the error value
1037 * on success we return a locked page and 0 1037 * on success we return a locked page and 0
1038 */ 1038 */
1039static int prepare_uptodate_page(struct page *page, u64 pos) 1039static int prepare_uptodate_page(struct page *page, u64 pos,
1040 bool force_uptodate)
1040{ 1041{
1041 int ret = 0; 1042 int ret = 0;
1042 1043
1043 if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { 1044 if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
1045 !PageUptodate(page)) {
1044 ret = btrfs_readpage(NULL, page); 1046 ret = btrfs_readpage(NULL, page);
1045 if (ret) 1047 if (ret)
1046 return ret; 1048 return ret;
@@ -1061,12 +1063,13 @@ static int prepare_uptodate_page(struct page *page, u64 pos)
1061static noinline int prepare_pages(struct btrfs_root *root, struct file *file, 1063static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1062 struct page **pages, size_t num_pages, 1064 struct page **pages, size_t num_pages,
1063 loff_t pos, unsigned long first_index, 1065 loff_t pos, unsigned long first_index,
1064 size_t write_bytes) 1066 size_t write_bytes, bool force_uptodate)
1065{ 1067{
1066 struct extent_state *cached_state = NULL; 1068 struct extent_state *cached_state = NULL;
1067 int i; 1069 int i;
1068 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1070 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1069 struct inode *inode = fdentry(file)->d_inode; 1071 struct inode *inode = fdentry(file)->d_inode;
1072 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1070 int err = 0; 1073 int err = 0;
1071 int faili = 0; 1074 int faili = 0;
1072 u64 start_pos; 1075 u64 start_pos;
@@ -1078,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1078again: 1081again:
1079 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1080 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1081 GFP_NOFS); 1084 mask);
1082 if (!pages[i]) { 1085 if (!pages[i]) {
1083 faili = i - 1; 1086 faili = i - 1;
1084 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1086,10 +1089,11 @@ again:
1086 } 1089 }
1087 1090
1088 if (i == 0) 1091 if (i == 0)
1089 err = prepare_uptodate_page(pages[i], pos); 1092 err = prepare_uptodate_page(pages[i], pos,
1093 force_uptodate);
1090 if (i == num_pages - 1) 1094 if (i == num_pages - 1)
1091 err = prepare_uptodate_page(pages[i], 1095 err = prepare_uptodate_page(pages[i],
1092 pos + write_bytes); 1096 pos + write_bytes, false);
1093 if (err) { 1097 if (err) {
1094 page_cache_release(pages[i]); 1098 page_cache_release(pages[i]);
1095 faili = i - 1; 1099 faili = i - 1;
@@ -1158,6 +1162,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1158 size_t num_written = 0; 1162 size_t num_written = 0;
1159 int nrptrs; 1163 int nrptrs;
1160 int ret = 0; 1164 int ret = 0;
1165 bool force_page_uptodate = false;
1161 1166
1162 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1167 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
1163 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1168 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
@@ -1200,7 +1205,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1200 * contents of pages from loop to loop 1205 * contents of pages from loop to loop
1201 */ 1206 */
1202 ret = prepare_pages(root, file, pages, num_pages, 1207 ret = prepare_pages(root, file, pages, num_pages,
1203 pos, first_index, write_bytes); 1208 pos, first_index, write_bytes,
1209 force_page_uptodate);
1204 if (ret) { 1210 if (ret) {
1205 btrfs_delalloc_release_space(inode, 1211 btrfs_delalloc_release_space(inode,
1206 num_pages << PAGE_CACHE_SHIFT); 1212 num_pages << PAGE_CACHE_SHIFT);
@@ -1217,12 +1223,15 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1217 if (copied < write_bytes) 1223 if (copied < write_bytes)
1218 nrptrs = 1; 1224 nrptrs = 1;
1219 1225
1220 if (copied == 0) 1226 if (copied == 0) {
1227 force_page_uptodate = true;
1221 dirty_pages = 0; 1228 dirty_pages = 0;
1222 else 1229 } else {
1230 force_page_uptodate = false;
1223 dirty_pages = (copied + offset + 1231 dirty_pages = (copied + offset +
1224 PAGE_CACHE_SIZE - 1) >> 1232 PAGE_CACHE_SIZE - 1) >>
1225 PAGE_CACHE_SHIFT; 1233 PAGE_CACHE_SHIFT;
1234 }
1226 1235
1227 /* 1236 /*
1228 * If we had a short copy we need to release the excess delalloc 1237
@@ -1607,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1607 goto out; 1616 goto out;
1608 } 1617 }
1609 1618
1610 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1611 if (ret)
1612 goto out;
1613
1614 locked_end = alloc_end - 1; 1619 locked_end = alloc_end - 1;
1615 while (1) { 1620 while (1) {
1616 struct btrfs_ordered_extent *ordered; 1621 struct btrfs_ordered_extent *ordered;
@@ -1656,11 +1661,27 @@ static long btrfs_fallocate(struct file *file, int mode,
1656 if (em->block_start == EXTENT_MAP_HOLE || 1661 if (em->block_start == EXTENT_MAP_HOLE ||
1657 (cur_offset >= inode->i_size && 1662 (cur_offset >= inode->i_size &&
1658 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1663 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1664
1665 /*
1666 * Make sure we have enough space before we do the
1667 * allocation.
1668 */
1669 ret = btrfs_check_data_free_space(inode, last_byte -
1670 cur_offset);
1671 if (ret) {
1672 free_extent_map(em);
1673 break;
1674 }
1675
1659 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 1676 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1660 last_byte - cur_offset, 1677 last_byte - cur_offset,
1661 1 << inode->i_blkbits, 1678 1 << inode->i_blkbits,
1662 offset + len, 1679 offset + len,
1663 &alloc_hint); 1680 &alloc_hint);
1681
1682 /* Let go of our reservation. */
1683 btrfs_free_reserved_data_space(inode, last_byte -
1684 cur_offset);
1664 if (ret < 0) { 1685 if (ret < 0) {
1665 free_extent_map(em); 1686 free_extent_map(em);
1666 break; 1687 break;
@@ -1686,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1686 } 1707 }
1687 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 1708 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1688 &cached_state, GFP_NOFS); 1709 &cached_state, GFP_NOFS);
1689
1690 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1691out: 1710out:
1692 mutex_unlock(&inode->i_mutex); 1711 mutex_unlock(&inode->i_mutex);
1693 return ret; 1712 return ret;
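The fallocate hunk above moves the data-space reservation from one up-front call covering the whole range to a reserve/allocate/release cycle per hole, so a mostly-allocated range no longer demands space it will not use. A sketch of that pattern; reserve(), release() and prealloc() are stand-ins for btrfs_check_data_free_space(), btrfs_free_reserved_data_space() and btrfs_prealloc_file_range():

#include <stdio.h>
#include <stdint.h>

static uint64_t pool = 1 << 20;  /* pretend free data space */

static int reserve(uint64_t n)  { if (pool < n) return -1; pool -= n; return 0; }
static void release(uint64_t n) { pool += n; }
static int prealloc(uint64_t off, uint64_t len) { (void)off; (void)len; return 0; }

int main(void)
{
	uint64_t cur = 0, end = 256 * 1024, step = 64 * 1024;

	while (cur < end) {
		/* reserve only what this hole needs, not the whole range */
		if (reserve(step))
			break;
		int ret = prealloc(cur, step);
		/* the reservation's job ends once the extent exists */
		release(step);
		if (ret < 0)
			break;
		cur += step;
	}
	printf("preallocated up to %llu\n", (unsigned long long)cur);
	return 0;
}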
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d0..7a15fcfb3e1f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/math64.h> 22#include <linux/math64.h>
23#include <linux/ratelimit.h>
23#include "ctree.h" 24#include "ctree.h"
24#include "free-space-cache.h" 25#include "free-space-cache.h"
25#include "transaction.h" 26#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
84 *block_group, struct btrfs_path *path) 85 *block_group, struct btrfs_path *path)
85{ 86{
86 struct inode *inode = NULL; 87 struct inode *inode = NULL;
88 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
87 89
88 spin_lock(&block_group->lock); 90 spin_lock(&block_group->lock);
89 if (block_group->inode) 91 if (block_group->inode)
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 100 return inode;
99 101
100 spin_lock(&block_group->lock); 102 spin_lock(&block_group->lock);
 101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { 103 if ((BTRFS_I(inode)->flags & flags) != flags) {
102 printk(KERN_INFO "Old style space inode found, converting.\n"); 104 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; 105 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
106 BTRFS_INODE_NODATACOW;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR; 107 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 } 108 }
106 109
107 if (!btrfs_fs_closing(root->fs_info)) { 110 if (!block_group->iref) {
108 block_group->inode = igrab(inode); 111 block_group->inode = igrab(inode);
109 block_group->iref = 1; 112 block_group->iref = 1;
110 } 113 }
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
122 struct btrfs_free_space_header *header; 125 struct btrfs_free_space_header *header;
123 struct btrfs_inode_item *inode_item; 126 struct btrfs_inode_item *inode_item;
124 struct extent_buffer *leaf; 127 struct extent_buffer *leaf;
128 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
125 int ret; 129 int ret;
126 130
127 ret = btrfs_insert_empty_inode(trans, root, path, ino); 131 ret = btrfs_insert_empty_inode(trans, root, path, ino);
128 if (ret) 132 if (ret)
129 return ret; 133 return ret;
130 134
 135 /* We inline crcs for the free disk space cache */
136 if (ino != BTRFS_FREE_INO_OBJECTID)
137 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
138
131 leaf = path->nodes[0]; 139 leaf = path->nodes[0];
132 inode_item = btrfs_item_ptr(leaf, path->slots[0], 140 inode_item = btrfs_item_ptr(leaf, path->slots[0],
133 struct btrfs_inode_item); 141 struct btrfs_inode_item);
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
140 btrfs_set_inode_uid(leaf, inode_item, 0); 148 btrfs_set_inode_uid(leaf, inode_item, 0);
141 btrfs_set_inode_gid(leaf, inode_item, 0); 149 btrfs_set_inode_gid(leaf, inode_item, 0);
142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 150 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 151 btrfs_set_inode_flags(leaf, inode_item, flags);
144 BTRFS_INODE_PREALLOC);
145 btrfs_set_inode_nlink(leaf, inode_item, 1); 152 btrfs_set_inode_nlink(leaf, inode_item, 1);
146 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 153 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
147 btrfs_set_inode_block_group(leaf, inode_item, offset); 154 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
191 struct inode *inode) 198 struct inode *inode)
192{ 199{
193 struct btrfs_block_rsv *rsv; 200 struct btrfs_block_rsv *rsv;
201 u64 needed_bytes;
194 loff_t oldsize; 202 loff_t oldsize;
195 int ret = 0; 203 int ret = 0;
196 204
197 rsv = trans->block_rsv; 205 rsv = trans->block_rsv;
198 trans->block_rsv = root->orphan_block_rsv; 206 trans->block_rsv = &root->fs_info->global_block_rsv;
199 ret = btrfs_block_rsv_check(trans, root, 207
200 root->orphan_block_rsv, 208 /* 1 for slack space, 1 for updating the inode */
201 0, 5); 209 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
202 if (ret) 210 btrfs_calc_trans_metadata_size(root, 1);
203 return ret; 211
212 spin_lock(&trans->block_rsv->lock);
213 if (trans->block_rsv->reserved < needed_bytes) {
214 spin_unlock(&trans->block_rsv->lock);
215 trans->block_rsv = rsv;
216 return -ENOSPC;
217 }
218 spin_unlock(&trans->block_rsv->lock);
204 219
205 oldsize = i_size_read(inode); 220 oldsize = i_size_read(inode);
206 btrfs_i_size_write(inode, 0); 221 btrfs_i_size_write(inode, 0);
@@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
213 ret = btrfs_truncate_inode_items(trans, root, inode, 228 ret = btrfs_truncate_inode_items(trans, root, inode,
214 0, BTRFS_EXTENT_DATA_KEY); 229 0, BTRFS_EXTENT_DATA_KEY);
215 230
216 trans->block_rsv = rsv;
217 if (ret) { 231 if (ret) {
232 trans->block_rsv = rsv;
218 WARN_ON(1); 233 WARN_ON(1);
219 return ret; 234 return ret;
220 } 235 }
221 236
222 ret = btrfs_update_inode(trans, root, inode); 237 ret = btrfs_update_inode(trans, root, inode);
238 trans->block_rsv = rsv;
239
223 return ret; 240 return ret;
224} 241}
225 242
@@ -242,26 +259,342 @@ static int readahead_cache(struct inode *inode)
242 return 0; 259 return 0;
243} 260}
244 261
262struct io_ctl {
263 void *cur, *orig;
264 struct page *page;
265 struct page **pages;
266 struct btrfs_root *root;
267 unsigned long size;
268 int index;
269 int num_pages;
270 unsigned check_crcs:1;
271};
272
273static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
274 struct btrfs_root *root)
275{
276 memset(io_ctl, 0, sizeof(struct io_ctl));
277 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
278 PAGE_CACHE_SHIFT;
279 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
280 GFP_NOFS);
281 if (!io_ctl->pages)
282 return -ENOMEM;
283 io_ctl->root = root;
284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
285 io_ctl->check_crcs = 1;
286 return 0;
287}
288
289static void io_ctl_free(struct io_ctl *io_ctl)
290{
291 kfree(io_ctl->pages);
292}
293
294static void io_ctl_unmap_page(struct io_ctl *io_ctl)
295{
296 if (io_ctl->cur) {
297 kunmap(io_ctl->page);
298 io_ctl->cur = NULL;
299 io_ctl->orig = NULL;
300 }
301}
302
303static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
304{
305 WARN_ON(io_ctl->cur);
306 BUG_ON(io_ctl->index >= io_ctl->num_pages);
307 io_ctl->page = io_ctl->pages[io_ctl->index++];
308 io_ctl->cur = kmap(io_ctl->page);
309 io_ctl->orig = io_ctl->cur;
310 io_ctl->size = PAGE_CACHE_SIZE;
311 if (clear)
312 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
313}
314
315static void io_ctl_drop_pages(struct io_ctl *io_ctl)
316{
317 int i;
318
319 io_ctl_unmap_page(io_ctl);
320
321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]);
323 unlock_page(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]);
325 }
326}
327
328static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
329 int uptodate)
330{
331 struct page *page;
332 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
333 int i;
334
335 for (i = 0; i < io_ctl->num_pages; i++) {
336 page = find_or_create_page(inode->i_mapping, i, mask);
337 if (!page) {
338 io_ctl_drop_pages(io_ctl);
339 return -ENOMEM;
340 }
341 io_ctl->pages[i] = page;
342 if (uptodate && !PageUptodate(page)) {
343 btrfs_readpage(NULL, page);
344 lock_page(page);
345 if (!PageUptodate(page)) {
346 printk(KERN_ERR "btrfs: error reading free "
347 "space cache\n");
348 io_ctl_drop_pages(io_ctl);
349 return -EIO;
350 }
351 }
352 }
353
354 return 0;
355}
356
357static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
358{
359 u64 *val;
360
361 io_ctl_map_page(io_ctl, 1);
362
363 /*
364 * Skip the csum areas. If we don't check crcs then we just have a
365 * 64bit chunk at the front of the first page.
366 */
367 if (io_ctl->check_crcs) {
368 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
369 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
370 } else {
371 io_ctl->cur += sizeof(u64);
372 io_ctl->size -= sizeof(u64) * 2;
373 }
374
375 val = io_ctl->cur;
376 *val = cpu_to_le64(generation);
377 io_ctl->cur += sizeof(u64);
378}
379
380static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
381{
382 u64 *gen;
383
384 /*
385 * Skip the crc area. If we don't check crcs then we just have a 64bit
386 * chunk at the front of the first page.
387 */
388 if (io_ctl->check_crcs) {
389 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
390 io_ctl->size -= sizeof(u64) +
391 (sizeof(u32) * io_ctl->num_pages);
392 } else {
393 io_ctl->cur += sizeof(u64);
394 io_ctl->size -= sizeof(u64) * 2;
395 }
396
397 gen = io_ctl->cur;
398 if (le64_to_cpu(*gen) != generation) {
399 printk_ratelimited(KERN_ERR "btrfs: space cache generation "
400 "(%Lu) does not match inode (%Lu)\n", *gen,
401 generation);
402 io_ctl_unmap_page(io_ctl);
403 return -EIO;
404 }
405 io_ctl->cur += sizeof(u64);
406 return 0;
407}
408
409static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
410{
411 u32 *tmp;
412 u32 crc = ~(u32)0;
413 unsigned offset = 0;
414
415 if (!io_ctl->check_crcs) {
416 io_ctl_unmap_page(io_ctl);
417 return;
418 }
419
420 if (index == 0)
 421 offset = sizeof(u32) * io_ctl->num_pages;
422
423 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
424 PAGE_CACHE_SIZE - offset);
425 btrfs_csum_final(crc, (char *)&crc);
426 io_ctl_unmap_page(io_ctl);
427 tmp = kmap(io_ctl->pages[0]);
428 tmp += index;
429 *tmp = crc;
430 kunmap(io_ctl->pages[0]);
431}
432
433static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
434{
435 u32 *tmp, val;
436 u32 crc = ~(u32)0;
437 unsigned offset = 0;
438
439 if (!io_ctl->check_crcs) {
440 io_ctl_map_page(io_ctl, 0);
441 return 0;
442 }
443
444 if (index == 0)
445 offset = sizeof(u32) * io_ctl->num_pages;
446
447 tmp = kmap(io_ctl->pages[0]);
448 tmp += index;
449 val = *tmp;
450 kunmap(io_ctl->pages[0]);
451
452 io_ctl_map_page(io_ctl, 0);
453 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
454 PAGE_CACHE_SIZE - offset);
455 btrfs_csum_final(crc, (char *)&crc);
456 if (val != crc) {
457 printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
458 "space cache\n");
459 io_ctl_unmap_page(io_ctl);
460 return -EIO;
461 }
462
463 return 0;
464}
465
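From the skip arithmetic in io_ctl_set_generation() and io_ctl_check_generation() above, the first page of the cache begins with either a per-page array of u32 crcs followed by the u64 generation, or a u64 placeholder (the old bogus-crc slot) followed by the u64 generation. A sketch that only computes where the entries start; it is an offset illustration, not an on-disk format spec:

#include <stdio.h>
#include <stdint.h>

/* byte offset of the first free-space entry in page 0 */
static size_t entries_offset(int num_pages, int check_crcs)
{
	if (check_crcs)
		/* one u32 crc per cache page, then the u64 generation */
		return sizeof(uint32_t) * num_pages + sizeof(uint64_t);
	/* u64 placeholder (old "bogus crc") plus the u64 generation */
	return 2 * sizeof(uint64_t);
}

int main(void)
{
	printf("4 pages, crcs inlined: entries at %zu\n", entries_offset(4, 1));
	printf("no inline crcs:        entries at %zu\n", entries_offset(4, 0));
	return 0;
}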
466static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
467 void *bitmap)
468{
469 struct btrfs_free_space_entry *entry;
470
471 if (!io_ctl->cur)
472 return -ENOSPC;
473
474 entry = io_ctl->cur;
475 entry->offset = cpu_to_le64(offset);
476 entry->bytes = cpu_to_le64(bytes);
477 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
478 BTRFS_FREE_SPACE_EXTENT;
479 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
480 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
481
482 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
483 return 0;
484
485 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
486
487 /* No more pages to map */
488 if (io_ctl->index >= io_ctl->num_pages)
489 return 0;
490
491 /* map the next page */
492 io_ctl_map_page(io_ctl, 1);
493 return 0;
494}
495
496static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
497{
498 if (!io_ctl->cur)
499 return -ENOSPC;
500
501 /*
502 * If we aren't at the start of the current page, unmap this one and
503 * map the next one if there is any left.
504 */
505 if (io_ctl->cur != io_ctl->orig) {
506 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
507 if (io_ctl->index >= io_ctl->num_pages)
508 return -ENOSPC;
509 io_ctl_map_page(io_ctl, 0);
510 }
511
512 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
513 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
514 if (io_ctl->index < io_ctl->num_pages)
515 io_ctl_map_page(io_ctl, 0);
516 return 0;
517}
518
519static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
520{
521 /*
522 * If we're not on the boundary we know we've modified the page and we
523 * need to crc the page.
524 */
525 if (io_ctl->cur != io_ctl->orig)
526 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
527 else
528 io_ctl_unmap_page(io_ctl);
529
530 while (io_ctl->index < io_ctl->num_pages) {
531 io_ctl_map_page(io_ctl, 1);
532 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
533 }
534}
535
536static int io_ctl_read_entry(struct io_ctl *io_ctl,
537 struct btrfs_free_space *entry, u8 *type)
538{
539 struct btrfs_free_space_entry *e;
540
541 e = io_ctl->cur;
542 entry->offset = le64_to_cpu(e->offset);
543 entry->bytes = le64_to_cpu(e->bytes);
544 *type = e->type;
545 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
546 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
547
548 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
549 return 0;
550
551 io_ctl_unmap_page(io_ctl);
552
553 if (io_ctl->index >= io_ctl->num_pages)
554 return 0;
555
556 return io_ctl_check_crc(io_ctl, io_ctl->index);
557}
558
559static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
560 struct btrfs_free_space *entry)
561{
562 int ret;
563
564 if (io_ctl->cur && io_ctl->cur != io_ctl->orig)
565 io_ctl_unmap_page(io_ctl);
566
567 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
568 if (ret)
569 return ret;
570
571 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
572 io_ctl_unmap_page(io_ctl);
573
574 return 0;
575}
576
245int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 577int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
246 struct btrfs_free_space_ctl *ctl, 578 struct btrfs_free_space_ctl *ctl,
247 struct btrfs_path *path, u64 offset) 579 struct btrfs_path *path, u64 offset)
248{ 580{
249 struct btrfs_free_space_header *header; 581 struct btrfs_free_space_header *header;
250 struct extent_buffer *leaf; 582 struct extent_buffer *leaf;
251 struct page *page; 583 struct io_ctl io_ctl;
252 struct btrfs_key key; 584 struct btrfs_key key;
585 struct btrfs_free_space *e, *n;
253 struct list_head bitmaps; 586 struct list_head bitmaps;
254 u64 num_entries; 587 u64 num_entries;
255 u64 num_bitmaps; 588 u64 num_bitmaps;
256 u64 generation; 589 u64 generation;
257 pgoff_t index = 0; 590 u8 type;
258 int ret = 0; 591 int ret = 0;
259 592
260 INIT_LIST_HEAD(&bitmaps); 593 INIT_LIST_HEAD(&bitmaps);
261 594
262 /* Nothing in the space cache, goodbye */ 595 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) 596 if (!i_size_read(inode))
264 goto out; 597 return 0;
265 598
266 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 599 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
267 key.offset = offset; 600 key.offset = offset;
@@ -269,11 +602,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
269 602
270 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 603 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
271 if (ret < 0) 604 if (ret < 0)
272 goto out; 605 return 0;
273 else if (ret > 0) { 606 else if (ret > 0) {
274 btrfs_release_path(path); 607 btrfs_release_path(path);
275 ret = 0; 608 return 0;
276 goto out;
277 } 609 }
278 610
279 ret = -1; 611 ret = -1;
@@ -291,169 +623,100 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
291 " not match free space cache generation (%llu)\n", 623 " not match free space cache generation (%llu)\n",
292 (unsigned long long)BTRFS_I(inode)->generation, 624 (unsigned long long)BTRFS_I(inode)->generation,
293 (unsigned long long)generation); 625 (unsigned long long)generation);
294 goto out; 626 return 0;
295 } 627 }
296 628
297 if (!num_entries) 629 if (!num_entries)
298 goto out; 630 return 0;
299 631
 632 ret = io_ctl_init(&io_ctl, inode, root); if (ret) return ret;
300 ret = readahead_cache(inode); 633 ret = readahead_cache(inode);
301 if (ret) 634 if (ret)
302 goto out; 635 goto out;
303 636
304 while (1) { 637 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
305 struct btrfs_free_space_entry *entry; 638 if (ret)
306 struct btrfs_free_space *e; 639 goto out;
307 void *addr;
308 unsigned long offset = 0;
309 int need_loop = 0;
310 640
311 if (!num_entries && !num_bitmaps) 641 ret = io_ctl_check_crc(&io_ctl, 0);
312 break; 642 if (ret)
643 goto free_cache;
313 644
314 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 645 ret = io_ctl_check_generation(&io_ctl, generation);
315 if (!page) 646 if (ret)
647 goto free_cache;
648
649 while (num_entries) {
650 e = kmem_cache_zalloc(btrfs_free_space_cachep,
651 GFP_NOFS);
652 if (!e)
316 goto free_cache; 653 goto free_cache;
317 654
318 if (!PageUptodate(page)) { 655 ret = io_ctl_read_entry(&io_ctl, e, &type);
319 btrfs_readpage(NULL, page); 656 if (ret) {
320 lock_page(page); 657 kmem_cache_free(btrfs_free_space_cachep, e);
321 if (!PageUptodate(page)) { 658 goto free_cache;
322 unlock_page(page);
323 page_cache_release(page);
324 printk(KERN_ERR "btrfs: error reading free "
325 "space cache\n");
326 goto free_cache;
327 }
328 } 659 }
329 addr = kmap(page);
330 660
331 if (index == 0) { 661 if (!e->bytes) {
332 u64 *gen; 662 kmem_cache_free(btrfs_free_space_cachep, e);
663 goto free_cache;
664 }
333 665
334 /* 666 if (type == BTRFS_FREE_SPACE_EXTENT) {
335 * We put a bogus crc in the front of the first page in 667 spin_lock(&ctl->tree_lock);
336 * case old kernels try to mount a fs with the new 668 ret = link_free_space(ctl, e);
337 * format to make sure they discard the cache. 669 spin_unlock(&ctl->tree_lock);
338 */ 670 if (ret) {
339 addr += sizeof(u64); 671 printk(KERN_ERR "Duplicate entries in "
340 offset += sizeof(u64); 672 "free space cache, dumping\n");
341 673 kmem_cache_free(btrfs_free_space_cachep, e);
342 gen = addr;
343 if (*gen != BTRFS_I(inode)->generation) {
344 printk(KERN_ERR "btrfs: space cache generation"
345 " (%llu) does not match inode (%llu)\n",
346 (unsigned long long)*gen,
347 (unsigned long long)
348 BTRFS_I(inode)->generation);
349 kunmap(page);
350 unlock_page(page);
351 page_cache_release(page);
352 goto free_cache; 674 goto free_cache;
353 } 675 }
354 addr += sizeof(u64); 676 } else {
355 offset += sizeof(u64); 677 BUG_ON(!num_bitmaps);
356 } 678 num_bitmaps--;
357 entry = addr; 679 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
358 680 if (!e->bitmap) {
359 while (1) { 681 kmem_cache_free(
360 if (!num_entries) 682 btrfs_free_space_cachep, e);
361 break;
362
363 need_loop = 1;
364 e = kmem_cache_zalloc(btrfs_free_space_cachep,
365 GFP_NOFS);
366 if (!e) {
367 kunmap(page);
368 unlock_page(page);
369 page_cache_release(page);
370 goto free_cache; 683 goto free_cache;
371 } 684 }
372 685 spin_lock(&ctl->tree_lock);
373 e->offset = le64_to_cpu(entry->offset); 686 ret = link_free_space(ctl, e);
374 e->bytes = le64_to_cpu(entry->bytes); 687 ctl->total_bitmaps++;
375 if (!e->bytes) { 688 ctl->op->recalc_thresholds(ctl);
376 kunmap(page); 689 spin_unlock(&ctl->tree_lock);
690 if (ret) {
691 printk(KERN_ERR "Duplicate entries in "
692 "free space cache, dumping\n");
377 kmem_cache_free(btrfs_free_space_cachep, e); 693 kmem_cache_free(btrfs_free_space_cachep, e);
378 unlock_page(page);
379 page_cache_release(page);
380 goto free_cache; 694 goto free_cache;
381 } 695 }
382 696 list_add_tail(&e->list, &bitmaps);
383 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
384 spin_lock(&ctl->tree_lock);
385 ret = link_free_space(ctl, e);
386 spin_unlock(&ctl->tree_lock);
387 if (ret) {
388 printk(KERN_ERR "Duplicate entries in "
389 "free space cache, dumping\n");
390 kunmap(page);
391 unlock_page(page);
392 page_cache_release(page);
393 goto free_cache;
394 }
395 } else {
396 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
397 if (!e->bitmap) {
398 kunmap(page);
399 kmem_cache_free(
400 btrfs_free_space_cachep, e);
401 unlock_page(page);
402 page_cache_release(page);
403 goto free_cache;
404 }
405 spin_lock(&ctl->tree_lock);
406 ret = link_free_space(ctl, e);
407 ctl->total_bitmaps++;
408 ctl->op->recalc_thresholds(ctl);
409 spin_unlock(&ctl->tree_lock);
410 if (ret) {
411 printk(KERN_ERR "Duplicate entries in "
412 "free space cache, dumping\n");
413 kunmap(page);
414 unlock_page(page);
415 page_cache_release(page);
416 goto free_cache;
417 }
418 list_add_tail(&e->list, &bitmaps);
419 }
420
421 num_entries--;
422 offset += sizeof(struct btrfs_free_space_entry);
423 if (offset + sizeof(struct btrfs_free_space_entry) >=
424 PAGE_CACHE_SIZE)
425 break;
426 entry++;
427 } 697 }
428 698
429 /* 699 num_entries--;
430 * We read an entry out of this page, we need to move on to the 700 }
431 * next page.
432 */
433 if (need_loop) {
434 kunmap(page);
435 goto next;
436 }
437 701
438 /* 702 /*
439 * The bitmaps were written out after all the extent entries, 703 * The bitmaps were written out after all the extent entries,
440 * so read their payloads back last, in the queued order. 704 * so read their payloads back last, in the queued order.
441 */ 705 */
442 e = list_entry(bitmaps.next, struct btrfs_free_space, list); 706 list_for_each_entry_safe(e, n, &bitmaps, list) {
443 list_del_init(&e->list); 707 list_del_init(&e->list);
444 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 708 ret = io_ctl_read_bitmap(&io_ctl, e);
445 kunmap(page); 709 if (ret)
446 num_bitmaps--; 710 goto free_cache;
447next:
448 unlock_page(page);
449 page_cache_release(page);
450 index++;
451 } 711 }
452 712
713 io_ctl_drop_pages(&io_ctl);
453 ret = 1; 714 ret = 1;
454out: 715out:
716 io_ctl_free(&io_ctl);
455 return ret; 717 return ret;
456free_cache: 718free_cache:
719 io_ctl_drop_pages(&io_ctl);
457 __btrfs_remove_free_space_cache(ctl); 720 __btrfs_remove_free_space_cache(ctl);
458 goto out; 721 goto out;
459} 722}
@@ -465,7 +728,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
465 struct btrfs_root *root = fs_info->tree_root; 728 struct btrfs_root *root = fs_info->tree_root;
466 struct inode *inode; 729 struct inode *inode;
467 struct btrfs_path *path; 730 struct btrfs_path *path;
468 int ret; 731 int ret = 0;
469 bool matched; 732 bool matched;
470 u64 used = btrfs_block_group_used(&block_group->item); 733 u64 used = btrfs_block_group_used(&block_group->item);
471 734
@@ -497,6 +760,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
497 return 0; 760 return 0;
498 } 761 }
499 762
763 /* We may have converted the inode and made the cache invalid. */
764 spin_lock(&block_group->lock);
765 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
766 spin_unlock(&block_group->lock);
767 goto out;
768 }
769 spin_unlock(&block_group->lock);
770
500 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 771 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
501 path, block_group->key.objectid); 772 path, block_group->key.objectid);
502 btrfs_free_path(path); 773 btrfs_free_path(path);
@@ -530,6 +801,19 @@ out:
530 return ret; 801 return ret;
531} 802}
532 803
804/**
805 * __btrfs_write_out_cache - write out cached info to an inode
806 * @root: the root the inode belongs to
807 * @ctl: the free space cache we are going to write out
808 * @block_group: the block_group for this cache if it belongs to a block_group
809 * @trans: the trans handle
810 * @path: the path to use
811 * @offset: the offset for the key we'll insert
812 *
813 * This function writes out a free space cache struct to disk for quick recovery
814 * on mount. This will return 0 if it was successful in writing the cache out,
815 * and -1 if it was not.
816 */
533int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 817int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
534 struct btrfs_free_space_ctl *ctl, 818 struct btrfs_free_space_ctl *ctl,
535 struct btrfs_block_group_cache *block_group, 819 struct btrfs_block_group_cache *block_group,
@@ -540,42 +824,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
540 struct extent_buffer *leaf; 824 struct extent_buffer *leaf;
541 struct rb_node *node; 825 struct rb_node *node;
542 struct list_head *pos, *n; 826 struct list_head *pos, *n;
543 struct page **pages;
544 struct page *page;
545 struct extent_state *cached_state = NULL; 827 struct extent_state *cached_state = NULL;
546 struct btrfs_free_cluster *cluster = NULL; 828 struct btrfs_free_cluster *cluster = NULL;
547 struct extent_io_tree *unpin = NULL; 829 struct extent_io_tree *unpin = NULL;
830 struct io_ctl io_ctl;
548 struct list_head bitmap_list; 831 struct list_head bitmap_list;
549 struct btrfs_key key; 832 struct btrfs_key key;
550 u64 start, end, len; 833 u64 start, end, len;
551 u64 bytes = 0;
552 u32 crc = ~(u32)0;
553 int index = 0, num_pages = 0;
554 int entries = 0; 834 int entries = 0;
555 int bitmaps = 0; 835 int bitmaps = 0;
556 int ret = -1; 836 int ret;
557 bool next_page = false; 837 int err = -1;
558 bool out_of_space = false;
559 838
560 INIT_LIST_HEAD(&bitmap_list); 839 INIT_LIST_HEAD(&bitmap_list);
561 840
562 node = rb_first(&ctl->free_space_offset);
563 if (!node)
564 return 0;
565
566 if (!i_size_read(inode)) 841 if (!i_size_read(inode))
567 return -1; 842 return -1;
568 843
569 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 844 io_ctl_init(&io_ctl, inode, root);
570 PAGE_CACHE_SHIFT;
571
572 filemap_write_and_wait(inode->i_mapping);
573 btrfs_wait_ordered_range(inode, inode->i_size &
574 ~(root->sectorsize - 1), (u64)-1);
575
576 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
577 if (!pages)
578 return -1;
579 845
580 /* Get the cluster for this block_group if it exists */ 846 /* Get the cluster for this block_group if it exists */
581 if (block_group && !list_empty(&block_group->cluster_list)) 847 if (block_group && !list_empty(&block_group->cluster_list))
@@ -589,30 +855,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
589 */ 855 */
590 unpin = root->fs_info->pinned_extents; 856 unpin = root->fs_info->pinned_extents;
591 857
592 /* 858 /* Lock all pages first so we can lock the extent safely. */
593 * Lock all pages first so we can lock the extent safely. 859 io_ctl_prepare_pages(&io_ctl, inode, 0);
594 *
595 * NOTE: Because we hold the ref the entire time we're going to write to
596 * the page find_get_page should never fail, so we don't do a check
597 * after find_get_page at this point. Just putting this here so people
598 * know and don't freak out.
599 */
600 while (index < num_pages) {
601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
602 if (!page) {
603 int i;
604
605 for (i = 0; i < num_pages; i++) {
606 unlock_page(pages[i]);
607 page_cache_release(pages[i]);
608 }
609 goto out;
610 }
611 pages[index] = page;
612 index++;
613 }
614 860
615 index = 0;
616 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 861 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
617 0, &cached_state, GFP_NOFS); 862 0, &cached_state, GFP_NOFS);
618 863
@@ -623,189 +868,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
623 if (block_group) 868 if (block_group)
624 start = block_group->key.objectid; 869 start = block_group->key.objectid;
625 870
626 /* Write out the extent entries */ 871 node = rb_first(&ctl->free_space_offset);
627 do { 872 if (!node && cluster) {
628 struct btrfs_free_space_entry *entry; 873 node = rb_first(&cluster->root);
629 void *addr, *orig; 874 cluster = NULL;
630 unsigned long offset = 0; 875 }
631 876
632 next_page = false; 877 /* Make sure we can fit our crcs into the first page */
878 if (io_ctl.check_crcs &&
879 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
880 WARN_ON(1);
881 goto out_nospc;
882 }
633 883
634 if (index >= num_pages) { 884 io_ctl_set_generation(&io_ctl, trans->transid);
635 out_of_space = true;
636 break;
637 }
638 885
639 page = pages[index]; 886 /* Write out the extent entries */
887 while (node) {
888 struct btrfs_free_space *e;
640 889
641 orig = addr = kmap(page); 890 e = rb_entry(node, struct btrfs_free_space, offset_index);
642 if (index == 0) { 891 entries++;
643 u64 *gen;
644 892
645 /* 893 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
646 * We're going to put in a bogus crc for this page to 894 e->bitmap);
647 * make sure that old kernels who aren't aware of this 895 if (ret)
648 * format will be sure to discard the cache. 896 goto out_nospc;
649 */
650 addr += sizeof(u64);
651 offset += sizeof(u64);
652 897
653 gen = addr; 898 if (e->bitmap) {
654 *gen = trans->transid; 899 list_add_tail(&e->list, &bitmap_list);
655 addr += sizeof(u64); 900 bitmaps++;
656 offset += sizeof(u64);
657 } 901 }
658 entry = addr; 902 node = rb_next(node);
659 903 if (!node && cluster) {
660 memset(addr, 0, PAGE_CACHE_SIZE - offset); 904 node = rb_first(&cluster->root);
661 while (node && !next_page) { 905 cluster = NULL;
662 struct btrfs_free_space *e;
663
664 e = rb_entry(node, struct btrfs_free_space, offset_index);
665 entries++;
666
667 entry->offset = cpu_to_le64(e->offset);
668 entry->bytes = cpu_to_le64(e->bytes);
669 if (e->bitmap) {
670 entry->type = BTRFS_FREE_SPACE_BITMAP;
671 list_add_tail(&e->list, &bitmap_list);
672 bitmaps++;
673 } else {
674 entry->type = BTRFS_FREE_SPACE_EXTENT;
675 }
676 node = rb_next(node);
677 if (!node && cluster) {
678 node = rb_first(&cluster->root);
679 cluster = NULL;
680 }
681 offset += sizeof(struct btrfs_free_space_entry);
682 if (offset + sizeof(struct btrfs_free_space_entry) >=
683 PAGE_CACHE_SIZE)
684 next_page = true;
685 entry++;
686 } 906 }
907 }
687 908
688 /* 909 /*
689 * We want to add any pinned extents to our free space cache 910 * We want to add any pinned extents to our free space cache
690 * so we don't leak the space 911 * so we don't leak the space
691 */ 912 */
692 while (block_group && !next_page && 913 while (block_group && (start < block_group->key.objectid +
693 (start < block_group->key.objectid + 914 block_group->key.offset)) {
694 block_group->key.offset)) { 915 ret = find_first_extent_bit(unpin, start, &start, &end,
695 ret = find_first_extent_bit(unpin, start, &start, &end, 916 EXTENT_DIRTY);
696 EXTENT_DIRTY); 917 if (ret) {
697 if (ret) { 918 ret = 0;
698 ret = 0; 919 break;
699 break;
700 }
701
702 /* This pinned extent is out of our range */
703 if (start >= block_group->key.objectid +
704 block_group->key.offset)
705 break;
706
707 len = block_group->key.objectid +
708 block_group->key.offset - start;
709 len = min(len, end + 1 - start);
710
711 entries++;
712 entry->offset = cpu_to_le64(start);
713 entry->bytes = cpu_to_le64(len);
714 entry->type = BTRFS_FREE_SPACE_EXTENT;
715
716 start = end + 1;
717 offset += sizeof(struct btrfs_free_space_entry);
718 if (offset + sizeof(struct btrfs_free_space_entry) >=
719 PAGE_CACHE_SIZE)
720 next_page = true;
721 entry++;
722 } 920 }
723 921
724 /* Generate bogus crc value */ 922 /* This pinned extent is out of our range */
725 if (index == 0) { 923 if (start >= block_group->key.objectid +
726 u32 *tmp; 924 block_group->key.offset)
727 crc = btrfs_csum_data(root, orig + sizeof(u64), crc, 925 break;
728 PAGE_CACHE_SIZE - sizeof(u64));
729 btrfs_csum_final(crc, (char *)&crc);
730 crc++;
731 tmp = orig;
732 *tmp = crc;
733 }
734 926
735 kunmap(page); 927 len = block_group->key.objectid +
928 block_group->key.offset - start;
929 len = min(len, end + 1 - start);
736 930
737 bytes += PAGE_CACHE_SIZE; 931 entries++;
932 ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
933 if (ret)
934 goto out_nospc;
738 935
739 index++; 936 start = end + 1;
740 } while (node || next_page); 937 }
741 938
742 /* Write out the bitmaps */ 939 /* Write out the bitmaps */
743 list_for_each_safe(pos, n, &bitmap_list) { 940 list_for_each_safe(pos, n, &bitmap_list) {
744 void *addr;
745 struct btrfs_free_space *entry = 941 struct btrfs_free_space *entry =
746 list_entry(pos, struct btrfs_free_space, list); 942 list_entry(pos, struct btrfs_free_space, list);
747 943
748 if (index >= num_pages) { 944 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
749 out_of_space = true; 945 if (ret)
750 break; 946 goto out_nospc;
751 }
752 page = pages[index];
753
754 addr = kmap(page);
755 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
756 kunmap(page);
757 bytes += PAGE_CACHE_SIZE;
758
759 list_del_init(&entry->list); 947 list_del_init(&entry->list);
760 index++;
761 }
762
763 if (out_of_space) {
764 btrfs_drop_pages(pages, num_pages);
765 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
766 i_size_read(inode) - 1, &cached_state,
767 GFP_NOFS);
768 ret = 0;
769 goto out;
770 } 948 }
771 949
772 /* Zero out the rest of the pages just to make sure */ 950 /* Zero out the rest of the pages just to make sure */
773 while (index < num_pages) { 951 io_ctl_zero_remaining_pages(&io_ctl);
774 void *addr;
775
776 page = pages[index];
777 addr = kmap(page);
778 memset(addr, 0, PAGE_CACHE_SIZE);
779 kunmap(page);
780 bytes += PAGE_CACHE_SIZE;
781 index++;
782 }
783 952
784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, 953 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
785 bytes, &cached_state); 954 0, i_size_read(inode), &cached_state);
786 btrfs_drop_pages(pages, num_pages); 955 io_ctl_drop_pages(&io_ctl);
787 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 956 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
788 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 957 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
789 958
790 if (ret) { 959 if (ret)
791 ret = 0;
792 goto out; 960 goto out;
793 }
794 961
795 BTRFS_I(inode)->generation = trans->transid;
796 962
797 filemap_write_and_wait(inode->i_mapping); 963 ret = filemap_write_and_wait(inode->i_mapping);
964 if (ret)
965 goto out;
798 966
799 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 967 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
800 key.offset = offset; 968 key.offset = offset;
801 key.type = 0; 969 key.type = 0;
802 970
803 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 971 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
804 if (ret < 0) { 972 if (ret < 0) {
805 ret = -1; 973 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 974 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
807 EXTENT_DIRTY | EXTENT_DELALLOC | 975 GFP_NOFS);
808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
809 goto out; 976 goto out;
810 } 977 }
811 leaf = path->nodes[0]; 978 leaf = path->nodes[0];
@@ -816,15 +983,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
816 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 983 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
817 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 984 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
818 found_key.offset != offset) { 985 found_key.offset != offset) {
819 ret = -1; 986 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
820 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 987 inode->i_size - 1,
821 EXTENT_DIRTY | EXTENT_DELALLOC | 988 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
822 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 989 NULL, GFP_NOFS);
823 GFP_NOFS);
824 btrfs_release_path(path); 990 btrfs_release_path(path);
825 goto out; 991 goto out;
826 } 992 }
827 } 993 }
994
995 BTRFS_I(inode)->generation = trans->transid;
828 header = btrfs_item_ptr(leaf, path->slots[0], 996 header = btrfs_item_ptr(leaf, path->slots[0],
829 struct btrfs_free_space_header); 997 struct btrfs_free_space_header);
830 btrfs_set_free_space_entries(leaf, header, entries); 998 btrfs_set_free_space_entries(leaf, header, entries);
@@ -833,16 +1001,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 btrfs_mark_buffer_dirty(leaf); 1001 btrfs_mark_buffer_dirty(leaf);
834 btrfs_release_path(path); 1002 btrfs_release_path(path);
835 1003
836 ret = 1; 1004 err = 0;
837
838out: 1005out:
839 kfree(pages); 1006 io_ctl_free(&io_ctl);
840 if (ret != 1) { 1007 if (err) {
841 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 1008 invalidate_inode_pages2(inode->i_mapping);
842 BTRFS_I(inode)->generation = 0; 1009 BTRFS_I(inode)->generation = 0;
843 } 1010 }
844 btrfs_update_inode(trans, root, inode); 1011 btrfs_update_inode(trans, root, inode);
845 return ret; 1012 return err;
1013
1014out_nospc:
1015 list_for_each_safe(pos, n, &bitmap_list) {
1016 struct btrfs_free_space *entry =
1017 list_entry(pos, struct btrfs_free_space, list);
1018 list_del_init(&entry->list);
1019 }
1020 io_ctl_drop_pages(&io_ctl);
1021 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1022 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1023 goto out;
846} 1024}
847 1025
848int btrfs_write_out_cache(struct btrfs_root *root, 1026int btrfs_write_out_cache(struct btrfs_root *root,
@@ -869,14 +1047,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
869 1047
870 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1048 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
871 path, block_group->key.objectid); 1049 path, block_group->key.objectid);
872 if (ret < 0) { 1050 if (ret) {
873 spin_lock(&block_group->lock); 1051 spin_lock(&block_group->lock);
874 block_group->disk_cache_state = BTRFS_DC_ERROR; 1052 block_group->disk_cache_state = BTRFS_DC_ERROR;
875 spin_unlock(&block_group->lock); 1053 spin_unlock(&block_group->lock);
876 ret = 0; 1054 ret = 0;
877 1055#ifdef DEBUG
878 printk(KERN_ERR "btrfs: failed to write free space cache " 1056 printk(KERN_ERR "btrfs: failed to write free space cache "
879 "for block group %llu\n", block_group->key.objectid); 1057 "for block group %llu\n", block_group->key.objectid);
1058#endif
880 } 1059 }
881 1060
882 iput(inode); 1061 iput(inode);
@@ -1701,6 +1880,7 @@ again:
1701 ctl->total_bitmaps--; 1880 ctl->total_bitmaps--;
1702 } 1881 }
1703 kmem_cache_free(btrfs_free_space_cachep, info); 1882 kmem_cache_free(btrfs_free_space_cachep, info);
1883 ret = 0;
1704 goto out_lock; 1884 goto out_lock;
1705 } 1885 }
1706 1886
@@ -1708,7 +1888,8 @@ again:
1708 unlink_free_space(ctl, info); 1888 unlink_free_space(ctl, info);
1709 info->offset += bytes; 1889 info->offset += bytes;
1710 info->bytes -= bytes; 1890 info->bytes -= bytes;
1711 link_free_space(ctl, info); 1891 ret = link_free_space(ctl, info);
1892 WARN_ON(ret);
1712 goto out_lock; 1893 goto out_lock;
1713 } 1894 }
1714 1895
@@ -2472,9 +2653,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2472 spin_unlock(&ctl->tree_lock); 2653 spin_unlock(&ctl->tree_lock);
2473 2654
2474 if (bytes >= minlen) { 2655 if (bytes >= minlen) {
2475 int update_ret; 2656 struct btrfs_space_info *space_info;
2476 update_ret = btrfs_update_reserved_bytes(block_group, 2657 int update = 0;
2477 bytes, 1, 1); 2658
2659 space_info = block_group->space_info;
2660 spin_lock(&space_info->lock);
2661 spin_lock(&block_group->lock);
2662 if (!block_group->ro) {
2663 block_group->reserved += bytes;
2664 space_info->bytes_reserved += bytes;
2665 update = 1;
2666 }
2667 spin_unlock(&block_group->lock);
2668 spin_unlock(&space_info->lock);
2478 2669
2479 ret = btrfs_error_discard_extent(fs_info->extent_root, 2670 ret = btrfs_error_discard_extent(fs_info->extent_root,
2480 start, 2671 start,
@@ -2482,9 +2673,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2482 &actually_trimmed); 2673 &actually_trimmed);
2483 2674
2484 btrfs_add_free_space(block_group, start, bytes); 2675 btrfs_add_free_space(block_group, start, bytes);
2485 if (!update_ret) 2676 if (update) {
2486 btrfs_update_reserved_bytes(block_group, 2677 spin_lock(&space_info->lock);
2487 bytes, 0, 1); 2678 spin_lock(&block_group->lock);
2679 if (block_group->ro)
2680 space_info->bytes_readonly += bytes;
2681 block_group->reserved -= bytes;
2682 space_info->bytes_reserved -= bytes;
2683 spin_unlock(&space_info->lock);
2684 spin_unlock(&block_group->lock);
2685 }
2488 2686
2489 if (ret) 2687 if (ret)
2490 break; 2688 break;
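The two hunks above open-code what btrfs_update_reserved_bytes() used to do for trim: bytes are moved into the reserved counters only while the group is writable, and when they come back a group that has gone read-only in the meantime credits them to bytes_readonly instead. A standalone model of that take/return pairing (locking reduced to comments; the field updates and the space_info-outside, block_group-inside lock order follow the hunk):

	struct model_space_info  { long bytes_reserved, bytes_readonly; };
	struct model_block_group {
		struct model_space_info *space_info;
		long reserved;
		int ro;
	};

	/* Take the reservation; returns 1 only if it was actually taken. */
	static int trim_reserve(struct model_block_group *bg, long bytes)
	{
		int taken = 0;

		/* spin_lock(&space_info->lock); spin_lock(&bg->lock); */
		if (!bg->ro) {
			bg->reserved += bytes;
			bg->space_info->bytes_reserved += bytes;
			taken = 1;
		}
		/* unlock both, inner lock first */
		return taken;
	}

	/* Return it; a group that went ro keeps the bytes as readonly. */
	static void trim_unreserve(struct model_block_group *bg, long bytes)
	{
		/* spin_lock(&space_info->lock); spin_lock(&bg->lock); */
		if (bg->ro)
			bg->space_info->bytes_readonly += bytes;
		bg->reserved -= bytes;
		bg->space_info->bytes_reserved -= bytes;
		/* unlock both, inner lock first */
	}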
@@ -2643,9 +2841,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
2643 return 0; 2841 return 0;
2644 2842
2645 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 2843 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2646 if (ret < 0) 2844 if (ret) {
2845 btrfs_delalloc_release_metadata(inode, inode->i_size);
2846#ifdef DEBUG
2647 printk(KERN_ERR "btrfs: failed to write free ino cache " 2847 printk(KERN_ERR "btrfs: failed to write free ino cache "
2648 "for root %llu\n", root->root_key.objectid); 2848 "for root %llu\n", root->root_key.objectid);
2849#endif
2850 }
2649 2851
2650 iput(inode); 2852 iput(inode);
2651 return ret; 2853 return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa871..53dcbdf446cd 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -465,14 +465,16 @@ again:
465 /* Just to make sure we have enough space */ 465 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE; 466 prealloc += 8 * PAGE_CACHE_SIZE;
467 467
468 ret = btrfs_check_data_free_space(inode, prealloc); 468 ret = btrfs_delalloc_reserve_space(inode, prealloc);
469 if (ret) 469 if (ret)
470 goto out_put; 470 goto out_put;
471 471
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint); 473 prealloc, prealloc, &alloc_hint);
474 if (ret) 474 if (ret) {
475 btrfs_delalloc_release_space(inode, prealloc);
475 goto out_put; 476 goto out_put;
477 }
476 btrfs_free_reserved_data_space(inode, prealloc); 478 btrfs_free_reserved_data_space(inode, prealloc);
477 479
478out_put: 480out_put:
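This inode-map.c hunk switches the prealloc to a combined data-plus-metadata reservation and adds the release that was missing on failure. The pairing is: reserve both halves up front; if the prealloc fails, give everything back; if it succeeds, release only the data half, because the metadata half now backs the delalloc that was created. A stub model of the flow (the stubs stand in for btrfs_delalloc_reserve_space, btrfs_delalloc_release_space and btrfs_free_reserved_data_space; none of this is kernel code):

	static int  reserve_data_and_meta(long bytes) { return 0; /* stub */ }
	static void release_data_and_meta(long bytes) { /* stub */ }
	static void release_data_only(long bytes)     { /* stub */ }
	static int  do_prealloc(long bytes)           { return 0; /* stub */ }

	static int prealloc_ino_cache(long prealloc)
	{
		int ret = reserve_data_and_meta(prealloc);

		if (ret)
			return ret;
		ret = do_prealloc(prealloc);
		if (ret) {
			/* failure: nothing was created, undo both halves */
			release_data_and_meta(prealloc);
			return ret;
		}
		/* success: the metadata half stays behind the new delalloc */
		release_data_only(prealloc);
		return 0;
	}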
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9327f45434e8..9d0eaa57d4ee 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -393,7 +393,10 @@ again:
393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
394 WARN_ON(pages); 394 WARN_ON(pages);
395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
396 BUG_ON(!pages); 396 if (!pages) {
397 /* just bail out to the uncompressed code */
398 goto cont;
399 }
397 400
398 if (BTRFS_I(inode)->force_compress) 401 if (BTRFS_I(inode)->force_compress)
399 compress_type = BTRFS_I(inode)->force_compress; 402 compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +427,7 @@ again:
424 will_compress = 1; 427 will_compress = 1;
425 } 428 }
426 } 429 }
430cont:
427 if (start == 0) { 431 if (start == 0) {
428 trans = btrfs_join_transaction(root); 432 trans = btrfs_join_transaction(root);
429 BUG_ON(IS_ERR(trans)); 433 BUG_ON(IS_ERR(trans));
@@ -820,7 +824,7 @@ static noinline int cow_file_range(struct inode *inode,
820 } 824 }
821 825
822 BUG_ON(disk_num_bytes > 826 BUG_ON(disk_num_bytes >
823 btrfs_super_total_bytes(&root->fs_info->super_copy)); 827 btrfs_super_total_bytes(root->fs_info->super_copy));
824 828
825 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 829 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
826 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 830 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1792,12 +1796,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1792 } 1796 }
1793 ret = 0; 1797 ret = 0;
1794out: 1798out:
1795 if (nolock) { 1799 if (root != root->fs_info->tree_root)
1796 if (trans)
1797 btrfs_end_transaction_nolock(trans, root);
1798 } else {
1799 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1800 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1800 if (trans) 1801 if (trans) {
1802 if (nolock)
1803 btrfs_end_transaction_nolock(trans, root);
1804 else
1801 btrfs_end_transaction(trans, root); 1805 btrfs_end_transaction(trans, root);
1802 } 1806 }
1803 1807
@@ -1931,89 +1935,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
1931 up_read(&root->fs_info->cleanup_work_sem); 1935 up_read(&root->fs_info->cleanup_work_sem);
1932} 1936}
1933 1937
1934/*
1935 * calculate extra metadata reservation when snapshotting a subvolume
1936 * contains orphan files.
1937 */
1938void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
1939 struct btrfs_pending_snapshot *pending,
1940 u64 *bytes_to_reserve)
1941{
1942 struct btrfs_root *root;
1943 struct btrfs_block_rsv *block_rsv;
1944 u64 num_bytes;
1945 int index;
1946
1947 root = pending->root;
1948 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
1949 return;
1950
1951 block_rsv = root->orphan_block_rsv;
1952
1953 /* orphan block reservation for the snapshot */
1954 num_bytes = block_rsv->size;
1955
1956 /*
1957 * after the snapshot is created, COWing tree blocks may use more
1958 * space than it frees. So we should make sure there is enough
1959 * reserved space.
1960 */
1961 index = trans->transid & 0x1;
1962 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
1963 num_bytes += block_rsv->size -
1964 (block_rsv->reserved + block_rsv->freed[index]);
1965 }
1966
1967 *bytes_to_reserve += num_bytes;
1968}
1969
1970void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
1971 struct btrfs_pending_snapshot *pending)
1972{
1973 struct btrfs_root *root = pending->root;
1974 struct btrfs_root *snap = pending->snap;
1975 struct btrfs_block_rsv *block_rsv;
1976 u64 num_bytes;
1977 int index;
1978 int ret;
1979
1980 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
1981 return;
1982
1983 /* refill source subvolume's orphan block reservation */
1984 block_rsv = root->orphan_block_rsv;
1985 index = trans->transid & 0x1;
1986 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
1987 num_bytes = block_rsv->size -
1988 (block_rsv->reserved + block_rsv->freed[index]);
1989 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
1990 root->orphan_block_rsv,
1991 num_bytes);
1992 BUG_ON(ret);
1993 }
1994
1995 /* setup orphan block reservation for the snapshot */
1996 block_rsv = btrfs_alloc_block_rsv(snap);
1997 BUG_ON(!block_rsv);
1998
1999 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2000 snap->orphan_block_rsv = block_rsv;
2001
2002 num_bytes = root->orphan_block_rsv->size;
2003 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2004 block_rsv, num_bytes);
2005 BUG_ON(ret);
2006
2007#if 0
2008 /* insert orphan item for the snapshot */
2009 WARN_ON(!root->orphan_item_inserted);
2010 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2011 snap->root_key.objectid);
2012 BUG_ON(ret);
2013 snap->orphan_item_inserted = 1;
2014#endif
2015}
2016
2017enum btrfs_orphan_cleanup_state { 1938enum btrfs_orphan_cleanup_state {
2018 ORPHAN_CLEANUP_STARTED = 1, 1939 ORPHAN_CLEANUP_STARTED = 1,
2019 ORPHAN_CLEANUP_DONE = 2, 1940 ORPHAN_CLEANUP_DONE = 2,
@@ -2099,9 +2020,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2099 } 2020 }
2100 spin_unlock(&root->orphan_lock); 2021 spin_unlock(&root->orphan_lock);
2101 2022
2102 if (block_rsv)
2103 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2104
2105 /* grab metadata reservation from transaction handle */ 2023 /* grab metadata reservation from transaction handle */
2106 if (reserve) { 2024 if (reserve) {
2107 ret = btrfs_orphan_reserve_metadata(trans, inode); 2025 ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2168,6 +2086,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2168 struct btrfs_key key, found_key; 2086 struct btrfs_key key, found_key;
2169 struct btrfs_trans_handle *trans; 2087 struct btrfs_trans_handle *trans;
2170 struct inode *inode; 2088 struct inode *inode;
2089 u64 last_objectid = 0;
2171 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2090 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2172 2091
2173 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2092 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2219,41 +2138,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2219 * crossing root thing. we store the inode number in the 2138 * crossing root thing. we store the inode number in the
2220 * offset of the orphan item. 2139 * offset of the orphan item.
2221 */ 2140 */
2141
2142 if (found_key.offset == last_objectid) {
2143 printk(KERN_ERR "btrfs: Error removing orphan entry, "
2144 "stopping orphan cleanup\n");
2145 ret = -EINVAL;
2146 goto out;
2147 }
2148
2149 last_objectid = found_key.offset;
2150
2222 found_key.objectid = found_key.offset; 2151 found_key.objectid = found_key.offset;
2223 found_key.type = BTRFS_INODE_ITEM_KEY; 2152 found_key.type = BTRFS_INODE_ITEM_KEY;
2224 found_key.offset = 0; 2153 found_key.offset = 0;
2225 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2154 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2226 if (IS_ERR(inode)) { 2155 ret = PTR_RET(inode);
2227 ret = PTR_ERR(inode); 2156 if (ret && ret != -ESTALE)
2228 goto out; 2157 goto out;
2229 }
2230 2158
2231 /* 2159 /*
2232 * add this inode to the orphan list so btrfs_orphan_del does 2160 * Inode is already gone but the orphan item is still there,
2233 * the proper thing when we hit it 2161 * kill the orphan item.
2234 */
2235 spin_lock(&root->orphan_lock);
2236 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2237 spin_unlock(&root->orphan_lock);
2238
2239 /*
2240 * if this is a bad inode, means we actually succeeded in
2241 * removing the inode, but not the orphan record, which means
2242 * we need to manually delete the orphan since iput will just
2243 * do a destroy_inode
2244 */ 2162 */
2245 if (is_bad_inode(inode)) { 2163 if (ret == -ESTALE) {
2246 trans = btrfs_start_transaction(root, 0); 2164 trans = btrfs_start_transaction(root, 1);
2247 if (IS_ERR(trans)) { 2165 if (IS_ERR(trans)) {
2248 ret = PTR_ERR(trans); 2166 ret = PTR_ERR(trans);
2249 goto out; 2167 goto out;
2250 } 2168 }
2251 btrfs_orphan_del(trans, inode); 2169 ret = btrfs_del_orphan_item(trans, root,
2170 found_key.objectid);
2171 BUG_ON(ret);
2252 btrfs_end_transaction(trans, root); 2172 btrfs_end_transaction(trans, root);
2253 iput(inode);
2254 continue; 2173 continue;
2255 } 2174 }
2256 2175
2176 /*
2177 * add this inode to the orphan list so btrfs_orphan_del does
2178 * the proper thing when we hit it
2179 */
2180 spin_lock(&root->orphan_lock);
2181 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2182 spin_unlock(&root->orphan_lock);
2183
2257 /* if we have links, this was a truncate, let's do that */ 2184
2258 if (inode->i_nlink) { 2185 if (inode->i_nlink) {
2259 if (!S_ISREG(inode->i_mode)) { 2186 if (!S_ISREG(inode->i_mode)) {
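The last_objectid guard added in this hunk is what keeps orphan cleanup from looping forever when an orphan item cannot be removed: if the search hands back the same key twice in a row, no progress is being made and the loop errors out instead of spinning. A minimal model of the guard (a hypothetical helper, not the kernel code):

	#include <errno.h>

	/* Returns -EINVAL once the scan key stops advancing. */
	static int orphan_progress(unsigned long long *last_objectid,
				   unsigned long long cur_objectid)
	{
		if (cur_objectid == *last_objectid)
			return -EINVAL;	/* same item seen twice: bail out */
		*last_objectid = cur_objectid;
		return 0;
	}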
@@ -2687,7 +2614,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2687 u64 ino = btrfs_ino(inode); 2614 u64 ino = btrfs_ino(inode);
2688 u64 dir_ino = btrfs_ino(dir); 2615 u64 dir_ino = btrfs_ino(dir);
2689 2616
2690 trans = btrfs_start_transaction(root, 10); 2617 /*
2618 * 1 for the possible orphan item
2619 * 1 for the dir item
2620 * 1 for the dir index
2621 * 1 for the inode ref
2622 * 1 for the inode ref in the tree log
2623 * 2 for the dir entries in the log
2624 * 1 for the inode
2625 */
2626 trans = btrfs_start_transaction(root, 8);
2691 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2627 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2692 return trans; 2628 return trans;
2693 2629
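The replacement comment enumerates exactly what a worst-case unlink may touch, and the counts sum to the units handed to btrfs_start_transaction():

	1 (orphan item) + 1 (dir item) + 1 (dir index) + 1 (inode ref)
	  + 1 (inode ref in the log) + 2 (dir entries in the log)
	  + 1 (inode) = 8

which is why the old blind reservation of 10 items could be tightened to 8.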
@@ -2710,7 +2646,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2710 return ERR_PTR(-ENOMEM); 2646 return ERR_PTR(-ENOMEM);
2711 } 2647 }
2712 2648
2713 trans = btrfs_start_transaction(root, 0); 2649 /* 1 for the orphan item */
2650 trans = btrfs_start_transaction(root, 1);
2714 if (IS_ERR(trans)) { 2651 if (IS_ERR(trans)) {
2715 btrfs_free_path(path); 2652 btrfs_free_path(path);
2716 root->fs_info->enospc_unlink = 0; 2653 root->fs_info->enospc_unlink = 0;
@@ -2815,6 +2752,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2815 err = 0; 2752 err = 0;
2816out: 2753out:
2817 btrfs_free_path(path); 2754 btrfs_free_path(path);
2755 /* Migrate the orphan reservation over */
2756 if (!err)
2757 err = btrfs_block_rsv_migrate(trans->block_rsv,
2758 &root->fs_info->global_block_rsv,
2759 trans->bytes_reserved);
2760
2818 if (err) { 2761 if (err) {
2819 btrfs_end_transaction(trans, root); 2762 btrfs_end_transaction(trans, root);
2820 root->fs_info->enospc_unlink = 0; 2763 root->fs_info->enospc_unlink = 0;
@@ -2829,6 +2772,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2829 struct btrfs_root *root) 2772 struct btrfs_root *root)
2830{ 2773{
2831 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2774 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2775 btrfs_block_rsv_release(root, trans->block_rsv,
2776 trans->bytes_reserved);
2777 trans->block_rsv = &root->fs_info->trans_block_rsv;
2832 BUG_ON(!root->fs_info->enospc_unlink); 2778 BUG_ON(!root->fs_info->enospc_unlink);
2833 root->fs_info->enospc_unlink = 0; 2779 root->fs_info->enospc_unlink = 0;
2834 } 2780 }
@@ -3220,6 +3166,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3220 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3166 pgoff_t index = from >> PAGE_CACHE_SHIFT;
3221 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3167 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3222 struct page *page; 3168 struct page *page;
3169 gfp_t mask = btrfs_alloc_write_mask(mapping);
3223 int ret = 0; 3170 int ret = 0;
3224 u64 page_start; 3171 u64 page_start;
3225 u64 page_end; 3172 u64 page_end;
@@ -3232,7 +3179,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3232 3179
3233 ret = -ENOMEM; 3180 ret = -ENOMEM;
3234again: 3181again:
3235 page = find_or_create_page(mapping, index, GFP_NOFS); 3182 page = find_or_create_page(mapping, index, mask);
3236 if (!page) { 3183 if (!page) {
3237 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3184 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3238 goto out; 3185 goto out;
@@ -3465,6 +3412,8 @@ void btrfs_evict_inode(struct inode *inode)
3465{ 3412{
3466 struct btrfs_trans_handle *trans; 3413 struct btrfs_trans_handle *trans;
3467 struct btrfs_root *root = BTRFS_I(inode)->root; 3414 struct btrfs_root *root = BTRFS_I(inode)->root;
3415 struct btrfs_block_rsv *rsv, *global_rsv;
3416 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3468 unsigned long nr; 3417 unsigned long nr;
3469 int ret; 3418 int ret;
3470 3419
@@ -3492,22 +3441,55 @@ void btrfs_evict_inode(struct inode *inode)
3492 goto no_delete; 3441 goto no_delete;
3493 } 3442 }
3494 3443
3444 rsv = btrfs_alloc_block_rsv(root);
3445 if (!rsv) {
3446 btrfs_orphan_del(NULL, inode);
3447 goto no_delete;
3448 }
3449 rsv->size = min_size;
3450 global_rsv = &root->fs_info->global_block_rsv;
3451
3495 btrfs_i_size_write(inode, 0); 3452 btrfs_i_size_write(inode, 0);
3496 3453
3454 /*
3455 * This is a bit simpler than btrfs_truncate since
3456 *
3457 * 1) We've already reserved our space for our orphan item in the
3458 * unlink.
3459 * 2) We're going to delete the inode item, so we don't need to update
3460 * it at all.
3461 *
3462 * So we just need to reserve some slack space in case we add bytes when
3463 * doing the truncate.
3464 */
3497 while (1) { 3465 while (1) {
3498 trans = btrfs_join_transaction(root); 3466 ret = btrfs_block_rsv_refill(root, rsv, min_size);
3499 BUG_ON(IS_ERR(trans)); 3467
3500 trans->block_rsv = root->orphan_block_rsv; 3468 /*
3469 * Try and steal from the global reserve since we will
3470 * likely not use this space anyway, we want to try as
3471 * hard as possible to get this to work.
3472 */
3473 if (ret)
3474 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3501 3475
3502 ret = btrfs_block_rsv_check(trans, root,
3503 root->orphan_block_rsv, 0, 5);
3504 if (ret) { 3476 if (ret) {
3505 BUG_ON(ret != -EAGAIN); 3477 printk(KERN_WARNING "Could not get space for a "
3506 ret = btrfs_commit_transaction(trans, root); 3478 "delete, will truncate on mount %d\n", ret);
3507 BUG_ON(ret); 3479 btrfs_orphan_del(NULL, inode);
3508 continue; 3480 btrfs_free_block_rsv(root, rsv);
3481 goto no_delete;
3482 }
3483
3484 trans = btrfs_start_transaction(root, 0);
3485 if (IS_ERR(trans)) {
3486 btrfs_orphan_del(NULL, inode);
3487 btrfs_free_block_rsv(root, rsv);
3488 goto no_delete;
3509 } 3489 }
3510 3490
3491 trans->block_rsv = rsv;
3492
3511 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3493 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3512 if (ret != -EAGAIN) 3494 if (ret != -EAGAIN)
3513 break; 3495 break;
@@ -3516,14 +3498,17 @@ void btrfs_evict_inode(struct inode *inode)
3516 btrfs_end_transaction(trans, root); 3498 btrfs_end_transaction(trans, root);
3517 trans = NULL; 3499 trans = NULL;
3518 btrfs_btree_balance_dirty(root, nr); 3500 btrfs_btree_balance_dirty(root, nr);
3519
3520 } 3501 }
3521 3502
3503 btrfs_free_block_rsv(root, rsv);
3504
3522 if (ret == 0) { 3505 if (ret == 0) {
3506 trans->block_rsv = root->orphan_block_rsv;
3523 ret = btrfs_orphan_del(trans, inode); 3507 ret = btrfs_orphan_del(trans, inode);
3524 BUG_ON(ret); 3508 BUG_ON(ret);
3525 } 3509 }
3526 3510
3511 trans->block_rsv = &root->fs_info->trans_block_rsv;
3527 if (!(root == root->fs_info->tree_root || 3512 if (!(root == root->fs_info->tree_root ||
3528 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3513 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3529 btrfs_return_ino(root, btrfs_ino(inode)); 3514 btrfs_return_ino(root, btrfs_ino(inode));
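The eviction loop now follows a refill-or-steal pattern each iteration: top up a private reservation to min_size, fall back to migrating bytes out of the global reserve, and only when both fail give up and leave the orphan item for the next mount to truncate. A stub model of that decision (refill and steal_from_global stand in for btrfs_block_rsv_refill and btrfs_block_rsv_migrate; the stub return values are made up):

	#include <errno.h>

	static int refill(long need)            { return -ENOSPC; /* stub */ }
	static int steal_from_global(long need) { return 0;       /* stub */ }

	static int get_evict_slack(long min_size)
	{
		int ret = refill(min_size);

		if (ret)	/* the normal reserve was short... */
			ret = steal_from_global(min_size);
		return ret;	/* still nonzero: caller drops to no_delete */
	}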
@@ -5647,8 +5632,7 @@ again:
5647 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5632 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5648 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5633 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5649 if (!ret) 5634 if (!ret)
5650 ret = btrfs_update_inode(trans, root, inode); 5635 err = btrfs_update_inode(trans, root, inode);
5651 err = ret;
5652 goto out; 5636 goto out;
5653 } 5637 }
5654 5638
@@ -6393,6 +6377,7 @@ static int btrfs_truncate(struct inode *inode)
6393 struct btrfs_trans_handle *trans; 6377 struct btrfs_trans_handle *trans;
6394 unsigned long nr; 6378 unsigned long nr;
6395 u64 mask = root->sectorsize - 1; 6379 u64 mask = root->sectorsize - 1;
6380 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6396 6381
6397 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6382 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6398 if (ret) 6383 if (ret)
@@ -6440,19 +6425,23 @@ static int btrfs_truncate(struct inode *inode)
6440 rsv = btrfs_alloc_block_rsv(root); 6425 rsv = btrfs_alloc_block_rsv(root);
6441 if (!rsv) 6426 if (!rsv)
6442 return -ENOMEM; 6427 return -ENOMEM;
6443 btrfs_add_durable_block_rsv(root->fs_info, rsv); 6428 rsv->size = min_size;
6444 6429
6430 /*
6431 * 1 for the truncate slack space
6432 * 1 for the orphan item we're going to add
6433 * 1 for the orphan item deletion
6434 * 1 for updating the inode.
6435 */
6445 trans = btrfs_start_transaction(root, 4); 6436 trans = btrfs_start_transaction(root, 4);
6446 if (IS_ERR(trans)) { 6437 if (IS_ERR(trans)) {
6447 err = PTR_ERR(trans); 6438 err = PTR_ERR(trans);
6448 goto out; 6439 goto out;
6449 } 6440 }
6450 6441
6451 /* 6442 /* Migrate the slack space for the truncate to our reserve */
6452 * Reserve space for the truncate process. Truncate should be adding 6443 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6453 * space, but if there are snapshots it may end up using space. 6444 min_size);
6454 */
6455 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6456 BUG_ON(ret); 6445 BUG_ON(ret);
6457 6446
6458 ret = btrfs_orphan_add(trans, inode); 6447 ret = btrfs_orphan_add(trans, inode);
@@ -6461,21 +6450,6 @@ static int btrfs_truncate(struct inode *inode)
6461 goto out; 6450 goto out;
6462 } 6451 }
6463 6452
6464 nr = trans->blocks_used;
6465 btrfs_end_transaction(trans, root);
6466 btrfs_btree_balance_dirty(root, nr);
6467
6468 /*
6469 * Ok so we've already migrated our bytes over for the truncate, so here
6470 * just reserve the one slot we need for updating the inode.
6471 */
6472 trans = btrfs_start_transaction(root, 1);
6473 if (IS_ERR(trans)) {
6474 err = PTR_ERR(trans);
6475 goto out;
6476 }
6477 trans->block_rsv = rsv;
6478
6479 /* 6453 /*
6480 * setattr is responsible for setting the ordered_data_close flag, 6454 * setattr is responsible for setting the ordered_data_close flag,
6481 * but that is only tested during the last file release. That 6455 * but that is only tested during the last file release. That
@@ -6497,20 +6471,30 @@ static int btrfs_truncate(struct inode *inode)
6497 btrfs_add_ordered_operation(trans, root, inode); 6471 btrfs_add_ordered_operation(trans, root, inode);
6498 6472
6499 while (1) { 6473 while (1) {
6474 ret = btrfs_block_rsv_refill(root, rsv, min_size);
6475 if (ret) {
6476 /*
6477 * This can only happen with the original transaction we
6478 * started above, every other time we shouldn't have a
6479 * transaction started yet.
6480 */
6481 if (ret == -EAGAIN)
6482 goto end_trans;
6483 err = ret;
6484 break;
6485 }
6486
6500 if (!trans) { 6487 if (!trans) {
6501 trans = btrfs_start_transaction(root, 3); 6488 /* Just need the 1 for updating the inode */
6489 trans = btrfs_start_transaction(root, 1);
6502 if (IS_ERR(trans)) { 6490 if (IS_ERR(trans)) {
6503 err = PTR_ERR(trans); 6491 err = PTR_ERR(trans);
6504 goto out; 6492 goto out;
6505 } 6493 }
6506
6507 ret = btrfs_truncate_reserve_metadata(trans, root,
6508 rsv);
6509 BUG_ON(ret);
6510
6511 trans->block_rsv = rsv;
6512 } 6494 }
6513 6495
6496 trans->block_rsv = rsv;
6497
6514 ret = btrfs_truncate_inode_items(trans, root, inode, 6498 ret = btrfs_truncate_inode_items(trans, root, inode,
6515 inode->i_size, 6499 inode->i_size,
6516 BTRFS_EXTENT_DATA_KEY); 6500 BTRFS_EXTENT_DATA_KEY);
@@ -6525,7 +6509,7 @@ static int btrfs_truncate(struct inode *inode)
6525 err = ret; 6509 err = ret;
6526 break; 6510 break;
6527 } 6511 }
6528 6512end_trans:
6529 nr = trans->blocks_used; 6513 nr = trans->blocks_used;
6530 btrfs_end_transaction(trans, root); 6514 btrfs_end_transaction(trans, root);
6531 trans = NULL; 6515 trans = NULL;
@@ -6607,9 +6591,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6607 ei->last_sub_trans = 0; 6591 ei->last_sub_trans = 0;
6608 ei->logged_trans = 0; 6592 ei->logged_trans = 0;
6609 ei->delalloc_bytes = 0; 6593 ei->delalloc_bytes = 0;
6610 ei->reserved_bytes = 0;
6611 ei->disk_i_size = 0; 6594 ei->disk_i_size = 0;
6612 ei->flags = 0; 6595 ei->flags = 0;
6596 ei->csum_bytes = 0;
6613 ei->index_cnt = (u64)-1; 6597 ei->index_cnt = (u64)-1;
6614 ei->last_unlink_trans = 0; 6598 ei->last_unlink_trans = 0;
6615 6599
@@ -6655,6 +6639,8 @@ void btrfs_destroy_inode(struct inode *inode)
6655 WARN_ON(inode->i_data.nrpages); 6639 WARN_ON(inode->i_data.nrpages);
6656 WARN_ON(BTRFS_I(inode)->outstanding_extents); 6640 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6657 WARN_ON(BTRFS_I(inode)->reserved_extents); 6641 WARN_ON(BTRFS_I(inode)->reserved_extents);
6642 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
6643 WARN_ON(BTRFS_I(inode)->csum_bytes);
6658 6644
6659 /* 6645 /*
6660 * This can happen where we create an inode, but somebody else also 6646 * This can happen where we create an inode, but somebody else also
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7f57efa76d11..cc9893990341 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -118,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
118/* 118/*
119 * Inherit flags from the parent inode. 119 * Inherit flags from the parent inode.
120 * 120 *
121 * Unlike extN we don't have any flags we don't want to inherit currently. 121 * Currently only the compression flags and the cow flags are inherited.
122 */ 122 */
123void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 123void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
124{ 124{
@@ -129,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
129 129
130 flags = BTRFS_I(dir)->flags; 130 flags = BTRFS_I(dir)->flags;
131 131
132 if (S_ISREG(inode->i_mode)) 132 if (flags & BTRFS_INODE_NOCOMPRESS) {
133 flags &= ~BTRFS_INODE_DIRSYNC; 133 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
134 else if (!S_ISDIR(inode->i_mode)) 134 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
135 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); 135 } else if (flags & BTRFS_INODE_COMPRESS) {
136 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
137 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
138 }
139
140 if (flags & BTRFS_INODE_NODATACOW)
141 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
136 142
137 BTRFS_I(inode)->flags = flags;
138 btrfs_update_iflags(inode); 143 btrfs_update_iflags(inode);
139} 144}
140 145
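The rewritten btrfs_inherit_iflags() narrows inheritance to three flags with a clear precedence: a NOCOMPRESS parent forces NOCOMPRESS on the child, otherwise a COMPRESS parent propagates COMPRESS, and NODATACOW always carries over. A self-contained model of those rules (the flag values here are invented; only the precedence mirrors the hunk):

	#define F_COMPRESS   0x1u
	#define F_NOCOMPRESS 0x2u
	#define F_NODATACOW  0x4u

	static unsigned int inherit_flags(unsigned int child, unsigned int parent)
	{
		if (parent & F_NOCOMPRESS) {		/* NOCOMPRESS wins */
			child &= ~F_COMPRESS;
			child |= F_NOCOMPRESS;
		} else if (parent & F_COMPRESS) {
			child &= ~F_NOCOMPRESS;
			child |= F_COMPRESS;
		}
		if (parent & F_NODATACOW)		/* always inherited */
			child |= F_NODATACOW;
		return child;
	}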
@@ -278,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
278 struct fstrim_range range; 283 struct fstrim_range range;
279 u64 minlen = ULLONG_MAX; 284 u64 minlen = ULLONG_MAX;
280 u64 num_devices = 0; 285 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
281 int ret; 287 int ret;
282 288
283 if (!capable(CAP_SYS_ADMIN)) 289 if (!capable(CAP_SYS_ADMIN))
@@ -296,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
296 } 302 }
297 } 303 }
298 rcu_read_unlock(); 304 rcu_read_unlock();
305
299 if (!num_devices) 306 if (!num_devices)
300 return -EOPNOTSUPP; 307 return -EOPNOTSUPP;
301
302 if (copy_from_user(&range, arg, sizeof(range))) 308 if (copy_from_user(&range, arg, sizeof(range)))
303 return -EFAULT; 309 return -EFAULT;
310 if (range.start > total_bytes)
311 return -EINVAL;
304 312
313 range.len = min(range.len, total_bytes - range.start);
305 range.minlen = max(range.minlen, minlen); 314 range.minlen = max(range.minlen, minlen);
306 ret = btrfs_trim_fs(root, &range); 315 ret = btrfs_trim_fs(root, &range);
307 if (ret < 0) 316 if (ret < 0)
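With the new clamp, a FITRIM request is validated against the filesystem size before being passed down. A worked example (sizes invented): on a 100 GiB filesystem, start = 120 GiB fails outright with -EINVAL, while start = 90 GiB with len = 20 GiB is accepted but truncated to

	range.len = min(20 GiB, 100 GiB - 90 GiB) = 10 GiB

so btrfs_trim_fs() never sees a range that runs past the end of the filesystem.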
@@ -761,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
761 int ret = 1; 770 int ret = 1;
762 771
763 /* 772 /*
764 * make sure that once we start defragging and extent, we keep on 773 * make sure that once we start defragging an extent, we keep on
765 * defragging it 774 * defragging it
766 */ 775 */
767 if (start < *defrag_end) 776 if (start < *defrag_end)
@@ -806,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
806 * extent will force at least part of that big extent to be defragged. 815 * extent will force at least part of that big extent to be defragged.
807 */ 816 */
808 if (ret) { 817 if (ret) {
809 *last_len += len;
810 *defrag_end = extent_map_end(em); 818 *defrag_end = extent_map_end(em);
811 } else { 819 } else {
812 *last_len = 0; 820 *last_len = 0;
@@ -844,6 +852,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
844 int i_done; 852 int i_done;
845 struct btrfs_ordered_extent *ordered; 853 struct btrfs_ordered_extent *ordered;
846 struct extent_state *cached_state = NULL; 854 struct extent_state *cached_state = NULL;
855 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
847 856
848 if (isize == 0) 857 if (isize == 0)
849 return 0; 858 return 0;
@@ -861,7 +870,7 @@ again:
861 for (i = 0; i < num_pages; i++) { 870 for (i = 0; i < num_pages; i++) {
862 struct page *page; 871 struct page *page;
863 page = find_or_create_page(inode->i_mapping, 872 page = find_or_create_page(inode->i_mapping,
864 start_index + i, GFP_NOFS); 873 start_index + i, mask);
865 if (!page) 874 if (!page)
866 break; 875 break;
867 876
@@ -973,18 +982,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
973 struct btrfs_super_block *disk_super; 982 struct btrfs_super_block *disk_super;
974 struct file_ra_state *ra = NULL; 983 struct file_ra_state *ra = NULL;
975 unsigned long last_index; 984 unsigned long last_index;
985 u64 isize = i_size_read(inode);
976 u64 features; 986 u64 features;
977 u64 last_len = 0; 987 u64 last_len = 0;
978 u64 skip = 0; 988 u64 skip = 0;
979 u64 defrag_end = 0; 989 u64 defrag_end = 0;
980 u64 newer_off = range->start; 990 u64 newer_off = range->start;
981 int newer_left = 0;
982 unsigned long i; 991 unsigned long i;
992 unsigned long ra_index = 0;
983 int ret; 993 int ret;
984 int defrag_count = 0; 994 int defrag_count = 0;
985 int compress_type = BTRFS_COMPRESS_ZLIB; 995 int compress_type = BTRFS_COMPRESS_ZLIB;
986 int extent_thresh = range->extent_thresh; 996 int extent_thresh = range->extent_thresh;
987 int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 997 int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
998 int cluster = max_cluster;
988 u64 new_align = ~((u64)128 * 1024 - 1); 999 u64 new_align = ~((u64)128 * 1024 - 1);
989 struct page **pages = NULL; 1000 struct page **pages = NULL;
990 1001
@@ -998,7 +1009,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
998 compress_type = range->compress_type; 1009 compress_type = range->compress_type;
999 } 1010 }
1000 1011
1001 if (inode->i_size == 0) 1012 if (isize == 0)
1002 return 0; 1013 return 0;
1003 1014
1004 /* 1015 /*
@@ -1014,7 +1025,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1014 ra = &file->f_ra; 1025 ra = &file->f_ra;
1015 } 1026 }
1016 1027
1017 pages = kmalloc(sizeof(struct page *) * newer_cluster, 1028 pages = kmalloc(sizeof(struct page *) * max_cluster,
1018 GFP_NOFS); 1029 GFP_NOFS);
1019 if (!pages) { 1030 if (!pages) {
1020 ret = -ENOMEM; 1031 ret = -ENOMEM;
@@ -1023,10 +1034,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1023 1034
1024 /* find the last page to defrag */ 1035 /* find the last page to defrag */
1025 if (range->start + range->len > range->start) { 1036 if (range->start + range->len > range->start) {
1026 last_index = min_t(u64, inode->i_size - 1, 1037 last_index = min_t(u64, isize - 1,
1027 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; 1038 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
1028 } else { 1039 } else {
1029 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1040 last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1030 } 1041 }
1031 1042
1032 if (newer_than) { 1043 if (newer_than) {
@@ -1039,16 +1050,24 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1039 * the extents in the file evenly spaced 1050 * the extents in the file evenly spaced
1040 */ 1051 */
1041 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1052 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1042 newer_left = newer_cluster;
1043 } else 1053 } else
1044 goto out_ra; 1054 goto out_ra;
1045 } else { 1055 } else {
1046 i = range->start >> PAGE_CACHE_SHIFT; 1056 i = range->start >> PAGE_CACHE_SHIFT;
1047 } 1057 }
1048 if (!max_to_defrag) 1058 if (!max_to_defrag)
1049 max_to_defrag = last_index - 1; 1059 max_to_defrag = last_index;
1060
1061 /*
1062 * make writeback start from i, so the defrag range can be
1063 * written sequentially.
1064 */
1065 if (i < inode->i_mapping->writeback_index)
1066 inode->i_mapping->writeback_index = i;
1050 1067
1051 while (i <= last_index && defrag_count < max_to_defrag) { 1068 while (i <= last_index && defrag_count < max_to_defrag &&
1069 (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
1070 PAGE_CACHE_SHIFT)) {
1052 /* 1071 /*
1053 * make sure we stop running if someone unmounts 1072 * make sure we stop running if someone unmounts
1054 * the FS 1073 * the FS
@@ -1071,18 +1090,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1071 i = max(i + 1, next); 1090 i = max(i + 1, next);
1072 continue; 1091 continue;
1073 } 1092 }
1093
1094 if (!newer_than) {
1095 cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
1096 PAGE_CACHE_SHIFT) - i;
1097 cluster = min(cluster, max_cluster);
1098 } else {
1099 cluster = max_cluster;
1100 }
1101
1074 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1102 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
1075 BTRFS_I(inode)->force_compress = compress_type; 1103 BTRFS_I(inode)->force_compress = compress_type;
1076 1104
1077 btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); 1105 if (i + cluster > ra_index) {
1106 ra_index = max(i, ra_index);
1107 btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
1108 cluster);
1109 ra_index += max_cluster;
1110 }
1078 1111
1079 ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); 1112 ret = cluster_pages_for_defrag(inode, pages, i, cluster);
1080 if (ret < 0) 1113 if (ret < 0)
1081 goto out_ra; 1114 goto out_ra;
1082 1115
1083 defrag_count += ret; 1116 defrag_count += ret;
1084 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1117 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
1085 i += ret;
1086 1118
1087 if (newer_than) { 1119 if (newer_than) {
1088 if (newer_off == (u64)-1) 1120 if (newer_off == (u64)-1)
@@ -1097,12 +1129,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1097 if (!ret) { 1129 if (!ret) {
1098 range->start = newer_off; 1130 range->start = newer_off;
1099 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1131 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1100 newer_left = newer_cluster;
1101 } else { 1132 } else {
1102 break; 1133 break;
1103 } 1134 }
1104 } else { 1135 } else {
1105 i++; 1136 if (ret > 0) {
1137 i += ret;
1138 last_len += ret << PAGE_CACHE_SHIFT;
1139 } else {
1140 i++;
1141 last_len = 0;
1142 }
1106 } 1143 }
1107 } 1144 }
1108 1145
@@ -1128,16 +1165,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1128 mutex_unlock(&inode->i_mutex); 1165 mutex_unlock(&inode->i_mutex);
1129 } 1166 }
1130 1167
1131 disk_super = &root->fs_info->super_copy; 1168 disk_super = root->fs_info->super_copy;
1132 features = btrfs_super_incompat_flags(disk_super); 1169 features = btrfs_super_incompat_flags(disk_super);
1133 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1170 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1134 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1171 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1135 btrfs_set_super_incompat_flags(disk_super, features); 1172 btrfs_set_super_incompat_flags(disk_super, features);
1136 } 1173 }
1137 1174
1138 if (!file) 1175 ret = defrag_count;
1139 kfree(ra);
1140 return defrag_count;
1141 1176
1142out_ra: 1177out_ra:
1143 if (!file) 1178 if (!file)
@@ -2579,7 +2614,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2579 return PTR_ERR(trans); 2614 return PTR_ERR(trans);
2580 } 2615 }
2581 2616
2582 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2617 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
2583 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 2618 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
2584 dir_id, "default", 7, 1); 2619 dir_id, "default", 7, 1);
2585 if (IS_ERR_OR_NULL(di)) { 2620 if (IS_ERR_OR_NULL(di)) {
@@ -2595,7 +2630,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2595 btrfs_mark_buffer_dirty(path->nodes[0]); 2630 btrfs_mark_buffer_dirty(path->nodes[0]);
2596 btrfs_free_path(path); 2631 btrfs_free_path(path);
2597 2632
2598 disk_super = &root->fs_info->super_copy; 2633 disk_super = root->fs_info->super_copy;
2599 features = btrfs_super_incompat_flags(disk_super); 2634 features = btrfs_super_incompat_flags(disk_super);
2600 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { 2635 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2601 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; 2636 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@ -2862,7 +2897,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2862 int i; 2897 int i;
2863 unsigned long rel_ptr; 2898 unsigned long rel_ptr;
2864 int size; 2899 int size;
2865 struct btrfs_ioctl_ino_path_args *ipa; 2900 struct btrfs_ioctl_ino_path_args *ipa = NULL;
2866 struct inode_fs_paths *ipath = NULL; 2901 struct inode_fs_paths *ipath = NULL;
2867 struct btrfs_path *path; 2902 struct btrfs_path *path;
2868 2903
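
[Editor's note on the defrag hunks above: a worked example of the new clustering, with illustrative numbers assuming 4 KiB pages; max_cluster's value is not shown in this excerpt and is only assumed to be large enough here.]

	/*
	 * Without -o newer_than, the readahead/defrag window is clipped to
	 * the extent being defragged: e.g. with i = 100 and
	 * defrag_end = 460800 bytes, PAGE_CACHE_ALIGN(defrag_end) >>
	 * PAGE_CACHE_SHIFT = 113, so cluster = min(113 - 100, max_cluster)
	 * = 13 pages. Readahead is also re-issued only once i catches up
	 * with ra_index, rather than on every loop iteration as before.
	 */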
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e9..f38e452486b8 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
159{ 159{
160 int i; 160 int i;
161 u32 type; 161 u32 type, nr;
162 u32 nr = btrfs_header_nritems(l);
163 struct btrfs_item *item; 162 struct btrfs_item *item;
164 struct btrfs_root_item *ri; 163 struct btrfs_root_item *ri;
165 struct btrfs_dir_item *di; 164 struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
172 struct btrfs_key key; 171 struct btrfs_key key;
173 struct btrfs_key found_key; 172 struct btrfs_key found_key;
174 173
174 if (!l)
175 return;
176
177 nr = btrfs_header_nritems(l);
178
175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 179 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
176 (unsigned long long)btrfs_header_bytenr(l), nr, 180 (unsigned long long)btrfs_header_bytenr(l), nr,
177 btrfs_leaf_free_space(root, l)); 181 btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 000000000000..cd857119ba8a
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,949 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include "ctree.h"
27#include "volumes.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31#undef DEBUG
32
33/*
34 * This is the implementation for the generic read ahead framework.
35 *
36 * To trigger a readahead, btrfs_reada_add must be called. It will start
37 * a read ahead for the given range [start, end) on tree root. The returned
38 * handle can either be used to wait on the readahead to finish
39 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
40 *
41 * The read ahead works as follows:
42 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
43 * reada_start_machine will then search for extents to prefetch and trigger
44 * some reads. When a read finishes for a node, all contained node/leaf
45 * pointers that lie in the given range will also be enqueued. The reads will
46 * be triggered in sequential order, thus giving a big win over a naive
47 * enumeration. It will also make use of multi-device layouts. Each disk
 48 * will have its own read pointer and all disks will be utilized in parallel.
 49 * Also, no two disks will read both sides of a mirror simultaneously, as this
50 * would waste seeking capacity. Instead both disks will read different parts
51 * of the filesystem.
52 * Any number of readaheads can be started in parallel. The read order will be
53 * determined globally, i.e. 2 parallel readaheads will normally finish faster
54 * than the 2 started one after another.
55 */
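
[Editor's sketch, not part of the patch: how a caller might drive this interface, based on the btrfs_reada_add/btrfs_reada_wait/btrfs_reada_detach signatures defined later in this file; the whole-tree key range and the caller context are illustrative assumptions.]

	static void reada_example(struct btrfs_root *root)
	{
		struct reada_control *rc;
		struct btrfs_key key_start = { .objectid = 0, .type = 0, .offset = 0 };
		struct btrfs_key key_end = {
			.objectid = (u64)-1, .type = (u8)-1, .offset = (u64)-1
		};

		/* enqueue the root node; readahead fans out from there */
		rc = btrfs_reada_add(root, &key_start, &key_end);
		if (IS_ERR(rc))
			return;

		/* either block until the prefetch has finished ... */
		btrfs_reada_wait(rc);
		/*
		 * ... or hand it to the background instead:
		 *	btrfs_reada_detach(rc);
		 * each of the two drops the caller's reference, so call
		 * exactly one of them per handle
		 */
	}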
56
57#define MAX_MIRRORS 2
58#define MAX_IN_FLIGHT 6
59
60struct reada_extctl {
61 struct list_head list;
62 struct reada_control *rc;
63 u64 generation;
64};
65
66struct reada_extent {
67 u64 logical;
68 struct btrfs_key top;
69 u32 blocksize;
70 int err;
71 struct list_head extctl;
72 struct kref refcnt;
73 spinlock_t lock;
74 struct reada_zone *zones[MAX_MIRRORS];
75 int nzones;
76 struct btrfs_device *scheduled_for;
77};
78
79struct reada_zone {
80 u64 start;
81 u64 end;
82 u64 elems;
83 struct list_head list;
84 spinlock_t lock;
85 int locked;
86 struct btrfs_device *device;
87 struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
88 int ndevs;
89 struct kref refcnt;
90};
91
92struct reada_machine_work {
93 struct btrfs_work work;
94 struct btrfs_fs_info *fs_info;
95};
96
97static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
98static void reada_control_release(struct kref *kref);
99static void reada_zone_release(struct kref *kref);
100static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102
103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation);
105
106/* recurses */
107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
109 u64 start, int err)
110{
111 int level = 0;
112 int nritems;
113 int i;
114 u64 bytenr;
115 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121
122 if (eb)
123 level = btrfs_header_level(eb);
124
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 kref_get(&re->refcnt);
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock);
136 /*
137 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore
139 */
140 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock);
144
145 if (err == 0) {
146 nritems = level ? btrfs_header_nritems(eb) : 0;
147 generation = btrfs_header_generation(eb);
148 /*
149 * FIXME: currently we just set nritems to 0 if this is a leaf,
 150	 * effectively ignoring the content. In a later step we could
 151	 * trigger more readahead depending on the content, e.g.
152 * fetch the checksums for the extents in the leaf.
153 */
154 } else {
155 /*
156 * this is the error case, the extent buffer has not been
157 * read correctly. We won't access anything from it and
 158	 * just clean up our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164
165 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec;
167 u64 n_gen;
168 struct btrfs_key key;
169 struct btrfs_key next_key;
170
171 btrfs_node_key_to_cpu(eb, &key, i);
172 if (i + 1 < nritems)
173 btrfs_node_key_to_cpu(eb, &next_key, i + 1);
174 else
175 next_key = re->top;
176 bytenr = btrfs_node_blockptr(eb, i);
177 n_gen = btrfs_node_ptr_generation(eb, i);
178
179 list_for_each_entry(rec, &list, list) {
180 struct reada_control *rc = rec->rc;
181
182 /*
183 * if the generation doesn't match, just ignore this
184 * extctl. This will probably cut off a branch from
185 * prefetch. Alternatively one could start a new (sub-)
186 * prefetch for this branch, starting again from root.
187 * FIXME: move the generation check out of this loop
188 */
189#ifdef DEBUG
190 if (rec->generation != generation) {
191 printk(KERN_DEBUG "generation mismatch for "
192 "(%llu,%d,%llu) %llu != %llu\n",
193 key.objectid, key.type, key.offset,
194 rec->generation, generation);
195 }
196#endif
197 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key,
201 level - 1, n_gen);
202 }
203 }
204 /*
205 * free extctl records
206 */
207 while (!list_empty(&list)) {
208 struct reada_control *rc;
209 struct reada_extctl *rec;
210
211 rec = list_first_entry(&list, struct reada_extctl, list);
212 list_del(&rec->list);
213 rc = rec->rc;
214 kfree(rec);
215
216 kref_get(&rc->refcnt);
217 if (atomic_dec_and_test(&rc->elems)) {
218 kref_put(&rc->refcnt, reada_control_release);
219 wake_up(&rc->wait);
220 }
221 kref_put(&rc->refcnt, reada_control_release);
222
223 reada_extent_put(fs_info, re); /* one ref for each entry */
224 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228
229 return 0;
230}
231
232/*
 233 * start is passed separately in case eb is NULL, which may be the case with
234 * failed I/O
235 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
237 u64 start, int err)
238{
239 int ret;
240
241 ret = __readahead_hook(root, eb, start, err);
242
243 reada_start_machine(root->fs_info);
244
245 return ret;
246}
247
248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
249 struct btrfs_device *dev, u64 logical,
250 struct btrfs_bio *multi)
251{
252 int ret;
253 int looped = 0;
254 struct reada_zone *zone;
255 struct btrfs_block_group_cache *cache = NULL;
256 u64 start;
257 u64 end;
258 int i;
259
260again:
261 zone = NULL;
262 spin_lock(&fs_info->reada_lock);
263 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
264 logical >> PAGE_CACHE_SHIFT, 1);
265 if (ret == 1)
266 kref_get(&zone->refcnt);
267 spin_unlock(&fs_info->reada_lock);
268
269 if (ret == 1) {
270 if (logical >= zone->start && logical < zone->end)
271 return zone;
272 spin_lock(&fs_info->reada_lock);
273 kref_put(&zone->refcnt, reada_zone_release);
274 spin_unlock(&fs_info->reada_lock);
275 }
276
277 if (looped)
278 return NULL;
279
280 cache = btrfs_lookup_block_group(fs_info, logical);
281 if (!cache)
282 return NULL;
283
284 start = cache->key.objectid;
285 end = start + cache->key.offset - 1;
286 btrfs_put_block_group(cache);
287
288 zone = kzalloc(sizeof(*zone), GFP_NOFS);
289 if (!zone)
290 return NULL;
291
292 zone->start = start;
293 zone->end = end;
294 INIT_LIST_HEAD(&zone->list);
295 spin_lock_init(&zone->lock);
296 zone->locked = 0;
297 kref_init(&zone->refcnt);
298 zone->elems = 0;
299 zone->device = dev; /* our device always sits at index 0 */
300 for (i = 0; i < multi->num_stripes; ++i) {
301 /* bounds have already been checked */
302 zone->devs[i] = multi->stripes[i].dev;
303 }
304 zone->ndevs = multi->num_stripes;
305
306 spin_lock(&fs_info->reada_lock);
307 ret = radix_tree_insert(&dev->reada_zones,
308 (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
309 zone);
310 spin_unlock(&fs_info->reada_lock);
311
312 if (ret) {
313 kfree(zone);
314 looped = 1;
315 goto again;
316 }
317
318 return zone;
319}
320
321static struct reada_extent *reada_find_extent(struct btrfs_root *root,
322 u64 logical,
323 struct btrfs_key *top, int level)
324{
325 int ret;
326 int looped = 0;
327 struct reada_extent *re = NULL;
328 struct btrfs_fs_info *fs_info = root->fs_info;
329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
330 struct btrfs_bio *multi = NULL;
331 struct btrfs_device *dev;
332 u32 blocksize;
333 u64 length;
334 int nzones = 0;
335 int i;
336 unsigned long index = logical >> PAGE_CACHE_SHIFT;
337
338again:
339 spin_lock(&fs_info->reada_lock);
340 re = radix_tree_lookup(&fs_info->reada_tree, index);
341 if (re)
342 kref_get(&re->refcnt);
343 spin_unlock(&fs_info->reada_lock);
344
345 if (re || looped)
346 return re;
347
348 re = kzalloc(sizeof(*re), GFP_NOFS);
349 if (!re)
350 return NULL;
351
352 blocksize = btrfs_level_size(root, level);
353 re->logical = logical;
354 re->blocksize = blocksize;
355 re->top = *top;
356 INIT_LIST_HEAD(&re->extctl);
357 spin_lock_init(&re->lock);
358 kref_init(&re->refcnt);
359
360 /*
361 * map block
362 */
363 length = blocksize;
364 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &multi, 0);
365 if (ret || !multi || length < blocksize)
366 goto error;
367
368 if (multi->num_stripes > MAX_MIRRORS) {
369 printk(KERN_ERR "btrfs readahead: more than %d copies not "
370 "supported", MAX_MIRRORS);
371 goto error;
372 }
373
374 for (nzones = 0; nzones < multi->num_stripes; ++nzones) {
375 struct reada_zone *zone;
376
377 dev = multi->stripes[nzones].dev;
378 zone = reada_find_zone(fs_info, dev, logical, multi);
379 if (!zone)
380 break;
381
382 re->zones[nzones] = zone;
383 spin_lock(&zone->lock);
384 if (!zone->elems)
385 kref_get(&zone->refcnt);
386 ++zone->elems;
387 spin_unlock(&zone->lock);
388 spin_lock(&fs_info->reada_lock);
389 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock);
391 }
392 re->nzones = nzones;
393 if (nzones == 0) {
 394		/* not a single zone found, error out */
395 goto error;
396 }
397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 spin_lock(&fs_info->reada_lock);
400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
401 if (ret) {
402 spin_unlock(&fs_info->reada_lock);
403 if (ret != -ENOMEM) {
404 /* someone inserted the extent in the meantime */
405 looped = 1;
406 }
407 goto error;
408 }
409 for (i = 0; i < nzones; ++i) {
410 dev = multi->stripes[i].dev;
411 ret = radix_tree_insert(&dev->reada_extents, index, re);
412 if (ret) {
413 while (--i >= 0) {
414 dev = multi->stripes[i].dev;
415 BUG_ON(dev == NULL);
416 radix_tree_delete(&dev->reada_extents, index);
417 }
418 BUG_ON(fs_info == NULL);
419 radix_tree_delete(&fs_info->reada_tree, index);
420 spin_unlock(&fs_info->reada_lock);
421 goto error;
422 }
423 }
424 spin_unlock(&fs_info->reada_lock);
425
426 return re;
427
428error:
429 while (nzones) {
430 struct reada_zone *zone;
431
432 --nzones;
433 zone = re->zones[nzones];
434 kref_get(&zone->refcnt);
435 spin_lock(&zone->lock);
436 --zone->elems;
437 if (zone->elems == 0) {
438 /*
439 * no fs_info->reada_lock needed, as this can't be
440 * the last ref
441 */
442 kref_put(&zone->refcnt, reada_zone_release);
443 }
444 spin_unlock(&zone->lock);
445
446 spin_lock(&fs_info->reada_lock);
447 kref_put(&zone->refcnt, reada_zone_release);
448 spin_unlock(&fs_info->reada_lock);
449 }
450 kfree(re);
451 if (looped)
452 goto again;
453 return NULL;
454}
455
456static void reada_kref_dummy(struct kref *kr)
457{
458}
459
460static void reada_extent_put(struct btrfs_fs_info *fs_info,
461 struct reada_extent *re)
462{
463 int i;
464 unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
465
466 spin_lock(&fs_info->reada_lock);
467 if (!kref_put(&re->refcnt, reada_kref_dummy)) {
468 spin_unlock(&fs_info->reada_lock);
469 return;
470 }
471
472 radix_tree_delete(&fs_info->reada_tree, index);
473 for (i = 0; i < re->nzones; ++i) {
474 struct reada_zone *zone = re->zones[i];
475
476 radix_tree_delete(&zone->device->reada_extents, index);
477 }
478
479 spin_unlock(&fs_info->reada_lock);
480
481 for (i = 0; i < re->nzones; ++i) {
482 struct reada_zone *zone = re->zones[i];
483
484 kref_get(&zone->refcnt);
485 spin_lock(&zone->lock);
486 --zone->elems;
487 if (zone->elems == 0) {
488 /* no fs_info->reada_lock needed, as this can't be
489 * the last ref */
490 kref_put(&zone->refcnt, reada_zone_release);
491 }
492 spin_unlock(&zone->lock);
493
494 spin_lock(&fs_info->reada_lock);
495 kref_put(&zone->refcnt, reada_zone_release);
496 spin_unlock(&fs_info->reada_lock);
497 }
498 if (re->scheduled_for)
499 atomic_dec(&re->scheduled_for->reada_in_flight);
500
501 kfree(re);
502}
503
504static void reada_zone_release(struct kref *kref)
505{
506 struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
507
508 radix_tree_delete(&zone->device->reada_zones,
509 zone->end >> PAGE_CACHE_SHIFT);
510
511 kfree(zone);
512}
513
514static void reada_control_release(struct kref *kref)
515{
516 struct reada_control *rc = container_of(kref, struct reada_control,
517 refcnt);
518
519 kfree(rc);
520}
521
522static int reada_add_block(struct reada_control *rc, u64 logical,
523 struct btrfs_key *top, int level, u64 generation)
524{
525 struct btrfs_root *root = rc->root;
526 struct reada_extent *re;
527 struct reada_extctl *rec;
528
529 re = reada_find_extent(root, logical, top, level); /* takes one ref */
530 if (!re)
531 return -1;
532
533 rec = kzalloc(sizeof(*rec), GFP_NOFS);
534 if (!rec) {
535 reada_extent_put(root->fs_info, re);
536 return -1;
537 }
538
539 rec->rc = rc;
540 rec->generation = generation;
541 atomic_inc(&rc->elems);
542
543 spin_lock(&re->lock);
544 list_add_tail(&rec->list, &re->extctl);
545 spin_unlock(&re->lock);
546
547 /* leave the ref on the extent */
548
549 return 0;
550}
551
552/*
553 * called with fs_info->reada_lock held
554 */
555static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
556{
557 int i;
558 unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
559
560 for (i = 0; i < zone->ndevs; ++i) {
561 struct reada_zone *peer;
562 peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
563 if (peer && peer->device != zone->device)
564 peer->locked = lock;
565 }
566}
567
568/*
569 * called with fs_info->reada_lock held
570 */
571static int reada_pick_zone(struct btrfs_device *dev)
572{
573 struct reada_zone *top_zone = NULL;
574 struct reada_zone *top_locked_zone = NULL;
575 u64 top_elems = 0;
576 u64 top_locked_elems = 0;
577 unsigned long index = 0;
578 int ret;
579
580 if (dev->reada_curr_zone) {
581 reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
582 kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
583 dev->reada_curr_zone = NULL;
584 }
585 /* pick the zone with the most elements */
586 while (1) {
587 struct reada_zone *zone;
588
589 ret = radix_tree_gang_lookup(&dev->reada_zones,
590 (void **)&zone, index, 1);
591 if (ret == 0)
592 break;
593 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
594 if (zone->locked) {
595 if (zone->elems > top_locked_elems) {
596 top_locked_elems = zone->elems;
597 top_locked_zone = zone;
598 }
599 } else {
600 if (zone->elems > top_elems) {
601 top_elems = zone->elems;
602 top_zone = zone;
603 }
604 }
605 }
606 if (top_zone)
607 dev->reada_curr_zone = top_zone;
608 else if (top_locked_zone)
609 dev->reada_curr_zone = top_locked_zone;
610 else
611 return 0;
612
613 dev->reada_next = dev->reada_curr_zone->start;
614 kref_get(&dev->reada_curr_zone->refcnt);
615 reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
616
617 return 1;
618}
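
[Editor's note, not part of the patch: how the locked/unlocked preference above ties back to the mirror comment at the top of the file.]

	/*
	 * A zone is marked "locked" while a peer device is reading it (see
	 * reada_peer_zones_set_lock()). Preferring the fullest unlocked
	 * zone keeps two disks from reading both sides of the same mirror
	 * at once; a locked zone is picked only as a fallback, when every
	 * zone with elements is already being covered by a peer.
	 */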
619
620static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
621 struct btrfs_device *dev)
622{
623 struct reada_extent *re = NULL;
624 int mirror_num = 0;
625 struct extent_buffer *eb = NULL;
626 u64 logical;
627 u32 blocksize;
628 int ret;
629 int i;
630 int need_kick = 0;
631
632 spin_lock(&fs_info->reada_lock);
633 if (dev->reada_curr_zone == NULL) {
634 ret = reada_pick_zone(dev);
635 if (!ret) {
636 spin_unlock(&fs_info->reada_lock);
637 return 0;
638 }
639 }
640 /*
641 * FIXME currently we issue the reads one extent at a time. If we have
 642	 * a contiguous block of extents, we could also coalesce them or use
643 * plugging to speed things up
644 */
645 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
646 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
647 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
648 ret = reada_pick_zone(dev);
649 if (!ret) {
650 spin_unlock(&fs_info->reada_lock);
651 return 0;
652 }
653 re = NULL;
654 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
655 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
656 }
657 if (ret == 0) {
658 spin_unlock(&fs_info->reada_lock);
659 return 0;
660 }
661 dev->reada_next = re->logical + re->blocksize;
662 kref_get(&re->refcnt);
663
664 spin_unlock(&fs_info->reada_lock);
665
666 /*
667 * find mirror num
668 */
669 for (i = 0; i < re->nzones; ++i) {
670 if (re->zones[i]->device == dev) {
671 mirror_num = i + 1;
672 break;
673 }
674 }
675 logical = re->logical;
676 blocksize = re->blocksize;
677
678 spin_lock(&re->lock);
679 if (re->scheduled_for == NULL) {
680 re->scheduled_for = dev;
681 need_kick = 1;
682 }
683 spin_unlock(&re->lock);
684
685 reada_extent_put(fs_info, re);
686
687 if (!need_kick)
688 return 0;
689
690 atomic_inc(&dev->reada_in_flight);
691 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
692 mirror_num, &eb);
693 if (ret)
694 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
695 else if (eb)
696 __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
697
698 if (eb)
699 free_extent_buffer(eb);
700
701 return 1;
702
703}
704
705static void reada_start_machine_worker(struct btrfs_work *work)
706{
707 struct reada_machine_work *rmw;
708 struct btrfs_fs_info *fs_info;
709
710 rmw = container_of(work, struct reada_machine_work, work);
711 fs_info = rmw->fs_info;
712
713 kfree(rmw);
714
715 __reada_start_machine(fs_info);
716}
717
718static void __reada_start_machine(struct btrfs_fs_info *fs_info)
719{
720 struct btrfs_device *device;
721 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
722 u64 enqueued;
723 u64 total = 0;
724 int i;
725
726 do {
727 enqueued = 0;
728 list_for_each_entry(device, &fs_devices->devices, dev_list) {
729 if (atomic_read(&device->reada_in_flight) <
730 MAX_IN_FLIGHT)
731 enqueued += reada_start_machine_dev(fs_info,
732 device);
733 }
734 total += enqueued;
735 } while (enqueued && total < 10000);
736
737 if (enqueued == 0)
738 return;
739
740 /*
741 * If everything is already in the cache, this is effectively single
742 * threaded. To a) not hold the caller for too long and b) to utilize
743 * more cores, we broke the loop above after 10000 iterations and now
744 * enqueue to workers to finish it. This will distribute the load to
745 * the cores.
746 */
747 for (i = 0; i < 2; ++i)
748 reada_start_machine(fs_info);
749}
750
751static void reada_start_machine(struct btrfs_fs_info *fs_info)
752{
753 struct reada_machine_work *rmw;
754
755 rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
756 if (!rmw) {
757 /* FIXME we cannot handle this properly right now */
758 BUG();
759 }
760 rmw->work.func = reada_start_machine_worker;
761 rmw->fs_info = fs_info;
762
763 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
764}
765
766#ifdef DEBUG
767static void dump_devs(struct btrfs_fs_info *fs_info, int all)
768{
769 struct btrfs_device *device;
770 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
771 unsigned long index;
772 int ret;
773 int i;
774 int j;
775 int cnt;
776
777 spin_lock(&fs_info->reada_lock);
778 list_for_each_entry(device, &fs_devices->devices, dev_list) {
779 printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
780 atomic_read(&device->reada_in_flight));
781 index = 0;
782 while (1) {
783 struct reada_zone *zone;
784 ret = radix_tree_gang_lookup(&device->reada_zones,
785 (void **)&zone, index, 1);
786 if (ret == 0)
787 break;
788 printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
789 "%d devs", zone->start, zone->end, zone->elems,
790 zone->locked);
791 for (j = 0; j < zone->ndevs; ++j) {
792 printk(KERN_CONT " %lld",
793 zone->devs[j]->devid);
794 }
795 if (device->reada_curr_zone == zone)
796 printk(KERN_CONT " curr off %llu",
797 device->reada_next - zone->start);
798 printk(KERN_CONT "\n");
799 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
800 }
801 cnt = 0;
802 index = 0;
803 while (all) {
804 struct reada_extent *re = NULL;
805
806 ret = radix_tree_gang_lookup(&device->reada_extents,
807 (void **)&re, index, 1);
808 if (ret == 0)
809 break;
810 printk(KERN_DEBUG
811 " re: logical %llu size %u empty %d for %lld",
812 re->logical, re->blocksize,
813 list_empty(&re->extctl), re->scheduled_for ?
814 re->scheduled_for->devid : -1);
815
816 for (i = 0; i < re->nzones; ++i) {
817 printk(KERN_CONT " zone %llu-%llu devs",
818 re->zones[i]->start,
819 re->zones[i]->end);
820 for (j = 0; j < re->zones[i]->ndevs; ++j) {
821 printk(KERN_CONT " %lld",
822 re->zones[i]->devs[j]->devid);
823 }
824 }
825 printk(KERN_CONT "\n");
826 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
827 if (++cnt > 15)
828 break;
829 }
830 }
831
832 index = 0;
833 cnt = 0;
834 while (all) {
835 struct reada_extent *re = NULL;
836
837 ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
838 index, 1);
839 if (ret == 0)
840 break;
841 if (!re->scheduled_for) {
842 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
843 continue;
844 }
845 printk(KERN_DEBUG
846 "re: logical %llu size %u list empty %d for %lld",
847 re->logical, re->blocksize, list_empty(&re->extctl),
848 re->scheduled_for ? re->scheduled_for->devid : -1);
 849		for (i = 0; i < re->nzones; ++i) {
 850			printk(KERN_CONT " zone %llu-%llu devs",
 851				re->zones[i]->start,
 852				re->zones[i]->end);
 853			for (j = 0; j < re->zones[i]->ndevs; ++j) {
 854				printk(KERN_CONT " %lld",
 855					re->zones[i]->devs[j]->devid);
 856			}
 857		}
863 printk(KERN_CONT "\n");
864 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
865 }
866 spin_unlock(&fs_info->reada_lock);
867}
868#endif
869
870/*
871 * interface
872 */
873struct reada_control *btrfs_reada_add(struct btrfs_root *root,
874 struct btrfs_key *key_start, struct btrfs_key *key_end)
875{
876 struct reada_control *rc;
877 u64 start;
878 u64 generation;
879 int level;
880 struct extent_buffer *node;
881 static struct btrfs_key max_key = {
882 .objectid = (u64)-1,
883 .type = (u8)-1,
884 .offset = (u64)-1
885 };
886
887 rc = kzalloc(sizeof(*rc), GFP_NOFS);
888 if (!rc)
889 return ERR_PTR(-ENOMEM);
890
891 rc->root = root;
892 rc->key_start = *key_start;
893 rc->key_end = *key_end;
894 atomic_set(&rc->elems, 0);
895 init_waitqueue_head(&rc->wait);
896 kref_init(&rc->refcnt);
897 kref_get(&rc->refcnt); /* one ref for having elements */
898
899 node = btrfs_root_node(root);
900 start = node->start;
901 level = btrfs_header_level(node);
902 generation = btrfs_header_generation(node);
903 free_extent_buffer(node);
904
905 reada_add_block(rc, start, &max_key, level, generation);
906
907 reada_start_machine(root->fs_info);
908
909 return rc;
910}
911
912#ifdef DEBUG
913int btrfs_reada_wait(void *handle)
914{
915 struct reada_control *rc = handle;
916
917 while (atomic_read(&rc->elems)) {
918 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
919 5 * HZ);
920 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
921 }
922
923 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
924
925 kref_put(&rc->refcnt, reada_control_release);
926
927 return 0;
928}
929#else
930int btrfs_reada_wait(void *handle)
931{
932 struct reada_control *rc = handle;
933
934 while (atomic_read(&rc->elems)) {
935 wait_event(rc->wait, atomic_read(&rc->elems) == 0);
936 }
937
938 kref_put(&rc->refcnt, reada_control_release);
939
940 return 0;
941}
942#endif
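
[Editor's summary of the reada_control lifetime implied by the code above; commentary only, not part of the patch.]

	/*
	 * btrfs_reada_add()    kref_init() gives the caller's ref, a second
	 *                      kref_get() stands for "elements outstanding"
	 * reada_add_block()    bumps rc->elems once per enqueued block
	 * __readahead_hook()   when the last element completes, drops the
	 *                      "elements" ref and wakes rc->wait
	 * btrfs_reada_wait(),
	 * btrfs_reada_detach() drop the caller's ref; the final kref_put()
	 *                      frees rc via reada_control_release()
	 */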
943
944void btrfs_reada_detach(void *handle)
945{
946 struct reada_control *rc = handle;
947
948 kref_put(&rc->refcnt, reada_control_release);
949}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273d..24d654ce7a06 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2041,8 +2041,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2041 BUG_ON(IS_ERR(trans)); 2041 BUG_ON(IS_ERR(trans));
2042 trans->block_rsv = rc->block_rsv; 2042 trans->block_rsv = rc->block_rsv;
2043 2043
2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2044 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
2045 min_reserved, 0);
2046 if (ret) { 2045 if (ret) {
2047 BUG_ON(ret != -EAGAIN); 2046 BUG_ON(ret != -EAGAIN);
2048 ret = btrfs_commit_transaction(trans, root); 2047 ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2151,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2152again: 2151again:
2153 if (!err) { 2152 if (!err) {
2154 num_bytes = rc->merging_rsv_size; 2153 num_bytes = rc->merging_rsv_size;
2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2154 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2156 num_bytes);
2157 if (ret) 2155 if (ret)
2158 err = ret; 2156 err = ret;
2159 } 2157 }
@@ -2427,7 +2425,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2425 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2428 2426
2429 trans->block_rsv = rc->block_rsv; 2427 trans->block_rsv = rc->block_rsv;
2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); 2428 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2431 if (ret) { 2429 if (ret) {
2432 if (ret == -EAGAIN) 2430 if (ret == -EAGAIN)
2433 rc->commit_transaction = 1; 2431 rc->commit_transaction = 1;
@@ -2922,6 +2920,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2922 unsigned long last_index; 2920 unsigned long last_index;
2923 struct page *page; 2921 struct page *page;
2924 struct file_ra_state *ra; 2922 struct file_ra_state *ra;
2923 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2925 int nr = 0; 2924 int nr = 0;
2926 int ret = 0; 2925 int ret = 0;
2927 2926
@@ -2956,7 +2955,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2956 ra, NULL, index, 2955 ra, NULL, index,
2957 last_index + 1 - index); 2956 last_index + 1 - index);
2958 page = find_or_create_page(inode->i_mapping, index, 2957 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS); 2958 mask);
2960 if (!page) { 2959 if (!page) {
2961 btrfs_delalloc_release_metadata(inode, 2960 btrfs_delalloc_release_metadata(inode,
2962 PAGE_CACHE_SIZE); 2961 PAGE_CACHE_SIZE);
@@ -3323,8 +3322,11 @@ static int find_data_references(struct reloc_control *rc,
3323 } 3322 }
3324 3323
3325 key.objectid = ref_objectid; 3324 key.objectid = ref_objectid;
3326 key.offset = ref_offset;
3327 key.type = BTRFS_EXTENT_DATA_KEY; 3325 key.type = BTRFS_EXTENT_DATA_KEY;
3326 if (ref_offset > ((u64)-1 << 32))
3327 key.offset = 0;
3328 else
3329 key.offset = ref_offset;
3328 3330
3329 path->search_commit_root = 1; 3331 path->search_commit_root = 1;
3330 path->skip_locking = 1; 3332 path->skip_locking = 1;
@@ -3645,14 +3647,11 @@ int prepare_to_relocate(struct reloc_control *rc)
3645 * btrfs_init_reloc_root will use them when there 3647 * btrfs_init_reloc_root will use them when there
3646 * is no reservation in transaction handle. 3648 * is no reservation in transaction handle.
3647 */ 3649 */
3648 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3650 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3649 rc->extent_root->nodesize * 256); 3651 rc->extent_root->nodesize * 256);
3650 if (ret) 3652 if (ret)
3651 return ret; 3653 return ret;
3652 3654
3653 rc->block_rsv->refill_used = 1;
3654 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3655
3656 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3655 memset(&rc->cluster, 0, sizeof(rc->cluster));
3657 rc->search_start = rc->block_group->key.objectid; 3656 rc->search_start = rc->block_group->key.objectid;
3658 rc->extents_found = 0; 3657 rc->extents_found = 0;
@@ -3777,8 +3776,7 @@ restart:
3777 } 3776 }
3778 } 3777 }
3779 3778
3780 ret = btrfs_block_rsv_check(trans, rc->extent_root, 3779 ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3781 rc->block_rsv, 0, 5);
3782 if (ret < 0) { 3780 if (ret < 0) {
3783 if (ret != -EAGAIN) { 3781 if (ret != -EAGAIN) {
3784 err = ret; 3782 err = ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index eba42e5fd5fd..94cd3a19e9c8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -33,15 +33,12 @@
33 * any can be found. 33 * any can be found.
34 * 34 *
35 * Future enhancements: 35 * Future enhancements:
36 * - To enhance the performance, better read-ahead strategies for the
37 * extent-tree can be employed.
38 * - In case an unrepairable extent is encountered, track which files are 36 * - In case an unrepairable extent is encountered, track which files are
39 * affected and report them 37 * affected and report them
40 * - In case of a read error on files with nodatasum, map the file and read 38 * - In case of a read error on files with nodatasum, map the file and read
41 * the extent to trigger a writeback of the good copy 39 * the extent to trigger a writeback of the good copy
42 * - track and record media errors, throw out bad devices 40 * - track and record media errors, throw out bad devices
43 * - add a mode to also read unallocated space 41 * - add a mode to also read unallocated space
44 * - make the prefetch cancellable
45 */ 42 */
46 43
47struct scrub_bio; 44struct scrub_bio;
@@ -209,7 +206,7 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
209 atomic_set(&sdev->in_flight, 0); 206 atomic_set(&sdev->in_flight, 0);
210 atomic_set(&sdev->fixup_cnt, 0); 207 atomic_set(&sdev->fixup_cnt, 0);
211 atomic_set(&sdev->cancel_req, 0); 208 atomic_set(&sdev->cancel_req, 0);
212 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); 209 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
213 INIT_LIST_HEAD(&sdev->csum_list); 210 INIT_LIST_HEAD(&sdev->csum_list);
214 211
215 spin_lock_init(&sdev->list_lock); 212 spin_lock_init(&sdev->list_lock);
@@ -1130,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1130 int slot; 1127 int slot;
1131 int i; 1128 int i;
1132 u64 nstripes; 1129 u64 nstripes;
1133 int start_stripe;
1134 struct extent_buffer *l; 1130 struct extent_buffer *l;
1135 struct btrfs_key key; 1131 struct btrfs_key key;
1136 u64 physical; 1132 u64 physical;
1137 u64 logical; 1133 u64 logical;
1138 u64 generation; 1134 u64 generation;
1139 int mirror_num; 1135 int mirror_num;
1136 struct reada_control *reada1;
1137 struct reada_control *reada2;
1138 struct btrfs_key key_start;
1139 struct btrfs_key key_end;
1140 1140
1141 u64 increment = map->stripe_len; 1141 u64 increment = map->stripe_len;
1142 u64 offset; 1142 u64 offset;
@@ -1168,81 +1168,67 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1168 if (!path) 1168 if (!path)
1169 return -ENOMEM; 1169 return -ENOMEM;
1170 1170
1171 path->reada = 2;
1172 path->search_commit_root = 1; 1171 path->search_commit_root = 1;
1173 path->skip_locking = 1; 1172 path->skip_locking = 1;
1174 1173
1175 /* 1174 /*
 1176 * find all extents for each stripe and just read them to get 1175 * trigger the readahead for extent tree and csum tree and wait for
1177 * them into the page cache 1176 * completion. During readahead, the scrub is officially paused
1178 * FIXME: we can do better. build a more intelligent prefetching 1177 * to not hold off transaction commits
1179 */ 1178 */
1180 logical = base + offset; 1179 logical = base + offset;
1181 physical = map->stripes[num].physical;
1182 ret = 0;
1183 for (i = 0; i < nstripes; ++i) {
1184 key.objectid = logical;
1185 key.type = BTRFS_EXTENT_ITEM_KEY;
1186 key.offset = (u64)0;
1187
1188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1189 if (ret < 0)
1190 goto out_noplug;
1191
1192 /*
1193 * we might miss half an extent here, but that doesn't matter,
1194 * as it's only the prefetch
1195 */
1196 while (1) {
1197 l = path->nodes[0];
1198 slot = path->slots[0];
1199 if (slot >= btrfs_header_nritems(l)) {
1200 ret = btrfs_next_leaf(root, path);
1201 if (ret == 0)
1202 continue;
1203 if (ret < 0)
1204 goto out_noplug;
1205 1180
1206 break; 1181 wait_event(sdev->list_wait,
1207 } 1182 atomic_read(&sdev->in_flight) == 0);
1208 btrfs_item_key_to_cpu(l, &key, slot); 1183 atomic_inc(&fs_info->scrubs_paused);
1184 wake_up(&fs_info->scrub_pause_wait);
1209 1185
1210 if (key.objectid >= logical + map->stripe_len) 1186 /* FIXME it might be better to start readahead at commit root */
1211 break; 1187 key_start.objectid = logical;
1188 key_start.type = BTRFS_EXTENT_ITEM_KEY;
1189 key_start.offset = (u64)0;
1190 key_end.objectid = base + offset + nstripes * increment;
1191 key_end.type = BTRFS_EXTENT_ITEM_KEY;
1192 key_end.offset = (u64)0;
1193 reada1 = btrfs_reada_add(root, &key_start, &key_end);
1194
1195 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1196 key_start.type = BTRFS_EXTENT_CSUM_KEY;
1197 key_start.offset = logical;
1198 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1199 key_end.type = BTRFS_EXTENT_CSUM_KEY;
1200 key_end.offset = base + offset + nstripes * increment;
1201 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1202
1203 if (!IS_ERR(reada1))
1204 btrfs_reada_wait(reada1);
1205 if (!IS_ERR(reada2))
1206 btrfs_reada_wait(reada2);
1212 1207
1213 path->slots[0]++; 1208 mutex_lock(&fs_info->scrub_lock);
1214 } 1209 while (atomic_read(&fs_info->scrub_pause_req)) {
1215 btrfs_release_path(path); 1210 mutex_unlock(&fs_info->scrub_lock);
1216 logical += increment; 1211 wait_event(fs_info->scrub_pause_wait,
1217 physical += map->stripe_len; 1212 atomic_read(&fs_info->scrub_pause_req) == 0);
1218 cond_resched(); 1213 mutex_lock(&fs_info->scrub_lock);
1219 } 1214 }
1215 atomic_dec(&fs_info->scrubs_paused);
1216 mutex_unlock(&fs_info->scrub_lock);
1217 wake_up(&fs_info->scrub_pause_wait);
1220 1218
1221 /* 1219 /*
1222 * collect all data csums for the stripe to avoid seeking during 1220 * collect all data csums for the stripe to avoid seeking during
1223 * the scrub. This might currently (crc32) end up to be about 1MB 1221 * the scrub. This might currently (crc32) end up to be about 1MB
1224 */ 1222 */
1225 start_stripe = 0;
1226 blk_start_plug(&plug); 1223 blk_start_plug(&plug);
1227again:
1228 logical = base + offset + start_stripe * increment;
1229 for (i = start_stripe; i < nstripes; ++i) {
1230 ret = btrfs_lookup_csums_range(csum_root, logical,
1231 logical + map->stripe_len - 1,
1232 &sdev->csum_list, 1);
1233 if (ret)
1234 goto out;
1235 1224
1236 logical += increment;
1237 cond_resched();
1238 }
1239 /* 1225 /*
1240 * now find all extents for each stripe and scrub them 1226 * now find all extents for each stripe and scrub them
1241 */ 1227 */
1242 logical = base + offset + start_stripe * increment; 1228 logical = base + offset;
1243 physical = map->stripes[num].physical + start_stripe * map->stripe_len; 1229 physical = map->stripes[num].physical;
1244 ret = 0; 1230 ret = 0;
1245 for (i = start_stripe; i < nstripes; ++i) { 1231 for (i = 0; i < nstripes; ++i) {
1246 /* 1232 /*
1247 * canceled? 1233 * canceled?
1248 */ 1234 */
@@ -1271,11 +1257,14 @@ again:
1271 atomic_dec(&fs_info->scrubs_paused); 1257 atomic_dec(&fs_info->scrubs_paused);
1272 mutex_unlock(&fs_info->scrub_lock); 1258 mutex_unlock(&fs_info->scrub_lock);
1273 wake_up(&fs_info->scrub_pause_wait); 1259 wake_up(&fs_info->scrub_pause_wait);
1274 scrub_free_csums(sdev);
1275 start_stripe = i;
1276 goto again;
1277 } 1260 }
1278 1261
1262 ret = btrfs_lookup_csums_range(csum_root, logical,
1263 logical + map->stripe_len - 1,
1264 &sdev->csum_list, 1);
1265 if (ret)
1266 goto out;
1267
1279 key.objectid = logical; 1268 key.objectid = logical;
1280 key.type = BTRFS_EXTENT_ITEM_KEY; 1269 key.type = BTRFS_EXTENT_ITEM_KEY;
1281 key.offset = (u64)0; 1270 key.offset = (u64)0;
@@ -1371,7 +1360,6 @@ next:
1371 1360
1372out: 1361out:
1373 blk_finish_plug(&plug); 1362 blk_finish_plug(&plug);
1374out_noplug:
1375 btrfs_free_path(path); 1363 btrfs_free_path(path);
1376 return ret < 0 ? ret : 0; 1364 return ret < 0 ? ret : 0;
1377} 1365}
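
[Editor's summary of the reworked scrub_stripe() above; illustrative commentary, not part of the patch.]

	/*
	 * New order of operations per stripe set:
	 *  1. wait for in-flight scrub bios, then count the scrub as paused
	 *     so the readahead cannot hold off transaction commits
	 *  2. start two btrfs_reada_add() ranges covering all stripes:
	 *     EXTENT_ITEM keys on the extent root and EXTENT_CSUM keys on
	 *     the csum root, and wait for both; reada errors are ignored,
	 *     since the prefetch is only an optimization
	 *  3. honour any pending scrub_pause_req before unpausing
	 *  4. csums are now looked up per stripe inside the main loop,
	 *     which lets the old start_stripe/again: restart logic go away
	 */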
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d7..57080dffdfc6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,7 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h>
43#include "compat.h" 44#include "compat.h"
44#include "delayed-inode.h" 45#include "delayed-inode.h"
45#include "ctree.h" 46#include "ctree.h"
@@ -58,6 +59,7 @@
58#include <trace/events/btrfs.h> 59#include <trace/events/btrfs.h>
59 60
60static const struct super_operations btrfs_super_ops; 61static const struct super_operations btrfs_super_ops;
62static struct file_system_type btrfs_fs_type;
61 63
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 64static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16]) 65 char nbuf[16])
@@ -162,7 +164,7 @@ enum {
162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err, 167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
166}; 168};
167 169
168static match_table_t tokens = { 170static match_table_t tokens = {
@@ -195,6 +197,8 @@ static match_table_t tokens = {
195 {Opt_subvolrootid, "subvolrootid=%d"}, 197 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"}, 198 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"}, 199 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "no_space_cache"},
201 {Opt_recovery, "recovery"},
198 {Opt_err, NULL}, 202 {Opt_err, NULL},
199}; 203};
200 204
@@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206{ 210{
207 struct btrfs_fs_info *info = root->fs_info; 211 struct btrfs_fs_info *info = root->fs_info;
208 substring_t args[MAX_OPT_ARGS]; 212 substring_t args[MAX_OPT_ARGS];
209 char *p, *num, *orig; 213 char *p, *num, *orig = NULL;
214 u64 cache_gen;
210 int intarg; 215 int intarg;
211 int ret = 0; 216 int ret = 0;
212 char *compress_type; 217 char *compress_type;
213 bool compress_force = false; 218 bool compress_force = false;
214 219
220 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
221 if (cache_gen)
222 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
223
215 if (!options) 224 if (!options)
216 return 0; 225 goto out;
217 226
218 /* 227 /*
219 * strsep changes the string, duplicate it because parse_options 228 * strsep changes the string, duplicate it because parse_options
@@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
360 btrfs_set_opt(info->mount_opt, DISCARD); 369 btrfs_set_opt(info->mount_opt, DISCARD);
361 break; 370 break;
362 case Opt_space_cache: 371 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 372 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break; 373 break;
374 case Opt_no_space_cache:
375 printk(KERN_INFO "btrfs: disabling disk space caching\n");
376 btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
377 break;
366 case Opt_inode_cache: 378 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 379 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 380 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
381 printk(KERN_INFO "btrfs: enabling auto defrag"); 393 printk(KERN_INFO "btrfs: enabling auto defrag");
382 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 394 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
383 break; 395 break;
396 case Opt_recovery:
 397			printk(KERN_INFO "btrfs: enabling auto recovery\n");
398 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break;
384 case Opt_err: 400 case Opt_err:
385 printk(KERN_INFO "btrfs: unrecognized mount option " 401 printk(KERN_INFO "btrfs: unrecognized mount option "
386 "'%s'\n", p); 402 "'%s'\n", p);
@@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
391 } 407 }
392 } 408 }
393out: 409out:
410 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
411 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
394 kfree(orig); 412 kfree(orig);
395 return ret; 413 return ret;
396} 414}
@@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
406 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) 424 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
407{ 425{
408 substring_t args[MAX_OPT_ARGS]; 426 substring_t args[MAX_OPT_ARGS];
409 char *opts, *orig, *p; 427 char *device_name, *opts, *orig, *p;
410 int error = 0; 428 int error = 0;
411 int intarg; 429 int intarg;
412 430
413 if (!options) 431 if (!options)
414 goto out; 432 return 0;
415 433
416 /* 434 /*
417 * strsep changes the string, duplicate it because parse_options 435 * strsep changes the string, duplicate it because parse_options
@@ -457,29 +475,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
457 } 475 }
458 break; 476 break;
459 case Opt_device: 477 case Opt_device:
460 error = btrfs_scan_one_device(match_strdup(&args[0]), 478 device_name = match_strdup(&args[0]);
479 if (!device_name) {
480 error = -ENOMEM;
481 goto out;
482 }
483 error = btrfs_scan_one_device(device_name,
461 flags, holder, fs_devices); 484 flags, holder, fs_devices);
485 kfree(device_name);
462 if (error) 486 if (error)
463 goto out_free_opts; 487 goto out;
464 break; 488 break;
465 default: 489 default:
466 break; 490 break;
467 } 491 }
468 } 492 }
469 493
470 out_free_opts: 494out:
471 kfree(orig); 495 kfree(orig);
472 out:
473 /*
474 * If no subvolume name is specified we use the default one. Allocate
475 * a copy of the string "." here so that code later in the
476 * mount path doesn't care if it's the default volume or another one.
477 */
478 if (!*subvol_name) {
479 *subvol_name = kstrdup(".", GFP_KERNEL);
480 if (!*subvol_name)
481 return -ENOMEM;
482 }
483 return error; 496 return error;
484} 497}
485 498
@@ -492,7 +505,6 @@ static struct dentry *get_default_root(struct super_block *sb,
492 struct btrfs_path *path; 505 struct btrfs_path *path;
493 struct btrfs_key location; 506 struct btrfs_key location;
494 struct inode *inode; 507 struct inode *inode;
495 struct dentry *dentry;
496 u64 dir_id; 508 u64 dir_id;
497 int new = 0; 509 int new = 0;
498 510
@@ -517,7 +529,7 @@ static struct dentry *get_default_root(struct super_block *sb,
517 * will mount by default if we haven't been given a specific subvolume 529 * will mount by default if we haven't been given a specific subvolume
518 * to mount. 530 * to mount.
519 */ 531 */
520 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 532 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
521 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 533 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
522 if (IS_ERR(di)) { 534 if (IS_ERR(di)) {
523 btrfs_free_path(path); 535 btrfs_free_path(path);
@@ -566,29 +578,7 @@ setup_root:
566 return dget(sb->s_root); 578 return dget(sb->s_root);
567 } 579 }
568 580
569 if (new) { 581 return d_obtain_alias(inode);
570 const struct qstr name = { .name = "/", .len = 1 };
571
572 /*
573 * New inode, we need to make the dentry a sibling of s_root so
574 * everything gets cleaned up properly on unmount.
575 */
576 dentry = d_alloc(sb->s_root, &name);
577 if (!dentry) {
578 iput(inode);
579 return ERR_PTR(-ENOMEM);
580 }
581 d_splice_alias(inode, dentry);
582 } else {
583 /*
584 * We found the inode in cache, just find a dentry for it and
585 * put the reference to the inode we just got.
586 */
587 dentry = d_find_alias(inode);
588 iput(inode);
589 }
590
591 return dentry;
592} 582}
593 583
594static int btrfs_fill_super(struct super_block *sb, 584static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +709,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
719 seq_puts(seq, ",noacl"); 709 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE)) 710 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache"); 711 seq_puts(seq, ",space_cache");
712 else
713 seq_puts(seq, ",no_space_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE)) 714 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache"); 715 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 716 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +745,137 @@ static int btrfs_set_super(struct super_block *s, void *data)
753 return set_anon_super(s, data); 745 return set_anon_super(s, data);
754} 746}
755 747
748/*
749 * subvolumes are identified by ino 256
750 */
751static inline int is_subvolume_inode(struct inode *inode)
752{
753 if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
754 return 1;
755 return 0;
756}
757
758/*
759 * This will strip out the subvol=%s argument for an argument string and add
760 * subvolid=0 to make sure we get the actual tree root for path walking to the
761 * subvol we want.
762 */
763static char *setup_root_args(char *args)
764{
765 unsigned copied = 0;
766 unsigned len = strlen(args) + 2;
767 char *pos;
768 char *ret;
769
770 /*
771 * We need the same args as before, but minus
772 *
773 * subvol=a
774 *
775 * and add
776 *
777 * subvolid=0
778 *
779 * which is a difference of 2 characters, so we allocate strlen(args) +
780 * 2 characters.
781 */
782 ret = kzalloc(len * sizeof(char), GFP_NOFS);
783 if (!ret)
784 return NULL;
785 pos = strstr(args, "subvol=");
786
787 /* This shouldn't happen, but just in case.. */
788 if (!pos) {
789 kfree(ret);
790 return NULL;
791 }
792
793 /*
 794	 * The subvol=<> arg is not at the front of the string, copy everything
795 * up to that into ret.
796 */
797 if (pos != args) {
798 *pos = '\0';
799 strcpy(ret, args);
800 copied += strlen(args);
801 pos++;
802 }
803
804 strncpy(ret + copied, "subvolid=0", len - copied);
805
806 /* Length of subvolid=0 */
807 copied += 10;
808
809 /*
810 * If there is no , after the subvol= option then we know there's no
811 * other options and we can just return.
812 */
813 pos = strchr(pos, ',');
814 if (!pos)
815 return ret;
816
817 /* Copy the rest of the arguments into our buffer */
818 strncpy(ret + copied, pos, len - copied);
819 copied += strlen(pos);
820
821 return ret;
822}
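
[Editor's note: a worked example of the rewrite performed by setup_root_args(); the option string is illustrative.]

	/*
	 *	setup_root_args("noatime,subvol=snap1,compress=lzo")
	 *		-> "noatime,subvolid=0,compress=lzo"
	 *
	 * "subvol=<name>" becomes "subvolid=0"; everything else is copied
	 * through. The +2 in the allocation covers the worst case, where
	 * "subvolid=0" (10 bytes) replaces the shortest possible
	 * "subvol=x" (8 bytes).
	 */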
823
824static struct dentry *mount_subvol(const char *subvol_name, int flags,
825 const char *device_name, char *data)
826{
827 struct super_block *s;
828 struct dentry *root;
829 struct vfsmount *mnt;
830 struct mnt_namespace *ns_private;
831 char *newargs;
832 struct path path;
833 int error;
834
835 newargs = setup_root_args(data);
836 if (!newargs)
837 return ERR_PTR(-ENOMEM);
838 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
839 newargs);
840 kfree(newargs);
841 if (IS_ERR(mnt))
842 return ERR_CAST(mnt);
843
844 ns_private = create_mnt_ns(mnt);
845 if (IS_ERR(ns_private)) {
846 mntput(mnt);
847 return ERR_CAST(ns_private);
848 }
849
850 /*
851 * This will trigger the automount of the subvol so we can just
852 * drop the mnt we have here and return the dentry that we
853 * found.
854 */
855 error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
856 LOOKUP_FOLLOW, &path);
857 put_mnt_ns(ns_private);
858 if (error)
859 return ERR_PTR(error);
860
861 if (!is_subvolume_inode(path.dentry->d_inode)) {
862 path_put(&path);
863 mntput(mnt);
864 error = -EINVAL;
865 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
866 subvol_name);
867 return ERR_PTR(-EINVAL);
868 }
869
870 /* Get a ref to the sb and the dentry we found and return it */
871 s = path.mnt->mnt_sb;
872 atomic_inc(&s->s_active);
873 root = dget(path.dentry);
874 path_put(&path);
875 down_write(&s->s_umount);
876
877 return root;
878}
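
[Editor's summary of mount_subvol() above; commentary only, not part of the patch.]

	/*
	 * mount_subvol() re-enters btrfs_mount() via vfs_kern_mount() with
	 * the rewritten "subvolid=0" options, wraps that mount in a private
	 * namespace, and lets vfs_path_lookup(..., LOOKUP_FOLLOW, ...)
	 * trigger the subvolume automount. It returns the subvolume dentry
	 * with an extra s_active reference and s_umount held, which is what
	 * the VFS expects from a ->mount() implementation.
	 */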
756 879
757/* 880/*
758 * Find a superblock for the given device / mount point. 881 * Find a superblock for the given device / mount point.
@@ -784,13 +907,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
784 if (error) 907 if (error)
785 return ERR_PTR(error); 908 return ERR_PTR(error);
786 909
910 if (subvol_name) {
911 root = mount_subvol(subvol_name, flags, device_name, data);
912 kfree(subvol_name);
913 return root;
914 }
915
787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 916 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
788 if (error) 917 if (error)
789 goto error_free_subvol_name; 918 return ERR_PTR(error);
790 919
791 error = btrfs_open_devices(fs_devices, mode, fs_type); 920 error = btrfs_open_devices(fs_devices, mode, fs_type);
792 if (error) 921 if (error)
793 goto error_free_subvol_name; 922 return ERR_PTR(error);
794 923
795 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { 924 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
796 error = -EACCES; 925 error = -EACCES;
@@ -813,88 +942,57 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
813 fs_info->fs_devices = fs_devices; 942 fs_info->fs_devices = fs_devices;
814 tree_root->fs_info = fs_info; 943 tree_root->fs_info = fs_info;
815 944
945 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
946 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
947 if (!fs_info->super_copy || !fs_info->super_for_commit) {
948 error = -ENOMEM;
949 goto error_close_devices;
950 }
951
816 bdev = fs_devices->latest_bdev; 952 bdev = fs_devices->latest_bdev;
817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 953 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
818 if (IS_ERR(s)) 954 if (IS_ERR(s)) {
819 goto error_s; 955 error = PTR_ERR(s);
956 goto error_close_devices;
957 }
820 958
821 if (s->s_root) { 959 if (s->s_root) {
822 if ((flags ^ s->s_flags) & MS_RDONLY) { 960 if ((flags ^ s->s_flags) & MS_RDONLY) {
823 deactivate_locked_super(s); 961 deactivate_locked_super(s);
824 error = -EBUSY; 962 return ERR_PTR(-EBUSY);
825 goto error_close_devices;
826 } 963 }
827 964
828 btrfs_close_devices(fs_devices); 965 btrfs_close_devices(fs_devices);
829 kfree(fs_info); 966 free_fs_info(fs_info);
830 kfree(tree_root); 967 kfree(tree_root);
831 } else { 968 } else {
832 char b[BDEVNAME_SIZE]; 969 char b[BDEVNAME_SIZE];
833 970
834 s->s_flags = flags | MS_NOSEC; 971 s->s_flags = flags | MS_NOSEC;
835 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 972 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
973 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
836 error = btrfs_fill_super(s, fs_devices, data, 974 error = btrfs_fill_super(s, fs_devices, data,
837 flags & MS_SILENT ? 1 : 0); 975 flags & MS_SILENT ? 1 : 0);
838 if (error) { 976 if (error) {
839 deactivate_locked_super(s); 977 deactivate_locked_super(s);
840 goto error_free_subvol_name; 978 return ERR_PTR(error);
841 } 979 }
842 980
843 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
844 s->s_flags |= MS_ACTIVE; 981 s->s_flags |= MS_ACTIVE;
845 } 982 }
846 983
847 /* if they gave us a subvolume name bind mount into that */ 984 root = get_default_root(s, subvol_objectid);
848 if (strcmp(subvol_name, ".")) { 985 if (IS_ERR(root)) {
849 struct dentry *new_root; 986 deactivate_locked_super(s);
850 987 return root;
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
858 mutex_lock(&root->d_inode->i_mutex);
859 new_root = lookup_one_len(subvol_name, root,
860 strlen(subvol_name));
861 mutex_unlock(&root->d_inode->i_mutex);
862
863 if (IS_ERR(new_root)) {
864 dput(root);
865 deactivate_locked_super(s);
866 error = PTR_ERR(new_root);
867 goto error_free_subvol_name;
868 }
869 if (!new_root->d_inode) {
870 dput(root);
871 dput(new_root);
872 deactivate_locked_super(s);
873 error = -ENXIO;
874 goto error_free_subvol_name;
875 }
876 dput(root);
877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
885 } 988 }
886 989
887 kfree(subvol_name);
888 return root; 990 return root;
889 991
890error_s:
891 error = PTR_ERR(s);
892error_close_devices: 992error_close_devices:
893 btrfs_close_devices(fs_devices); 993 btrfs_close_devices(fs_devices);
894 kfree(fs_info); 994 free_fs_info(fs_info);
895 kfree(tree_root); 995 kfree(tree_root);
896error_free_subvol_name:
897 kfree(subvol_name);
898 return ERR_PTR(error); 996 return ERR_PTR(error);
899} 997}
900 998
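
With subvolume mounts diverted into mount_subvol() before any devices are opened, btrfs_mount() can return ERR_PTR(error) directly and the old error_free_subvol_name label disappears; the remaining unwind only has to undo device opening and the fs_info allocations. free_fs_info() is not shown in this hunk, but from its call sites it evidently frees the two kzalloc'd super block copies along with fs_info itself. A userspace sketch of that allocate-both-or-unwind shape, with invented names:

    #include <stdlib.h>

    struct fs_info { void *super_copy; void *super_for_commit; };

    #define SUPER_INFO_SIZE 4096

    /* Allocate both superblock copies, unwinding on failure through a
     * single cleanup path, the same shape as error_close_devices above. */
    static struct fs_info *alloc_fs_info(void)
    {
        struct fs_info *fi = calloc(1, sizeof(*fi));

        if (!fi)
            return NULL;
        fi->super_copy = calloc(1, SUPER_INFO_SIZE);
        fi->super_for_commit = calloc(1, SUPER_INFO_SIZE);
        if (!fi->super_copy || !fi->super_for_commit)
            goto fail;
        return fi;
    fail:
        free(fi->super_copy);       /* free(NULL) is safe */
        free(fi->super_for_commit);
        free(fi);
        return NULL;
    }
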
@@ -919,7 +1017,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
919 if (root->fs_info->fs_devices->rw_devices == 0) 1017 if (root->fs_info->fs_devices->rw_devices == 0)
920 return -EACCES; 1018 return -EACCES;
921 1019
922 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 1020 if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
923 return -EINVAL; 1021 return -EINVAL;
924 1022
925 ret = btrfs_cleanup_fs_roots(root->fs_info); 1023 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -1085,7 +1183,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1085static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1183static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1086{ 1184{
1087 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1185 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
1088 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1186 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1089 struct list_head *head = &root->fs_info->space_info; 1187 struct list_head *head = &root->fs_info->space_info;
1090 struct btrfs_space_info *found; 1188 struct btrfs_space_info *found;
1091 u64 total_used = 0; 1189 u64 total_used = 0;
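
The `&root->fs_info->super_copy` to `root->fs_info->super_copy` churn here and in the files below comes from super_copy and super_for_commit turning into kzalloc'd pointers instead of embedded structs. The subtle hazard in such a conversion is sizeof: the transaction.c hunk further down has to change sizeof(root->fs_info->super_copy) to sizeof(*root->fs_info->super_copy), or the commit memcpy would copy only a pointer's worth of bytes. A small demonstration (the 4096 size is illustrative):

    #include <stdio.h>

    struct super_block_copy { char data[4096]; };

    struct fs_info_embedded { struct super_block_copy super_copy; };
    struct fs_info_pointer  { struct super_block_copy *super_copy; };

    int main(void)
    {
        /* With the member embedded, sizeof(e.super_copy) is 4096. After
         * converting it to a pointer, the same expression silently becomes
         * sizeof(void *); every sizeof/memcpy site must switch to
         * sizeof(*p.super_copy). */
        struct fs_info_embedded e;
        struct fs_info_pointer p = { .super_copy = &e.super_copy };

        printf("embedded: %zu bytes\n", sizeof(e.super_copy));   /* 4096 */
        printf("pointer:  %zu bytes\n", sizeof(p.super_copy));   /* 8 on LP64 */
        printf("pointee:  %zu bytes\n", sizeof(*p.super_copy));  /* 4096 */
        return 0;
    }
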
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a155..29f782cc2cc9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -275,7 +275,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
275 */ 275 */
276 if (num_items > 0 && root != root->fs_info->chunk_root) { 276 if (num_items > 0 && root != root->fs_info->chunk_root) {
277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278 ret = btrfs_block_rsv_add(NULL, root, 278 ret = btrfs_block_rsv_add(root,
279 &root->fs_info->trans_block_rsv, 279 &root->fs_info->trans_block_rsv,
280 num_bytes); 280 num_bytes);
281 if (ret) 281 if (ret)
@@ -418,8 +418,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 418 struct btrfs_root *root)
419{ 419{
420 int ret; 420 int ret;
421 ret = btrfs_block_rsv_check(trans, root, 421
422 &root->fs_info->global_block_rsv, 0, 5); 422 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
423 return ret ? 1 : 0; 423 return ret ? 1 : 0;
424} 424}
425 425
@@ -427,17 +427,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
427 struct btrfs_root *root) 427 struct btrfs_root *root)
428{ 428{
429 struct btrfs_transaction *cur_trans = trans->transaction; 429 struct btrfs_transaction *cur_trans = trans->transaction;
430 struct btrfs_block_rsv *rsv = trans->block_rsv;
430 int updates; 431 int updates;
431 432
432 smp_mb(); 433 smp_mb();
433 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 434 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
434 return 1; 435 return 1;
435 436
437 /*
438 * We need to do this in case we're deleting csums so the global block
439 * rsv gets used instead of the csum block rsv.
440 */
441 trans->block_rsv = NULL;
442
436 updates = trans->delayed_ref_updates; 443 updates = trans->delayed_ref_updates;
437 trans->delayed_ref_updates = 0; 444 trans->delayed_ref_updates = 0;
438 if (updates) 445 if (updates)
439 btrfs_run_delayed_refs(trans, root, updates); 446 btrfs_run_delayed_refs(trans, root, updates);
440 447
448 trans->block_rsv = rsv;
449
441 return should_end_transaction(trans, root); 450 return should_end_transaction(trans, root);
442} 451}
443 452
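
The save/clear/restore of trans->block_rsv around btrfs_run_delayed_refs() makes the callee fall back to the global reservation while csums are deleted, then puts the handle's own rsv back for the caller. The shape, sketched with plain counters (the reservation semantics are simplified here):

    #include <stddef.h>

    struct rsv { long bytes; };

    struct handle {
        struct rsv *block_rsv;   /* NULL means "use the global rsv" */
    };

    static void run_work(struct handle *h, struct rsv *global_rsv)
    {
        struct rsv *pool = h->block_rsv ? h->block_rsv : global_rsv;
        pool->bytes -= 1;        /* stand-in for the real work */
    }

    static void run_work_from_global(struct handle *h, struct rsv *global_rsv)
    {
        struct rsv *saved = h->block_rsv;

        h->block_rsv = NULL;     /* force the fallback */
        run_work(h, global_rsv);
        h->block_rsv = saved;    /* restore for the caller */
    }
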
@@ -453,6 +462,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
453 return 0; 462 return 0;
454 } 463 }
455 464
465 btrfs_trans_release_metadata(trans, root);
466 trans->block_rsv = NULL;
456 while (count < 4) { 467 while (count < 4) {
457 unsigned long cur = trans->delayed_ref_updates; 468 unsigned long cur = trans->delayed_ref_updates;
458 trans->delayed_ref_updates = 0; 469 trans->delayed_ref_updates = 0;
@@ -473,8 +484,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
473 count++; 484 count++;
474 } 485 }
475 486
476 btrfs_trans_release_metadata(trans, root);
477
478 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 487 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
479 should_end_transaction(trans, root)) { 488 should_end_transaction(trans, root)) {
480 trans->transaction->blocked = 1; 489 trans->transaction->blocked = 1;
@@ -562,50 +571,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562int btrfs_write_marked_extents(struct btrfs_root *root, 571int btrfs_write_marked_extents(struct btrfs_root *root,
563 struct extent_io_tree *dirty_pages, int mark) 572 struct extent_io_tree *dirty_pages, int mark)
564{ 573{
565 int ret;
566 int err = 0; 574 int err = 0;
567 int werr = 0; 575 int werr = 0;
568 struct page *page; 576 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
569 struct inode *btree_inode = root->fs_info->btree_inode;
570 u64 start = 0; 577 u64 start = 0;
571 u64 end; 578 u64 end;
572 unsigned long index;
573 579
574 while (1) { 580 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
575 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 581 mark)) {
576 mark); 582 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
577 if (ret) 583 GFP_NOFS);
578 break; 584 err = filemap_fdatawrite_range(mapping, start, end);
579 while (start <= end) { 585 if (err)
580 cond_resched(); 586 werr = err;
581 587 cond_resched();
582 index = start >> PAGE_CACHE_SHIFT; 588 start = end + 1;
583 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
584 page = find_get_page(btree_inode->i_mapping, index);
585 if (!page)
586 continue;
587
588 btree_lock_page_hook(page);
589 if (!page->mapping) {
590 unlock_page(page);
591 page_cache_release(page);
592 continue;
593 }
594
595 if (PageWriteback(page)) {
596 if (PageDirty(page))
597 wait_on_page_writeback(page);
598 else {
599 unlock_page(page);
600 page_cache_release(page);
601 continue;
602 }
603 }
604 err = write_one_page(page, 0);
605 if (err)
606 werr = err;
607 page_cache_release(page);
608 }
609 } 589 }
610 if (err) 590 if (err)
611 werr = err; 591 werr = err;
@@ -621,39 +601,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
621int btrfs_wait_marked_extents(struct btrfs_root *root, 601int btrfs_wait_marked_extents(struct btrfs_root *root,
622 struct extent_io_tree *dirty_pages, int mark) 602 struct extent_io_tree *dirty_pages, int mark)
623{ 603{
624 int ret;
625 int err = 0; 604 int err = 0;
626 int werr = 0; 605 int werr = 0;
627 struct page *page; 606 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
628 struct inode *btree_inode = root->fs_info->btree_inode;
629 u64 start = 0; 607 u64 start = 0;
630 u64 end; 608 u64 end;
631 unsigned long index;
632 609
633 while (1) { 610 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
634 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 611 EXTENT_NEED_WAIT)) {
635 mark); 612 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
636 if (ret) 613 err = filemap_fdatawait_range(mapping, start, end);
637 break; 614 if (err)
638 615 werr = err;
639 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 616 cond_resched();
640 while (start <= end) { 617 start = end + 1;
641 index = start >> PAGE_CACHE_SHIFT;
642 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
643 page = find_get_page(btree_inode->i_mapping, index);
644 if (!page)
645 continue;
646 if (PageDirty(page)) {
647 btree_lock_page_hook(page);
648 wait_on_page_writeback(page);
649 err = write_one_page(page, 0);
650 if (err)
651 werr = err;
652 }
653 wait_on_page_writeback(page);
654 page_cache_release(page);
655 cond_resched();
656 }
657 } 618 }
658 if (err) 619 if (err)
659 werr = err; 620 werr = err;
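
Both loops above replace the old page-at-a-time walk with range-granular calls into the page cache: the write pass converts the caller's mark to EXTENT_NEED_WAIT as it issues filemap_fdatawrite_range(), and the wait pass walks exactly those EXTENT_NEED_WAIT ranges, clearing the bit before filemap_fdatawait_range(). A compact model of the two phases over a sorted array standing in for the extent_io tree, with the kernel calls left as comments:

    #include <stddef.h>

    struct range { unsigned long long start, end; unsigned flags; };

    #define F_DIRTY     0x1u
    #define F_NEED_WAIT 0x2u

    /* First range at or after "start" carrying any of "bits"; stands in
     * for find_first_extent_bit(). Assumes r[] is sorted by start. */
    static struct range *first_with_bits(struct range *r, size_t n,
                                         unsigned long long start,
                                         unsigned bits)
    {
        for (size_t i = 0; i < n; i++)
            if (r[i].end >= start && (r[i].flags & bits))
                return &r[i];
        return NULL;
    }

    /* Phase one (btrfs_write_marked_extents): issue writeback range by
     * range, converting the dirty mark to NEED_WAIT so phase two knows
     * exactly which ranges are in flight. */
    static void write_marked(struct range *r, size_t n)
    {
        unsigned long long start = 0;
        struct range *cur;

        while ((cur = first_with_bits(r, n, start, F_DIRTY))) {
            cur->flags = (cur->flags & ~F_DIRTY) | F_NEED_WAIT;
            /* filemap_fdatawrite_range(mapping, cur->start, cur->end) */
            start = cur->end + 1;
        }
    }

    /* Phase two (btrfs_wait_marked_extents): clear NEED_WAIT, then wait. */
    static void wait_marked(struct range *r, size_t n)
    {
        unsigned long long start = 0;
        struct range *cur;

        while ((cur = first_with_bits(r, n, start, F_NEED_WAIT))) {
            cur->flags &= ~F_NEED_WAIT;
            /* filemap_fdatawait_range(mapping, cur->start, cur->end) */
            start = cur->end + 1;
        }
    }
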
@@ -673,7 +634,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
673 634
674 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 635 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
675 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 636 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
676 return ret || ret2; 637
638 if (ret)
639 return ret;
640 if (ret2)
641 return ret2;
642 return 0;
677} 643}
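
The old `return ret || ret2` collapsed any failure to 1, which is not a negative errno; returning the first nonzero value keeps the real error code for callers. Reduced to its essence:

    /* "ret || ret2" turns -EIO into 1; a caller comparing against a
     * specific errno, or feeding the value to ERR_PTR(), then sees
     * garbage. Return the first real error instead. */
    static int combine_errors(int ret, int ret2)
    {
        if (ret)
            return ret;
        return ret2;
    }
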
678 644
679int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 645int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@@ -911,10 +877,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
911 } 877 }
912 878
913 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 879 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
914 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
915 880
916 if (to_reserve > 0) { 881 if (to_reserve > 0) {
917 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 882 ret = btrfs_block_rsv_add(root, &pending->block_rsv,
918 to_reserve); 883 to_reserve);
919 if (ret) { 884 if (ret) {
920 pending->error = ret; 885 pending->error = ret;
@@ -1002,7 +967,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1002 BUG_ON(IS_ERR(pending->snap)); 967 BUG_ON(IS_ERR(pending->snap));
1003 968
1004 btrfs_reloc_post_snapshot(trans, pending); 969 btrfs_reloc_post_snapshot(trans, pending);
1005 btrfs_orphan_post_snapshot(trans, pending);
1006fail: 970fail:
1007 kfree(new_root_item); 971 kfree(new_root_item);
1008 trans->block_rsv = rsv; 972 trans->block_rsv = rsv;
@@ -1032,7 +996,7 @@ static void update_super_roots(struct btrfs_root *root)
1032 struct btrfs_root_item *root_item; 996 struct btrfs_root_item *root_item;
1033 struct btrfs_super_block *super; 997 struct btrfs_super_block *super;
1034 998
1035 super = &root->fs_info->super_copy; 999 super = root->fs_info->super_copy;
1036 1000
1037 root_item = &root->fs_info->chunk_root->root_item; 1001 root_item = &root->fs_info->chunk_root->root_item;
1038 super->chunk_root = root_item->bytenr; 1002 super->chunk_root = root_item->bytenr;
@@ -1043,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root)
1043 super->root = root_item->bytenr; 1007 super->root = root_item->bytenr;
1044 super->generation = root_item->generation; 1008 super->generation = root_item->generation;
1045 super->root_level = root_item->level; 1009 super->root_level = root_item->level;
1046 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 1010 if (btrfs_test_opt(root, SPACE_CACHE))
1047 super->cache_generation = root_item->generation; 1011 super->cache_generation = root_item->generation;
1048} 1012}
1049 1013
@@ -1168,14 +1132,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1168 1132
1169 btrfs_run_ordered_operations(root, 0); 1133 btrfs_run_ordered_operations(root, 0);
1170 1134
1135 btrfs_trans_release_metadata(trans, root);
1136 trans->block_rsv = NULL;
1137
1171 /* make a pass through all the delayed refs we have so far 1138 /* make a pass through all the delayed refs we have so far
1172 * any running procs may add more while we are here 1139
1173 */ 1140 */
1174 ret = btrfs_run_delayed_refs(trans, root, 0); 1141 ret = btrfs_run_delayed_refs(trans, root, 0);
1175 BUG_ON(ret); 1142 BUG_ON(ret);
1176 1143
1177 btrfs_trans_release_metadata(trans, root);
1178
1179 cur_trans = trans->transaction; 1144 cur_trans = trans->transaction;
1180 /* 1145 /*
1181 * set the flushing flag so procs in this transaction have to 1146 * set the flushing flag so procs in this transaction have to
@@ -1341,12 +1306,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1341 update_super_roots(root); 1306 update_super_roots(root);
1342 1307
1343 if (!root->fs_info->log_root_recovering) { 1308 if (!root->fs_info->log_root_recovering) {
1344 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1309 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
1345 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); 1310 btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1346 } 1311 }
1347 1312
1348 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, 1313 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1349 sizeof(root->fs_info->super_copy)); 1314 sizeof(*root->fs_info->super_copy));
1350 1315
1351 trans->transaction->blocked = 0; 1316 trans->transaction->blocked = 0;
1352 spin_lock(&root->fs_info->trans_lock); 1317 spin_lock(&root->fs_info->trans_lock);
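
btrfs_commit_transaction() now releases the handle's unused metadata reservation, and clears trans->block_rsv, before running the delayed refs rather than after; presumably this lets the heavy commit work draw on the returned space. A toy version of the reordering:

    struct pool { long free_bytes; };
    struct txn  { struct pool *pool; long reserved; };

    /* Hand back whatever the transaction reserved but did not use. */
    static void release_metadata(struct txn *t)
    {
        t->pool->free_bytes += t->reserved;
        t->reserved = 0;
    }

    static void commit(struct txn *t)
    {
        release_metadata(t);    /* moved ahead of the delayed-ref run */
        /* run_delayed_refs(), snapshot creation and the rest follow,
         * with the handle's spare reservation already back in the pool. */
    }
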
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 786639fca067..f4d81c06d48f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
276 struct walk_control *wc, u64 gen) 276 struct walk_control *wc, u64 gen)
277{ 277{
278 if (wc->pin) 278 if (wc->pin)
279 btrfs_pin_extent(log->fs_info->extent_root, 279 btrfs_pin_extent_for_log_replay(wc->trans,
280 eb->start, eb->len, 0); 280 log->fs_info->extent_root,
281 eb->start, eb->len);
281 282
282 if (btrfs_buffer_uptodate(eb, gen)) { 283 if (btrfs_buffer_uptodate(eb, gen)) {
283 if (wc->write) 284 if (wc->write)
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1760 1761
1761 WARN_ON(root_owner != 1762 WARN_ON(root_owner !=
1762 BTRFS_TREE_LOG_OBJECTID); 1763 BTRFS_TREE_LOG_OBJECTID);
1763 ret = btrfs_free_reserved_extent(root, 1764 ret = btrfs_free_and_pin_reserved_extent(root,
1764 bytenr, blocksize); 1765 bytenr, blocksize);
1765 BUG_ON(ret); 1766 BUG_ON(ret);
1766 } 1767 }
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1828 btrfs_tree_unlock(next); 1829 btrfs_tree_unlock(next);
1829 1830
1830 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1831 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1831 ret = btrfs_free_reserved_extent(root, 1832 ret = btrfs_free_and_pin_reserved_extent(root,
1832 path->nodes[*level]->start, 1833 path->nodes[*level]->start,
1833 path->nodes[*level]->len); 1834 path->nodes[*level]->len);
1834 BUG_ON(ret); 1835 BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1897 1898
1898 WARN_ON(log->root_key.objectid != 1899 WARN_ON(log->root_key.objectid !=
1899 BTRFS_TREE_LOG_OBJECTID); 1900 BTRFS_TREE_LOG_OBJECTID);
1900 ret = btrfs_free_reserved_extent(log, next->start, 1901 ret = btrfs_free_and_pin_reserved_extent(log, next->start,
1901 next->len); 1902 next->len);
1902 BUG_ON(ret); 1903 BUG_ON(ret);
1903 } 1904 }
@@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2013 /* wait for previous tree log sync to complete */ 2014 /* wait for previous tree log sync to complete */
2014 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2015 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2015 wait_log_commit(trans, root, root->log_transid - 1); 2016 wait_log_commit(trans, root, root->log_transid - 1);
2016
2017 while (1) { 2017 while (1) {
2018 unsigned long batch = root->log_batch; 2018 unsigned long batch = root->log_batch;
2019 if (root->log_multiple_pids) { 2019 /* when we're on an ssd, just kick the log commit out */
2020 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2020 mutex_unlock(&root->log_mutex); 2021 mutex_unlock(&root->log_mutex);
2021 schedule_timeout_uninterruptible(1); 2022 schedule_timeout_uninterruptible(1);
2022 mutex_lock(&root->log_mutex); 2023 mutex_lock(&root->log_mutex);
@@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2117 BUG_ON(ret); 2118 BUG_ON(ret);
2118 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2119 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2119 2120
2120 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2121 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2121 log_root_tree->node->start); 2122 log_root_tree->node->start);
2122 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2123 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2123 btrfs_header_level(log_root_tree->node)); 2124 btrfs_header_level(log_root_tree->node));
2124 2125
2125 log_root_tree->log_batch = 0; 2126 log_root_tree->log_batch = 0;
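
The log-commit hunk adds an SSD test to the batching heuristic: on rotating media a writer that sees other PIDs in the log yields for a tick so more of them pile into one commit, while on an SSD the commit is cheap enough that, per the added comment, it is simply kicked out. The predicate, isolated:

    #include <stdbool.h>

    /* Batching writers into one log commit: worth a short sleep only on
     * spinning media, and only when other PIDs are actually writing. */
    struct log_state { unsigned long batch; bool multiple_pids; bool ssd; };

    static bool should_wait_for_batch(const struct log_state *s)
    {
        return !s->ssd && s->multiple_pids;
    }
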
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 18baac5a3f6c..f8e2943101a1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path,
366 } 366 }
367 INIT_LIST_HEAD(&device->dev_alloc_list); 367 INIT_LIST_HEAD(&device->dev_alloc_list);
368 368
369 /* init readahead state */
370 spin_lock_init(&device->reada_lock);
371 device->reada_curr_zone = NULL;
372 atomic_set(&device->reada_in_flight, 0);
373 device->reada_next = 0;
374 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
375 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
376
369 mutex_lock(&fs_devices->device_list_mutex); 377 mutex_lock(&fs_devices->device_list_mutex);
370 list_add_rcu(&device->dev_list, &fs_devices->devices); 378 list_add_rcu(&device->dev_list, &fs_devices->devices);
371 mutex_unlock(&fs_devices->device_list_mutex); 379 mutex_unlock(&fs_devices->device_list_mutex);
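
Each device now carries readahead state for the new reada.c, initialized before the device is published on the RCU list; the radix trees take GFP_NOFS with __GFP_WAIT masked off, presumably because insertions happen under reada_lock and must not sleep. A userspace analogue of giving every field a sane empty state up front, with the radix trees reduced to a pointer:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stddef.h>

    struct reada_zone;

    /* Per-device readahead bookkeeping, mirroring the fields added to
     * struct btrfs_device for reada.c. */
    struct device_reada {
        pthread_spinlock_t lock;
        atomic_int in_flight;
        unsigned long long next;
        struct reada_zone *curr_zone;
    };

    static int device_reada_init(struct device_reada *r)
    {
        int err = pthread_spin_init(&r->lock, PTHREAD_PROCESS_PRIVATE);

        if (err)
            return err;
        atomic_init(&r->in_flight, 0);
        r->next = 0;
        r->curr_zone = NULL;
        return 0;
    }
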
@@ -597,10 +605,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
597 set_blocksize(bdev, 4096); 605 set_blocksize(bdev, 4096);
598 606
599 bh = btrfs_read_dev_super(bdev); 607 bh = btrfs_read_dev_super(bdev);
600 if (!bh) { 608 if (!bh)
601 ret = -EINVAL;
602 goto error_close; 609 goto error_close;
603 }
604 610
605 disk_super = (struct btrfs_super_block *)bh->b_data; 611 disk_super = (struct btrfs_super_block *)bh->b_data;
606 devid = btrfs_stack_device_id(&disk_super->dev_item); 612 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +661,7 @@ error:
655 continue; 661 continue;
656 } 662 }
657 if (fs_devices->open_devices == 0) { 663 if (fs_devices->open_devices == 0) {
658 ret = -EIO; 664 ret = -EINVAL;
659 goto out; 665 goto out;
660 } 666 }
661 fs_devices->seeding = seeding; 667 fs_devices->seeding = seeding;
@@ -1013,8 +1019,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1013 } 1019 }
1014 BUG_ON(ret); 1020 BUG_ON(ret);
1015 1021
1016 if (device->bytes_used > 0) 1022 if (device->bytes_used > 0) {
1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1023 u64 len = btrfs_dev_extent_length(leaf, extent);
1024 device->bytes_used -= len;
1025 spin_lock(&root->fs_info->free_chunk_lock);
1026 root->fs_info->free_chunk_space += len;
1027 spin_unlock(&root->fs_info->free_chunk_lock);
1028 }
1018 ret = btrfs_del_item(trans, root, path); 1029 ret = btrfs_del_item(trans, root, path);
1019 1030
1020out: 1031out:
@@ -1356,6 +1367,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1356 if (ret) 1367 if (ret)
1357 goto error_undo; 1368 goto error_undo;
1358 1369
1370 spin_lock(&root->fs_info->free_chunk_lock);
1371 root->fs_info->free_chunk_space = device->total_bytes -
1372 device->bytes_used;
1373 spin_unlock(&root->fs_info->free_chunk_lock);
1374
1359 device->in_fs_metadata = 0; 1375 device->in_fs_metadata = 0;
1360 btrfs_scrub_cancel_dev(root, device); 1376 btrfs_scrub_cancel_dev(root, device);
1361 1377
@@ -1387,8 +1403,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1387 call_rcu(&device->rcu, free_device); 1403 call_rcu(&device->rcu, free_device);
1388 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1404 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1389 1405
1390 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1406 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1391 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1407 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1392 1408
1393 if (cur_devices->open_devices == 0) { 1409 if (cur_devices->open_devices == 0) {
1394 struct btrfs_fs_devices *fs_devices; 1410 struct btrfs_fs_devices *fs_devices;
@@ -1450,7 +1466,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1450 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1466 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1451 struct btrfs_fs_devices *old_devices; 1467 struct btrfs_fs_devices *old_devices;
1452 struct btrfs_fs_devices *seed_devices; 1468 struct btrfs_fs_devices *seed_devices;
1453 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1469 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1454 struct btrfs_device *device; 1470 struct btrfs_device *device;
1455 u64 super_flags; 1471 u64 super_flags;
1456 1472
@@ -1691,15 +1707,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1691 root->fs_info->fs_devices->num_can_discard++; 1707 root->fs_info->fs_devices->num_can_discard++;
1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1708 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 1709
1710 spin_lock(&root->fs_info->free_chunk_lock);
1711 root->fs_info->free_chunk_space += device->total_bytes;
1712 spin_unlock(&root->fs_info->free_chunk_lock);
1713
1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1714 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 root->fs_info->fs_devices->rotating = 1; 1715 root->fs_info->fs_devices->rotating = 1;
1696 1716
1697 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1717 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1698 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1718 btrfs_set_super_total_bytes(root->fs_info->super_copy,
1699 total_bytes + device->total_bytes); 1719 total_bytes + device->total_bytes);
1700 1720
1701 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1721 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1702 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1722 btrfs_set_super_num_devices(root->fs_info->super_copy,
1703 total_bytes + 1); 1723 total_bytes + 1);
1704 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1724 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1705 1725
@@ -1790,7 +1810,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1790 struct btrfs_device *device, u64 new_size) 1810 struct btrfs_device *device, u64 new_size)
1791{ 1811{
1792 struct btrfs_super_block *super_copy = 1812 struct btrfs_super_block *super_copy =
1793 &device->dev_root->fs_info->super_copy; 1813 device->dev_root->fs_info->super_copy;
1794 u64 old_total = btrfs_super_total_bytes(super_copy); 1814 u64 old_total = btrfs_super_total_bytes(super_copy);
1795 u64 diff = new_size - device->total_bytes; 1815 u64 diff = new_size - device->total_bytes;
1796 1816
@@ -1849,7 +1869,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1869static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1850 chunk_offset) 1870 chunk_offset)
1851{ 1871{
1852 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1872 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1853 struct btrfs_disk_key *disk_key; 1873 struct btrfs_disk_key *disk_key;
1854 struct btrfs_chunk *chunk; 1874 struct btrfs_chunk *chunk;
1855 u8 *ptr; 1875 u8 *ptr;
@@ -2175,7 +2195,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2175 bool retried = false; 2195 bool retried = false;
2176 struct extent_buffer *l; 2196 struct extent_buffer *l;
2177 struct btrfs_key key; 2197 struct btrfs_key key;
2178 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2198 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2179 u64 old_total = btrfs_super_total_bytes(super_copy); 2199 u64 old_total = btrfs_super_total_bytes(super_copy);
2180 u64 old_size = device->total_bytes; 2200 u64 old_size = device->total_bytes;
2181 u64 diff = device->total_bytes - new_size; 2201 u64 diff = device->total_bytes - new_size;
@@ -2192,8 +2212,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2192 lock_chunks(root); 2212 lock_chunks(root);
2193 2213
2194 device->total_bytes = new_size; 2214 device->total_bytes = new_size;
2195 if (device->writeable) 2215 if (device->writeable) {
2196 device->fs_devices->total_rw_bytes -= diff; 2216 device->fs_devices->total_rw_bytes -= diff;
2217 spin_lock(&root->fs_info->free_chunk_lock);
2218 root->fs_info->free_chunk_space -= diff;
2219 spin_unlock(&root->fs_info->free_chunk_lock);
2220 }
2197 unlock_chunks(root); 2221 unlock_chunks(root);
2198 2222
2199again: 2223again:
@@ -2257,6 +2281,9 @@ again:
2257 device->total_bytes = old_size; 2281 device->total_bytes = old_size;
2258 if (device->writeable) 2282 if (device->writeable)
2259 device->fs_devices->total_rw_bytes += diff; 2283 device->fs_devices->total_rw_bytes += diff;
2284 spin_lock(&root->fs_info->free_chunk_lock);
2285 root->fs_info->free_chunk_space += diff;
2286 spin_unlock(&root->fs_info->free_chunk_lock);
2260 unlock_chunks(root); 2287 unlock_chunks(root);
2261 goto done; 2288 goto done;
2262 } 2289 }
@@ -2292,7 +2319,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2292 struct btrfs_key *key, 2319 struct btrfs_key *key,
2293 struct btrfs_chunk *chunk, int item_size) 2320 struct btrfs_chunk *chunk, int item_size)
2294{ 2321{
2295 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2322 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2296 struct btrfs_disk_key disk_key; 2323 struct btrfs_disk_key disk_key;
2297 u32 array_size; 2324 u32 array_size;
2298 u8 *ptr; 2325 u8 *ptr;
@@ -2615,6 +2642,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2615 index++; 2642 index++;
2616 } 2643 }
2617 2644
2645 spin_lock(&extent_root->fs_info->free_chunk_lock);
2646 extent_root->fs_info->free_chunk_space -= (stripe_size *
2647 map->num_stripes);
2648 spin_unlock(&extent_root->fs_info->free_chunk_lock);
2649
2618 index = 0; 2650 index = 0;
2619 stripe = &chunk->stripe; 2651 stripe = &chunk->stripe;
2620 while (index < map->num_stripes) { 2652 while (index < map->num_stripes) {
@@ -3626,15 +3658,20 @@ static int read_one_dev(struct btrfs_root *root,
3626 fill_device_from_item(leaf, dev_item, device); 3658 fill_device_from_item(leaf, dev_item, device);
3627 device->dev_root = root->fs_info->dev_root; 3659 device->dev_root = root->fs_info->dev_root;
3628 device->in_fs_metadata = 1; 3660 device->in_fs_metadata = 1;
3629 if (device->writeable) 3661 if (device->writeable) {
3630 device->fs_devices->total_rw_bytes += device->total_bytes; 3662 device->fs_devices->total_rw_bytes += device->total_bytes;
3663 spin_lock(&root->fs_info->free_chunk_lock);
3664 root->fs_info->free_chunk_space += device->total_bytes -
3665 device->bytes_used;
3666 spin_unlock(&root->fs_info->free_chunk_lock);
3667 }
3631 ret = 0; 3668 ret = 0;
3632 return ret; 3669 return ret;
3633} 3670}
3634 3671
3635int btrfs_read_sys_array(struct btrfs_root *root) 3672int btrfs_read_sys_array(struct btrfs_root *root)
3636{ 3673{
3637 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3674 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3638 struct extent_buffer *sb; 3675 struct extent_buffer *sb;
3639 struct btrfs_disk_key *disk_key; 3676 struct btrfs_disk_key *disk_key;
3640 struct btrfs_chunk *chunk; 3677 struct btrfs_chunk *chunk;
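
Every hunk in this file that changes how many bytes a device has allocated also adjusts fs_info->free_chunk_space under free_chunk_lock: up when a device is added or a dev extent is freed, down when chunks are carved out or a device shrinks. The counter discipline, as a minimal sketch (what consumes the counter, likely the statfs-side space estimate, sits outside this diff):

    #include <pthread.h>

    /* A shared byte counter updated under its own spinlock, as
     * fs_info->free_chunk_space is in the hunks above. */
    struct space {
        pthread_spinlock_t lock;
        unsigned long long free_bytes;
    };

    static void space_add(struct space *s, unsigned long long n)
    {
        pthread_spin_lock(&s->lock);
        s->free_bytes += n;        /* device added / dev extent freed */
        pthread_spin_unlock(&s->lock);
    }

    static void space_sub(struct space *s, unsigned long long n)
    {
        pthread_spin_lock(&s->lock);
        s->free_bytes -= n;        /* chunk allocated / device shrunk */
        pthread_spin_unlock(&s->lock);
    }
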
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 71f4f3f67495..ab5b1c49f352 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,14 @@ struct btrfs_device {
92 struct btrfs_work work; 92 struct btrfs_work work;
93 struct rcu_head rcu; 93 struct rcu_head rcu;
94 struct work_struct rcu_work; 94 struct work_struct rcu_work;
95
96 /* readahead state */
97 spinlock_t reada_lock;
98 atomic_t reada_in_flight;
99 u64 reada_next;
100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents;
95}; 103};
96 104
97struct btrfs_fs_devices { 105struct btrfs_fs_devices {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 69565e5fc6a0..a76e41c04b71 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
127again: 127again:
128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), 128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
129 name, name_len, value, size); 129 name, name_len, value, size);
130 /*
131 * If we're setting an xattr to a new value but the new value is say
132 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
133 * back from split_leaf. This is because it thinks we'll be extending
134 * the existing item size, but we're asking for enough space to add the
135 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
136 * the rest of the function figure it out.
137 */
138 if (ret == -EOVERFLOW)
139 ret = -EEXIST;
140
130 if (ret == -EEXIST) { 141 if (ret == -EEXIST) {
131 if (flags & XATTR_CREATE) 142 if (flags & XATTR_CREATE)
132 goto out; 143 goto out;
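
The comment explains the quirk; operationally the insert path now reports EEXIST both for a genuine duplicate and for a maximum-sized replacement that tripped split_leaf's EOVERFLOW, and the caller's flags decide what happens next. A sketch of that dispatch, using the standard XATTR_CREATE value but an invented function name:

    #include <errno.h>

    #define XATTR_CREATE 0x1   /* standard "fail if the attr exists" flag */

    static int handle_insert_result(int ret, int flags)
    {
        if (ret == -EOVERFLOW)             /* max-sized value hit split_leaf */
            ret = -EEXIST;
        if (ret == -EEXIST && (flags & XATTR_CREATE))
            return -EEXIST;                /* caller wanted a fresh xattr */
        if (ret == -EEXIST)
            return 0;                      /* proceed to the replace path */
        return ret;
    }
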
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index e76bfeb68267..30acd22147e1 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -351,9 +351,7 @@ static int
351build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) 351build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
352{ 352{
353 unsigned int dlen; 353 unsigned int dlen;
354 unsigned int wlen; 354 unsigned int size = 2 * sizeof(struct ntlmssp2_name);
355 unsigned int size = 6 * sizeof(struct ntlmssp2_name);
356 __le64 curtime;
357 char *defdmname = "WORKGROUP"; 355 char *defdmname = "WORKGROUP";
358 unsigned char *blobptr; 356 unsigned char *blobptr;
359 struct ntlmssp2_name *attrptr; 357 struct ntlmssp2_name *attrptr;
@@ -365,15 +363,14 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
365 } 363 }
366 364
367 dlen = strlen(ses->domainName); 365 dlen = strlen(ses->domainName);
368 wlen = strlen(ses->server->hostname);
369 366
370 /* The length of this blob is a size which is 367 /*
371 * six times the size of a structure which holds name/size + 368 * The length of this blob is two times the size of a
372 * two times the unicode length of a domain name + 369 * structure (av pair) which holds name/size
373 * two times the unicode length of a server name + 370 * ( for NTLMSSP_AV_NB_DOMAIN_NAME followed by NTLMSSP_AV_EOL ) +
374 * size of a timestamp (which is 8 bytes). 371 * unicode length of a netbios domain name
375 */ 372 */
376 ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8; 373 ses->auth_key.len = size + 2 * dlen;
377 ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); 374 ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
378 if (!ses->auth_key.response) { 375 if (!ses->auth_key.response) {
379 ses->auth_key.len = 0; 376 ses->auth_key.len = 0;
@@ -384,44 +381,15 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
384 blobptr = ses->auth_key.response; 381 blobptr = ses->auth_key.response;
385 attrptr = (struct ntlmssp2_name *) blobptr; 382 attrptr = (struct ntlmssp2_name *) blobptr;
386 383
384 /*
385 * As defined in MS-NTLM 3.3.2, just this av pair field
386 * is sufficient as part of the temp
387 */
387 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME); 388 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
388 attrptr->length = cpu_to_le16(2 * dlen); 389 attrptr->length = cpu_to_le16(2 * dlen);
389 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); 390 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
390 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp); 391 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
391 392
392 blobptr += 2 * dlen;
393 attrptr = (struct ntlmssp2_name *) blobptr;
394
395 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME);
396 attrptr->length = cpu_to_le16(2 * wlen);
397 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
398 cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
399
400 blobptr += 2 * wlen;
401 attrptr = (struct ntlmssp2_name *) blobptr;
402
403 attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME);
404 attrptr->length = cpu_to_le16(2 * dlen);
405 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
406 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
407
408 blobptr += 2 * dlen;
409 attrptr = (struct ntlmssp2_name *) blobptr;
410
411 attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME);
412 attrptr->length = cpu_to_le16(2 * wlen);
413 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
414 cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
415
416 blobptr += 2 * wlen;
417 attrptr = (struct ntlmssp2_name *) blobptr;
418
419 attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP);
420 attrptr->length = cpu_to_le16(sizeof(__le64));
421 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
422 curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
423 memcpy(blobptr, &curtime, sizeof(__le64));
424
425 return 0; 393 return 0;
426} 394}
427 395
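
The rewritten blob is just one NTLMSSP_AV_NB_DOMAIN_NAME pair plus the terminating AV_EOL pair that the size calculation reserves and kzalloc leaves zeroed, which the cited spec text says is sufficient for the temp. A userspace reconstruction of that layout; the av-pair type value follows MS-NLMP, and the ASCII loop stands in for cifs_strtoUCS():

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #define AV_EOL            0
    #define AV_NB_DOMAIN_NAME 2   /* MsvAvNbDomainName in MS-NLMP */

    struct av_pair {
        uint16_t type;    /* little-endian on the wire */
        uint16_t length;  /* bytes of value that follow */
    };

    static unsigned char *build_blob(const char *domain, size_t *out_len)
    {
        size_t dlen = strlen(domain);
        size_t len = 2 * sizeof(struct av_pair) + 2 * dlen;
        unsigned char *blob = calloc(1, len);
        struct av_pair *av = (struct av_pair *)blob;

        if (!blob)
            return NULL;
        av->type = AV_NB_DOMAIN_NAME;         /* cpu_to_le16() in the kernel */
        av->length = (uint16_t)(2 * dlen);
        for (size_t i = 0; i < dlen; i++) {   /* naive ASCII -> UTF-16LE */
            blob[sizeof(*av) + 2 * i] = (unsigned char)domain[i];
            blob[sizeof(*av) + 2 * i + 1] = 0;
        }
        /* the zeroed tail already encodes AV_EOL: type 0, length 0 */
        *out_len = len;
        return blob;
    }
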
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f93eb948d071..54b8f1e7da94 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -548,6 +548,12 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
548 struct inode *dir = dentry->d_inode; 548 struct inode *dir = dentry->d_inode;
549 struct dentry *child; 549 struct dentry *child;
550 550
551 if (!dir) {
552 dput(dentry);
553 dentry = ERR_PTR(-ENOENT);
554 break;
555 }
556
551 /* skip separators */ 557 /* skip separators */
552 while (*s == sep) 558 while (*s == sep)
553 s++; 559 s++;
@@ -563,10 +569,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
563 mutex_unlock(&dir->i_mutex); 569 mutex_unlock(&dir->i_mutex);
564 dput(dentry); 570 dput(dentry);
565 dentry = child; 571 dentry = child;
566 if (!dentry->d_inode) {
567 dput(dentry);
568 dentry = ERR_PTR(-ENOENT);
569 }
570 } while (!IS_ERR(dentry)); 572 } while (!IS_ERR(dentry));
571 _FreeXid(xid); 573 _FreeXid(xid);
572 kfree(full_path); 574 kfree(full_path);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index aac37d99a487..a80f7bd97b90 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -4079,7 +4079,8 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
4079 T2_FNEXT_RSP_PARMS *parms; 4079 T2_FNEXT_RSP_PARMS *parms;
4080 char *response_data; 4080 char *response_data;
4081 int rc = 0; 4081 int rc = 0;
4082 int bytes_returned, name_len; 4082 int bytes_returned;
4083 unsigned int name_len;
4083 __u16 params, byte_count; 4084 __u16 params, byte_count;
4084 4085
4085 cFYI(1, "In FindNext"); 4086 cFYI(1, "In FindNext");
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 633c246b6775..71beb0201970 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1298,7 +1298,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1298 /* ignore */ 1298 /* ignore */
1299 } else if (strnicmp(data, "guest", 5) == 0) { 1299 } else if (strnicmp(data, "guest", 5) == 0) {
1300 /* ignore */ 1300 /* ignore */
1301 } else if (strnicmp(data, "rw", 2) == 0) { 1301 } else if (strnicmp(data, "rw", 2) == 0 && strlen(data) == 2) {
1302 /* ignore */ 1302 /* ignore */
1303 } else if (strnicmp(data, "ro", 2) == 0) { 1303 } else if (strnicmp(data, "ro", 2) == 0) {
1304 /* ignore */ 1304 /* ignore */
@@ -1401,7 +1401,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1401 vol->server_ino = 1; 1401 vol->server_ino = 1;
1402 } else if (strnicmp(data, "noserverino", 9) == 0) { 1402 } else if (strnicmp(data, "noserverino", 9) == 0) {
1403 vol->server_ino = 0; 1403 vol->server_ino = 0;
1404 } else if (strnicmp(data, "rwpidforward", 4) == 0) { 1404 } else if (strnicmp(data, "rwpidforward", 12) == 0) {
1405 vol->rwpidforward = 1; 1405 vol->rwpidforward = 1;
1406 } else if (strnicmp(data, "cifsacl", 7) == 0) { 1406 } else if (strnicmp(data, "cifsacl", 7) == 0) {
1407 vol->cifs_acl = 1; 1407 vol->cifs_acl = 1;
@@ -2018,7 +2018,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
2018 warned_on_ntlm = true; 2018 warned_on_ntlm = true;
2019 cERROR(1, "default security mechanism requested. The default " 2019 cERROR(1, "default security mechanism requested. The default "
2020 "security mechanism will be upgraded from ntlm to " 2020 "security mechanism will be upgraded from ntlm to "
2021 "ntlmv2 in kernel release 3.1"); 2021 "ntlmv2 in kernel release 3.2");
2022 } 2022 }
2023 ses->overrideSecFlg = volume_info->secFlg; 2023 ses->overrideSecFlg = volume_info->secFlg;
2024 2024
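
The two option fixes above are the same bug in opposite directions: matching "rw" with length 2 also accepted "rwpidforward", and matching "rwpidforward" with length 4 accepted anything starting with "rwpi". The kernel keeps strnicmp and patches the lengths; a generalized token matcher shows why the comparison length must be the full token and the input must end there:

    #include <stdio.h>
    #include <string.h>
    #include <strings.h>

    static int token_is(const char *data, const char *tok)
    {
        size_t n = strlen(tok);

        return strncasecmp(data, tok, n) == 0 && strlen(data) == n;
    }

    int main(void)
    {
        printf("%d\n", token_is("rwpidforward", "rw"));  /* 0: no false match */
        printf("%d\n", token_is("rw", "rw"));            /* 1 */
        return 0;
    }
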
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 04da6acde85d..12661e1deedd 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1134,7 +1134,7 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
1134 return bh; 1134 return bh;
1135 if (buffer_uptodate(bh)) 1135 if (buffer_uptodate(bh))
1136 return bh; 1136 return bh;
1137 ll_rw_block(READ_META, 1, &bh); 1137 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
1138 wait_on_buffer(bh); 1138 wait_on_buffer(bh);
1139 if (buffer_uptodate(bh)) 1139 if (buffer_uptodate(bh))
1140 return bh; 1140 return bh;
@@ -2807,7 +2807,7 @@ make_io:
2807 trace_ext3_load_inode(inode); 2807 trace_ext3_load_inode(inode);
2808 get_bh(bh); 2808 get_bh(bh);
2809 bh->b_end_io = end_buffer_read_sync; 2809 bh->b_end_io = end_buffer_read_sync;
2810 submit_bh(READ_META, bh); 2810 submit_bh(READ | REQ_META | REQ_PRIO, bh);
2811 wait_on_buffer(bh); 2811 wait_on_buffer(bh);
2812 if (!buffer_uptodate(bh)) { 2812 if (!buffer_uptodate(bh)) {
2813 ext3_error(inode->i_sb, "ext3_get_inode_loc", 2813 ext3_error(inode->i_sb, "ext3_get_inode_loc",
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 5571708b6a58..0629e09f6511 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -922,7 +922,8 @@ restart:
922 bh = ext3_getblk(NULL, dir, b++, 0, &err); 922 bh = ext3_getblk(NULL, dir, b++, 0, &err);
923 bh_use[ra_max] = bh; 923 bh_use[ra_max] = bh;
924 if (bh) 924 if (bh)
925 ll_rw_block(READ_META, 1, &bh); 925 ll_rw_block(READ | REQ_META | REQ_PRIO,
926 1, &bh);
926 } 927 }
927 } 928 }
928 if ((bh = bh_use[ra_ptr++]) == NULL) 929 if ((bh = bh_use[ra_ptr++]) == NULL)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 18d2558b7624..986e2388f031 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -647,7 +647,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
647 return bh; 647 return bh;
648 if (buffer_uptodate(bh)) 648 if (buffer_uptodate(bh))
649 return bh; 649 return bh;
650 ll_rw_block(READ_META, 1, &bh); 650 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
651 wait_on_buffer(bh); 651 wait_on_buffer(bh);
652 if (buffer_uptodate(bh)) 652 if (buffer_uptodate(bh))
653 return bh; 653 return bh;
@@ -3298,7 +3298,7 @@ make_io:
3298 trace_ext4_load_inode(inode); 3298 trace_ext4_load_inode(inode);
3299 get_bh(bh); 3299 get_bh(bh);
3300 bh->b_end_io = end_buffer_read_sync; 3300 bh->b_end_io = end_buffer_read_sync;
3301 submit_bh(READ_META, bh); 3301 submit_bh(READ | REQ_META | REQ_PRIO, bh);
3302 wait_on_buffer(bh); 3302 wait_on_buffer(bh);
3303 if (!buffer_uptodate(bh)) { 3303 if (!buffer_uptodate(bh)) {
3304 EXT4_ERROR_INODE_BLOCK(inode, block, 3304 EXT4_ERROR_INODE_BLOCK(inode, block,
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f8068c7bae9f..1c924faeb6c8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -922,7 +922,8 @@ restart:
922 bh = ext4_getblk(NULL, dir, b++, 0, &err); 922 bh = ext4_getblk(NULL, dir, b++, 0, &err);
923 bh_use[ra_max] = bh; 923 bh_use[ra_max] = bh;
924 if (bh) 924 if (bh)
925 ll_rw_block(READ_META, 1, &bh); 925 ll_rw_block(READ | REQ_META | REQ_PRIO,
926 1, &bh);
926 } 927 }
927 } 928 }
928 if ((bh = bh_use[ra_ptr++]) == NULL) 929 if ((bh = bh_use[ra_ptr++]) == NULL)
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 85c62923ee29..598646434362 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -624,9 +624,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
624 bh->b_end_io = end_buffer_write_sync; 624 bh->b_end_io = end_buffer_write_sync;
625 get_bh(bh); 625 get_bh(bh);
626 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 626 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
627 submit_bh(WRITE_SYNC | REQ_META, bh); 627 submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
628 else 628 else
629 submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); 629 submit_bh(WRITE_FLUSH_FUA | REQ_META | REQ_PRIO, bh);
630 wait_on_buffer(bh); 630 wait_on_buffer(bh);
631 631
632 if (!buffer_uptodate(bh)) 632 if (!buffer_uptodate(bh))
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 747238cd9f96..be29858900f6 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
37{ 37{
38 struct buffer_head *bh, *head; 38 struct buffer_head *bh, *head;
39 int nr_underway = 0; 39 int nr_underway = 0;
40 int write_op = REQ_META | 40 int write_op = REQ_META | REQ_PRIO |
41 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); 41 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
42 42
43 BUG_ON(!PageLocked(page)); 43 BUG_ON(!PageLocked(page));
@@ -225,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
225 } 225 }
226 bh->b_end_io = end_buffer_read_sync; 226 bh->b_end_io = end_buffer_read_sync;
227 get_bh(bh); 227 get_bh(bh);
228 submit_bh(READ_SYNC | REQ_META, bh); 228 submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
229 if (!(flags & DIO_WAIT)) 229 if (!(flags & DIO_WAIT))
230 return 0; 230 return 0;
231 231
@@ -435,7 +435,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
435 if (buffer_uptodate(first_bh)) 435 if (buffer_uptodate(first_bh))
436 goto out; 436 goto out;
437 if (!buffer_locked(first_bh)) 437 if (!buffer_locked(first_bh))
438 ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); 438 ll_rw_block(READ_SYNC | REQ_META | REQ_PRIO, 1, &first_bh);
439 439
440 dblock++; 440 dblock++;
441 extlen--; 441 extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3bc073a4cf82..079587e53849 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -224,7 +224,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
224 224
225 bio->bi_end_io = end_bio_io_page; 225 bio->bi_end_io = end_bio_io_page;
226 bio->bi_private = page; 226 bio->bi_private = page;
227 submit_bio(READ_SYNC | REQ_META, bio); 227 submit_bio(READ_SYNC | REQ_META | REQ_PRIO, bio);
228 wait_on_page_locked(page); 228 wait_on_page_locked(page);
229 bio_put(bio); 229 bio_put(bio);
230 if (!PageUptodate(page)) { 230 if (!PageUptodate(page)) {
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 42e8d23bc047..0e8bb13381e4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -709,7 +709,7 @@ get_a_page:
709 set_buffer_uptodate(bh); 709 set_buffer_uptodate(bh);
710 710
711 if (!buffer_uptodate(bh)) { 711 if (!buffer_uptodate(bh)) {
712 ll_rw_block(READ_META, 1, &bh); 712 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
713 wait_on_buffer(bh); 713 wait_on_buffer(bh);
714 if (!buffer_uptodate(bh)) 714 if (!buffer_uptodate(bh))
715 goto unlock_out; 715 goto unlock_out;
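
The READ_META replacement running through ext3, ext4 and gfs2 splits what used to be one macro into explicit bits: REQ_META stays a pure annotation that the request carries metadata, and REQ_PRIO separately asks the I/O scheduler for priority treatment. A toy of the flag composition (the values here are illustrative, not the kernel's):

    #include <stdio.h>

    #define REQ_READ  0x0u        /* READ is 0 in the block layer too */
    #define REQ_META  (1u << 0)   /* "this is metadata" annotation */
    #define REQ_PRIO  (1u << 1)   /* "boost this" for the scheduler */

    static void submit(unsigned flags)
    {
        printf("meta=%d prio=%d\n",
               !!(flags & REQ_META), !!(flags & REQ_PRIO));
    }

    int main(void)
    {
        submit(REQ_READ | REQ_META | REQ_PRIO);  /* metadata read, boosted */
        return 0;
    }
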
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index c106ca22e812..d24a9b666a23 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -344,6 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
344 struct inode *root, *inode; 344 struct inode *root, *inode;
345 struct qstr str; 345 struct qstr str;
346 struct nls_table *nls = NULL; 346 struct nls_table *nls = NULL;
347 u64 last_fs_block, last_fs_page;
347 int err; 348 int err;
348 349
349 err = -EINVAL; 350 err = -EINVAL;
@@ -399,9 +400,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
399 if (!sbi->rsrc_clump_blocks) 400 if (!sbi->rsrc_clump_blocks)
400 sbi->rsrc_clump_blocks = 1; 401 sbi->rsrc_clump_blocks = 1;
401 402
402 err = generic_check_addressable(sbi->alloc_blksz_shift, 403 err = -EFBIG;
403 sbi->total_blocks); 404 last_fs_block = sbi->total_blocks - 1;
404 if (err) { 405 last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >>
406 PAGE_CACHE_SHIFT;
407
408 if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) ||
409 (last_fs_page > (pgoff_t)(~0ULL))) {
405 printk(KERN_ERR "hfs: filesystem size too large.\n"); 410 printk(KERN_ERR "hfs: filesystem size too large.\n");
406 goto out_free_vhdr; 411 goto out_free_vhdr;
407 } 412 }
@@ -525,8 +530,8 @@ out_close_cat_tree:
525out_close_ext_tree: 530out_close_ext_tree:
526 hfs_btree_close(sbi->ext_tree); 531 hfs_btree_close(sbi->ext_tree);
527out_free_vhdr: 532out_free_vhdr:
528 kfree(sbi->s_vhdr); 533 kfree(sbi->s_vhdr_buf);
529 kfree(sbi->s_backup_vhdr); 534 kfree(sbi->s_backup_vhdr_buf);
530out_unload_nls: 535out_unload_nls:
531 unload_nls(sbi->nls); 536 unload_nls(sbi->nls);
532 unload_nls(nls); 537 unload_nls(nls);
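
The hfsplus change replaces generic_check_addressable() with the arithmetic spelled out: the last filesystem block must still be expressible as a 512-byte sector index once scaled by alloc_blksz_shift, and the last byte's page number must fit pgoff_t. A compilable version with 32-bit stand-in typedefs, where the limits can actually trip:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t sector32_t;   /* sector_t on a 32-bit kernel */
    typedef uint32_t pgoff32_t;    /* pgoff_t on 32-bit */

    #define PAGE_SHIFT 12

    /* Allocation blocks are at least 512 bytes, so blksz_shift >= 9. */
    static int addressable(uint64_t total_blocks, unsigned blksz_shift)
    {
        uint64_t last_block = total_blocks - 1;
        uint64_t last_page = (last_block << blksz_shift) >> PAGE_SHIFT;

        if (last_block > ((uint64_t)(sector32_t)~0) >> (blksz_shift - 9))
            return 0;   /* block number overflows sector_t */
        if (last_page > (pgoff32_t)~0)
            return 0;   /* page index overflows pgoff_t */
        return 1;
    }

    int main(void)
    {
        /* 2^30 blocks of 4 KiB = 4 TiB: the last 512-byte sector index
         * needs 33 bits, too big for a 32-bit sector_t. Prints 0. */
        printf("%d\n", addressable(1ULL << 30, 12));
        return 0;
    }
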
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 10e515a0d452..7daf4b852d1c 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -272,9 +272,9 @@ reread:
272 return 0; 272 return 0;
273 273
274out_free_backup_vhdr: 274out_free_backup_vhdr:
275 kfree(sbi->s_backup_vhdr); 275 kfree(sbi->s_backup_vhdr_buf);
276out_free_vhdr: 276out_free_vhdr:
277 kfree(sbi->s_vhdr); 277 kfree(sbi->s_vhdr_buf);
278out: 278out:
279 return error; 279 return error;
280} 280}
diff --git a/fs/namei.c b/fs/namei.c
index b52bc685465f..0b3138de2a3b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -721,12 +721,6 @@ static int follow_automount(struct path *path, unsigned flags,
721 if (!path->dentry->d_op || !path->dentry->d_op->d_automount) 721 if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
722 return -EREMOTE; 722 return -EREMOTE;
723 723
724 /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
725 * and this is the terminal part of the path.
726 */
727 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT))
728 return -EISDIR; /* we actually want to stop here */
729
730 /* We don't want to mount if someone's just doing a stat - 724 /* We don't want to mount if someone's just doing a stat -
731 * unless they're stat'ing a directory and appended a '/' to 725 * unless they're stat'ing a directory and appended a '/' to
732 * the name. 726 * the name.
@@ -739,7 +733,7 @@ static int follow_automount(struct path *path, unsigned flags,
739 * of the daemon to instantiate them before they can be used. 733 * of the daemon to instantiate them before they can be used.
740 */ 734 */
741 if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | 735 if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
742 LOOKUP_OPEN | LOOKUP_CREATE)) && 736 LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
743 path->dentry->d_inode) 737 path->dentry->d_inode)
744 return -EISDIR; 738 return -EISDIR;
745 739
@@ -2616,6 +2610,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2616 if (!dir->i_op->rmdir) 2610 if (!dir->i_op->rmdir)
2617 return -EPERM; 2611 return -EPERM;
2618 2612
2613 dget(dentry);
2619 mutex_lock(&dentry->d_inode->i_mutex); 2614 mutex_lock(&dentry->d_inode->i_mutex);
2620 2615
2621 error = -EBUSY; 2616 error = -EBUSY;
@@ -2636,6 +2631,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2636 2631
2637out: 2632out:
2638 mutex_unlock(&dentry->d_inode->i_mutex); 2633 mutex_unlock(&dentry->d_inode->i_mutex);
2634 dput(dentry);
2639 if (!error) 2635 if (!error)
2640 d_delete(dentry); 2636 d_delete(dentry);
2641 return error; 2637 return error;
@@ -3025,6 +3021,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3025 if (error) 3021 if (error)
3026 return error; 3022 return error;
3027 3023
3024 dget(new_dentry);
3028 if (target) 3025 if (target)
3029 mutex_lock(&target->i_mutex); 3026 mutex_lock(&target->i_mutex);
3030 3027
@@ -3045,6 +3042,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3045out: 3042out:
3046 if (target) 3043 if (target)
3047 mutex_unlock(&target->i_mutex); 3044 mutex_unlock(&target->i_mutex);
3045 dput(new_dentry);
3048 if (!error) 3046 if (!error)
3049 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3047 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
3050 d_move(old_dentry,new_dentry); 3048 d_move(old_dentry,new_dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index 22bfe8273c68..b4febb29d3bb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1757,7 +1757,7 @@ static int do_loopback(struct path *path, char *old_name,
1757 return err; 1757 return err;
1758 if (!old_name || !*old_name) 1758 if (!old_name || !*old_name)
1759 return -EINVAL; 1759 return -EINVAL;
1760 err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); 1760 err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
1761 if (err) 1761 if (err)
1762 return err; 1762 return err;
1763 1763
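
The namei.c and namespace.c hunks belong together: the special LOOKUP_NO_AUTOMOUNT case is gone, a plain stat-style lookup now stops short of automounting simply because it carries none of the listed intent flags, and callers that do want the automount, such as do_loopback(), pass LOOKUP_AUTOMOUNT explicitly. The decision, reduced to a predicate (flag values illustrative):

    #define LOOKUP_PARENT    0x01
    #define LOOKUP_DIRECTORY 0x02
    #define LOOKUP_OPEN      0x04
    #define LOOKUP_CREATE    0x08
    #define LOOKUP_AUTOMOUNT 0x10

    static int want_automount(unsigned flags, int have_inode)
    {
        if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
                       LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
            have_inode)
            return 0;   /* plain stat: leave the automount point alone */
        return 1;
    }
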
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 1ec1a85fa71c..3e93e9a1bee1 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -56,6 +56,9 @@ enum nfs4_session_state {
56 NFS4_SESSION_DRAINING, 56 NFS4_SESSION_DRAINING,
57}; 57};
58 58
59#define NFS4_RENEW_TIMEOUT 0x01
60#define NFS4_RENEW_DELEGATION_CB 0x02
61
59struct nfs4_minor_version_ops { 62struct nfs4_minor_version_ops {
60 u32 minor_version; 63 u32 minor_version;
61 64
@@ -225,7 +228,7 @@ struct nfs4_state_recovery_ops {
225}; 228};
226 229
227struct nfs4_state_maintenance_ops { 230struct nfs4_state_maintenance_ops {
228 int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *); 231 int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned);
229 struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); 232 struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
230 int (*renew_lease)(struct nfs_client *, struct rpc_cred *); 233 int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
231}; 234};
@@ -237,8 +240,6 @@ extern const struct inode_operations nfs4_dir_inode_operations;
237extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); 240extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
238extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); 241extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
239extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); 242extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
240extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
241extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 243extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 244extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
244extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); 245extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
@@ -349,6 +350,7 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
349extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); 350extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
350extern void nfs4_schedule_lease_recovery(struct nfs_client *); 351extern void nfs4_schedule_lease_recovery(struct nfs_client *);
351extern void nfs4_schedule_state_manager(struct nfs_client *); 352extern void nfs4_schedule_state_manager(struct nfs_client *);
353extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
352extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); 354extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
353extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); 355extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
354extern void nfs41_handle_recall_slot(struct nfs_client *clp); 356extern void nfs41_handle_recall_slot(struct nfs_client *clp);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8c77039e7a81..4700fae1ada0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3374,9 +3374,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3374 3374
3375 if (task->tk_status < 0) { 3375 if (task->tk_status < 0) {
3376 /* Unless we're shutting down, schedule state recovery! */ 3376 /* Unless we're shutting down, schedule state recovery! */
3377 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) 3377 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
3378 return;
3379 if (task->tk_status != NFS4ERR_CB_PATH_DOWN) {
3378 nfs4_schedule_lease_recovery(clp); 3380 nfs4_schedule_lease_recovery(clp);
3379 return; 3381 return;
3382 }
3383 nfs4_schedule_path_down_recovery(clp);
3380 } 3384 }
3381 do_renew_lease(clp, timestamp); 3385 do_renew_lease(clp, timestamp);
3382} 3386}
@@ -3386,7 +3390,7 @@ static const struct rpc_call_ops nfs4_renew_ops = {
3386 .rpc_release = nfs4_renew_release, 3390 .rpc_release = nfs4_renew_release,
3387}; 3391};
3388 3392
3389int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) 3393static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
3390{ 3394{
3391 struct rpc_message msg = { 3395 struct rpc_message msg = {
3392 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], 3396 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -3395,9 +3399,11 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
3395 }; 3399 };
3396 struct nfs4_renewdata *data; 3400 struct nfs4_renewdata *data;
3397 3401
3402 if (renew_flags == 0)
3403 return 0;
3398 if (!atomic_inc_not_zero(&clp->cl_count)) 3404 if (!atomic_inc_not_zero(&clp->cl_count))
3399 return -EIO; 3405 return -EIO;
3400 data = kmalloc(sizeof(*data), GFP_KERNEL); 3406 data = kmalloc(sizeof(*data), GFP_NOFS);
3401 if (data == NULL) 3407 if (data == NULL)
3402 return -ENOMEM; 3408 return -ENOMEM;
3403 data->client = clp; 3409 data->client = clp;
@@ -3406,7 +3412,7 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
3406 &nfs4_renew_ops, data); 3412 &nfs4_renew_ops, data);
3407} 3413}
3408 3414
3409int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) 3415static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
3410{ 3416{
3411 struct rpc_message msg = { 3417 struct rpc_message msg = {
3412 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], 3418 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -5504,11 +5510,13 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
5504 return rpc_run_task(&task_setup_data); 5510 return rpc_run_task(&task_setup_data);
5505} 5511}
5506 5512
5507static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) 5513static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
5508{ 5514{
5509 struct rpc_task *task; 5515 struct rpc_task *task;
5510 int ret = 0; 5516 int ret = 0;
5511 5517
5518 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
5519 return 0;
5512 task = _nfs41_proc_sequence(clp, cred); 5520 task = _nfs41_proc_sequence(clp, cred);
5513 if (IS_ERR(task)) 5521 if (IS_ERR(task))
5514 ret = PTR_ERR(task); 5522 ret = PTR_ERR(task);
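[Editor's note: both renewal paths above now take a renew_flags argument and decide for themselves whether an RPC is worth sending: the v4.0 RENEW goes out for any reason, while the v4.1 SEQUENCE is only sent for a lease timeout. A standalone sketch of that gating follows; the flag names appear in the hunks above, but their values and everything else here are assumptions for illustration.]

#include <stdio.h>

#define NFS4_RENEW_TIMEOUT       0x01   /* values assumed; only the */
#define NFS4_RENEW_DELEGATION_CB 0x02   /* names appear in the diff */

/* v4.0: any reason to renew sends a RENEW */
static int proc_async_renew(unsigned renew_flags)
{
	if (renew_flags == 0)
		return 0;
	printf("v4.0: send RENEW (flags=0x%x)\n", renew_flags);
	return 0;
}

/* v4.1: only a lease timeout warrants a SEQUENCE */
static int proc_async_sequence(unsigned renew_flags)
{
	if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
		return 0;
	printf("v4.1: send SEQUENCE\n");
	return 0;
}

int main(void)
{
	proc_async_renew(NFS4_RENEW_DELEGATION_CB);    /* RENEW sent */
	proc_async_sequence(NFS4_RENEW_DELEGATION_CB); /* skipped */
	return 0;
}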
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index df8e7f3ca56d..dc484c0eae7f 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -60,6 +60,7 @@ nfs4_renew_state(struct work_struct *work)
60 struct rpc_cred *cred; 60 struct rpc_cred *cred;
61 long lease; 61 long lease;
62 unsigned long last, now; 62 unsigned long last, now;
63 unsigned renew_flags = 0;
63 64
64 ops = clp->cl_mvops->state_renewal_ops; 65 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 66 dprintk("%s: start\n", __func__);
@@ -72,18 +73,23 @@ nfs4_renew_state(struct work_struct *work)
72 last = clp->cl_last_renewal; 73 last = clp->cl_last_renewal;
73 now = jiffies; 74 now = jiffies;
74 /* Are we close to a lease timeout? */ 75 /* Are we close to a lease timeout? */
75 if (time_after(now, last + lease/3)) { 76 if (time_after(now, last + lease/3))
77 renew_flags |= NFS4_RENEW_TIMEOUT;
78 if (nfs_delegations_present(clp))
79 renew_flags |= NFS4_RENEW_DELEGATION_CB;
80
81 if (renew_flags != 0) {
76 cred = ops->get_state_renewal_cred_locked(clp); 82 cred = ops->get_state_renewal_cred_locked(clp);
77 spin_unlock(&clp->cl_lock); 83 spin_unlock(&clp->cl_lock);
78 if (cred == NULL) { 84 if (cred == NULL) {
79 if (!nfs_delegations_present(clp)) { 85 if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) {
80 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 86 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
81 goto out; 87 goto out;
82 } 88 }
83 nfs_expire_all_delegations(clp); 89 nfs_expire_all_delegations(clp);
84 } else { 90 } else {
85 /* Queue an asynchronous RENEW. */ 91 /* Queue an asynchronous RENEW. */
86 ops->sched_state_renewal(clp, cred); 92 ops->sched_state_renewal(clp, cred, renew_flags);
87 put_rpccred(cred); 93 put_rpccred(cred);
88 goto out_exp; 94 goto out_exp;
89 } 95 }
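[Editor's note: the hunk above computes the renew_flags that the previous sketch consumes. A userspace mirror of that decision, with the same assumed flag values:]

#include <stdio.h>

#define NFS4_RENEW_TIMEOUT       0x01   /* assumed values, as above */
#define NFS4_RENEW_DELEGATION_CB 0x02

/* Mirror of the decision in nfs4_renew_state(): renew when a third of
 * the lease has elapsed, and independently when held delegations need
 * the callback path kept alive. */
static unsigned compute_renew_flags(unsigned long now, unsigned long last,
				    long lease, int delegations)
{
	unsigned flags = 0;

	if (now > last + lease / 3)
		flags |= NFS4_RENEW_TIMEOUT;
	if (delegations)
		flags |= NFS4_RENEW_DELEGATION_CB;
	return flags;
}

int main(void)
{
	/* Fresh lease but delegations held: flags = 0x02, so a renewal
	 * is still attempted, and a missing cred now only expires the
	 * delegations instead of declaring the whole lease expired. */
	printf("flags=0x%x\n", compute_renew_flags(60, 50, 90, 1));
	return 0;
}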
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 72ab97ef3d61..39914be40b03 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1038,6 +1038,12 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
1038 nfs4_schedule_state_manager(clp); 1038 nfs4_schedule_state_manager(clp);
1039} 1039}
1040 1040
1041void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
1042{
1043 nfs_handle_cb_pathdown(clp);
1044 nfs4_schedule_state_manager(clp);
1045}
1046
1041static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) 1047static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
1042{ 1048{
1043 1049
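[Editor's note: a toy contrast of the two recovery entry points the state code now exposes; the printf bodies stand in for the real state-manager work and are assumptions, not the kernel implementation.]

#include <stdio.h>

static void schedule_state_manager(void) { printf("run state manager\n"); }

static void schedule_lease_recovery(void)
{
	printf("treat lease as expired\n");	 /* heavyweight path */
	schedule_state_manager();
}

static void schedule_path_down_recovery(void)
{
	printf("rebind the callback channel\n"); /* targeted path */
	schedule_state_manager();
}

int main(void)
{
	printf("-- NFS4ERR_CB_PATH_DOWN --\n");
	schedule_path_down_recovery();	/* no longer a full lease recovery */
	printf("-- other lease errors --\n");
	schedule_lease_recovery();
	return 0;
}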
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b961ceac66b4..5b19b6aabe18 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2035,9 +2035,6 @@ static inline void nfs_initialise_sb(struct super_block *sb)
2035 sb->s_blocksize = nfs_block_bits(server->wsize, 2035 sb->s_blocksize = nfs_block_bits(server->wsize,
2036 &sb->s_blocksize_bits); 2036 &sb->s_blocksize_bits);
2037 2037
2038 if (server->flags & NFS_MOUNT_NOAC)
2039 sb->s_flags |= MS_SYNCHRONOUS;
2040
2041 sb->s_bdi = &server->backing_dev_info; 2038 sb->s_bdi = &server->backing_dev_info;
2042 2039
2043 nfs_super_set_maxbytes(sb, server->maxfilesize); 2040 nfs_super_set_maxbytes(sb, server->maxfilesize);
@@ -2249,6 +2246,10 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2249 if (server->flags & NFS_MOUNT_UNSHARED) 2246 if (server->flags & NFS_MOUNT_UNSHARED)
2250 compare_super = NULL; 2247 compare_super = NULL;
2251 2248
2249 /* -o noac implies -o sync */
2250 if (server->flags & NFS_MOUNT_NOAC)
2251 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2252
2252 /* Get a superblock - note that we may end up sharing one that already exists */ 2253 /* Get a superblock - note that we may end up sharing one that already exists */
2253 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); 2254 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
2254 if (IS_ERR(s)) { 2255 if (IS_ERR(s)) {
@@ -2361,6 +2362,10 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
2361 if (server->flags & NFS_MOUNT_UNSHARED) 2362 if (server->flags & NFS_MOUNT_UNSHARED)
2362 compare_super = NULL; 2363 compare_super = NULL;
2363 2364
2365 /* -o noac implies -o sync */
2366 if (server->flags & NFS_MOUNT_NOAC)
2367 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2368
2364 /* Get a superblock - note that we may end up sharing one that already exists */ 2369 /* Get a superblock - note that we may end up sharing one that already exists */
2365 s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); 2370 s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
2366 if (IS_ERR(s)) { 2371 if (IS_ERR(s)) {
@@ -2628,6 +2633,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2628 if (server->flags & NFS4_MOUNT_UNSHARED) 2633 if (server->flags & NFS4_MOUNT_UNSHARED)
2629 compare_super = NULL; 2634 compare_super = NULL;
2630 2635
2636 /* -o noac implies -o sync */
2637 if (server->flags & NFS_MOUNT_NOAC)
2638 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2639
2631 /* Get a superblock - note that we may end up sharing one that already exists */ 2640 /* Get a superblock - note that we may end up sharing one that already exists */
2632 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); 2641 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
2633 if (IS_ERR(s)) { 2642 if (IS_ERR(s)) {
@@ -2789,7 +2798,7 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2789 goto out_put_mnt_ns; 2798 goto out_put_mnt_ns;
2790 2799
2791 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, 2800 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
2792 export_path, LOOKUP_FOLLOW, &path); 2801 export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
2793 2802
2794 nfs_referral_loop_unprotect(); 2803 nfs_referral_loop_unprotect();
2795 put_mnt_ns(ns_private); 2804 put_mnt_ns(ns_private);
@@ -2916,6 +2925,10 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
2916 if (server->flags & NFS4_MOUNT_UNSHARED) 2925 if (server->flags & NFS4_MOUNT_UNSHARED)
2917 compare_super = NULL; 2926 compare_super = NULL;
2918 2927
2928 /* -o noac implies -o sync */
2929 if (server->flags & NFS_MOUNT_NOAC)
2930 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2931
2919 /* Get a superblock - note that we may end up sharing one that already exists */ 2932 /* Get a superblock - note that we may end up sharing one that already exists */
2920 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); 2933 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
2921 if (IS_ERR(s)) { 2934 if (IS_ERR(s)) {
@@ -3003,6 +3016,10 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
3003 if (server->flags & NFS4_MOUNT_UNSHARED) 3016 if (server->flags & NFS4_MOUNT_UNSHARED)
3004 compare_super = NULL; 3017 compare_super = NULL;
3005 3018
3019 /* -o noac implies -o sync */
3020 if (server->flags & NFS_MOUNT_NOAC)
3021 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
3022
3006 /* Get a superblock - note that we may end up sharing one that already exists */ 3023 /* Get a superblock - note that we may end up sharing one that already exists */
3007 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); 3024 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
3008 if (IS_ERR(s)) { 3025 if (IS_ERR(s)) {
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b39b37f80913..c9bd2a6b7d4b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -958,7 +958,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head
958 if (!data) 958 if (!data)
959 goto out_bad; 959 goto out_bad;
960 data->pagevec[0] = page; 960 data->pagevec[0] = page;
961 nfs_write_rpcsetup(req, data, wsize, offset, desc->pg_ioflags); 961 nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags);
962 list_add(&data->list, res); 962 list_add(&data->list, res);
963 requests++; 963 requests++;
964 nbytes -= len; 964 nbytes -= len;
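[Editor's note: the one-word fix above matters because the final chunk of a multi-page write can be shorter than wsize. A minimal sketch of the splitting arithmetic (the loop shape is inferred from the hunk's context lines):]

#include <stdio.h>

/* Each RPC must be set up with the size of *this* chunk (len), not the
 * full wsize, or the final short chunk is over-counted. */
int main(void)
{
	unsigned nbytes = 10000, wsize = 4096, offset = 0;

	while (nbytes) {
		unsigned len = nbytes < wsize ? nbytes : wsize;

		printf("RPC: offset=%u len=%u\n", offset, len); /* was wsize */
		nbytes -= len;
		offset += len;
	}
	return 0;
}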
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 25b6a887adb9..5afaa58a8630 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -877,30 +877,54 @@ struct numa_maps_private {
877 struct numa_maps md; 877 struct numa_maps md;
878}; 878};
879 879
880static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty) 880static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
881 unsigned long nr_pages)
881{ 882{
882 int count = page_mapcount(page); 883 int count = page_mapcount(page);
883 884
884 md->pages++; 885 md->pages += nr_pages;
885 if (pte_dirty || PageDirty(page)) 886 if (pte_dirty || PageDirty(page))
886 md->dirty++; 887 md->dirty += nr_pages;
887 888
888 if (PageSwapCache(page)) 889 if (PageSwapCache(page))
889 md->swapcache++; 890 md->swapcache += nr_pages;
890 891
891 if (PageActive(page) || PageUnevictable(page)) 892 if (PageActive(page) || PageUnevictable(page))
892 md->active++; 893 md->active += nr_pages;
893 894
894 if (PageWriteback(page)) 895 if (PageWriteback(page))
895 md->writeback++; 896 md->writeback += nr_pages;
896 897
897 if (PageAnon(page)) 898 if (PageAnon(page))
898 md->anon++; 899 md->anon += nr_pages;
899 900
900 if (count > md->mapcount_max) 901 if (count > md->mapcount_max)
901 md->mapcount_max = count; 902 md->mapcount_max = count;
902 903
903 md->node[page_to_nid(page)]++; 904 md->node[page_to_nid(page)] += nr_pages;
905}
906
907static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
908 unsigned long addr)
909{
910 struct page *page;
911 int nid;
912
913 if (!pte_present(pte))
914 return NULL;
915
916 page = vm_normal_page(vma, addr, pte);
917 if (!page)
918 return NULL;
919
920 if (PageReserved(page))
921 return NULL;
922
923 nid = page_to_nid(page);
924 if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
925 return NULL;
926
927 return page;
904} 928}
905 929
906static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 930static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
@@ -912,26 +936,32 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
912 pte_t *pte; 936 pte_t *pte;
913 937
914 md = walk->private; 938 md = walk->private;
915 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 939 spin_lock(&walk->mm->page_table_lock);
916 do { 940 if (pmd_trans_huge(*pmd)) {
917 struct page *page; 941 if (pmd_trans_splitting(*pmd)) {
918 int nid; 942 spin_unlock(&walk->mm->page_table_lock);
943 wait_split_huge_page(md->vma->anon_vma, pmd);
944 } else {
945 pte_t huge_pte = *(pte_t *)pmd;
946 struct page *page;
919 947
920 if (!pte_present(*pte)) 948 page = can_gather_numa_stats(huge_pte, md->vma, addr);
921 continue; 949 if (page)
950 gather_stats(page, md, pte_dirty(huge_pte),
951 HPAGE_PMD_SIZE/PAGE_SIZE);
952 spin_unlock(&walk->mm->page_table_lock);
953 return 0;
954 }
955 } else {
956 spin_unlock(&walk->mm->page_table_lock);
957 }
922 958
923 page = vm_normal_page(md->vma, addr, *pte); 959 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
960 do {
961 struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
924 if (!page) 962 if (!page)
925 continue; 963 continue;
926 964 gather_stats(page, md, pte_dirty(*pte), 1);
927 if (PageReserved(page))
928 continue;
929
930 nid = page_to_nid(page);
931 if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
932 continue;
933
934 gather_stats(page, md, pte_dirty(*pte));
935 965
936 } while (pte++, addr += PAGE_SIZE, addr != end); 966 } while (pte++, addr += PAGE_SIZE, addr != end);
937 pte_unmap_unlock(orig_pte, ptl); 967 pte_unmap_unlock(orig_pte, ptl);
@@ -952,7 +982,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
952 return 0; 982 return 0;
953 983
954 md = walk->private; 984 md = walk->private;
955 gather_stats(page, md, pte_dirty(*pte)); 985 gather_stats(page, md, pte_dirty(*pte), 1);
956 return 0; 986 return 0;
957} 987}
958 988
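[Editor's note: the core of the task_mmu.c rework is that gather_stats() now accounts nr_pages base pages per call, so a transparent huge page is handled by one huge-pmd visit instead of per-pte walking. A standalone sketch with assumed page sizes and a trimmed-down stats struct:]

#include <stdio.h>

#define PAGE_SIZE	4096u
#define HPAGE_PMD_SIZE	(2u * 1024 * 1024)	/* typical x86-64 THP */

struct numa_stats { unsigned long pages, anon; };

/* One call now accounts for nr_pages base pages at once. */
static void gather_stats(struct numa_stats *md, int anon,
			 unsigned long nr_pages)
{
	md->pages += nr_pages;
	if (anon)
		md->anon += nr_pages;
}

int main(void)
{
	struct numa_stats md = {0};

	gather_stats(&md, 1, 1);			  /* ordinary pte */
	gather_stats(&md, 1, HPAGE_PMD_SIZE / PAGE_SIZE); /* one huge pmd */
	printf("pages=%lu anon=%lu\n", md.pages, md.anon);
	return 0;
}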
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b34bdb25490c..10b6be3ca280 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -355,7 +355,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
355 * resolution (think about autofs) and thus deadlocks could arise. 355 * resolution (think about autofs) and thus deadlocks could arise.
356 */ 356 */
357 if (cmds == Q_QUOTAON) { 357 if (cmds == Q_QUOTAON) {
358 ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path); 358 ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
359 if (ret) 359 if (ret)
360 pathp = ERR_PTR(ret); 360 pathp = ERR_PTR(ret);
361 else 361 else
diff --git a/fs/stat.c b/fs/stat.c
index ba5316ffac61..78a3aa83c7ea 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -81,8 +81,6 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
81 81
82 if (!(flag & AT_SYMLINK_NOFOLLOW)) 82 if (!(flag & AT_SYMLINK_NOFOLLOW))
83 lookup_flags |= LOOKUP_FOLLOW; 83 lookup_flags |= LOOKUP_FOLLOW;
84 if (flag & AT_NO_AUTOMOUNT)
85 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
86 if (flag & AT_EMPTY_PATH) 84 if (flag & AT_EMPTY_PATH)
87 lookup_flags |= LOOKUP_EMPTY; 85 lookup_flags |= LOOKUP_EMPTY;
88 86
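[Editor's note: the quota.c and stat.c hunks are two sides of the same flag inversion: crossing automount points is now opt-in via LOOKUP_AUTOMOUNT, so vfs_fstatat() drops the LOOKUP_NO_AUTOMOUNT translation and callers that do want to descend, like quotactl, must say so. A toy model; the flag values are assumptions:]

#include <stdio.h>

#define LOOKUP_FOLLOW    0x01	/* values assumed for the demo */
#define LOOKUP_AUTOMOUNT 0x04

static void lookup(const char *who, unsigned flags)
{
	printf("%s: %scross automount points\n", who,
	       (flags & LOOKUP_AUTOMOUNT) ? "" : "do not ");
}

int main(void)
{
	lookup("stat(2)", LOOKUP_FOLLOW);		      /* stays put */
	lookup("quotactl", LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT); /* descends */
	return 0;
}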
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 63e971e2b837..8c37dde4c521 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1300,6 +1300,7 @@ xfs_end_io_direct_write(
1300 bool is_async) 1300 bool is_async)
1301{ 1301{
1302 struct xfs_ioend *ioend = iocb->private; 1302 struct xfs_ioend *ioend = iocb->private;
1303 struct inode *inode = ioend->io_inode;
1303 1304
1304 /* 1305 /*
1305 * blockdev_direct_IO can return an error even after the I/O 1306 * blockdev_direct_IO can return an error even after the I/O
@@ -1331,7 +1332,7 @@ xfs_end_io_direct_write(
1331 } 1332 }
1332 1333
1333 /* XXX: probably should move into the real I/O completion handler */ 1334 /* XXX: probably should move into the real I/O completion handler */
1334 inode_dio_done(ioend->io_inode); 1335 inode_dio_done(inode);
1335} 1336}
1336 1337
1337STATIC ssize_t 1338STATIC ssize_t
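[Editor's note: the xfs_aops.c hunk hoists the inode dereference ahead of the I/O completion handling, presumably because the ioend can be freed once the completion runs. A toy illustration of why the local copy matters; every name here is a stand-in, not the XFS implementation:]

#include <stdio.h>
#include <stdlib.h>

struct ioend { int inode; };

int main(void)
{
	struct ioend *ioend = malloc(sizeof(*ioend));

	if (!ioend)
		return 1;
	ioend->inode = 42;

	int inode = ioend->inode;	/* take the value early ... */
	free(ioend);			/* ... completion may free ioend */
	printf("inode_dio_done(%d)\n", inode); /* safe: ioend untouched */
	return 0;
}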
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index cac2ecfa6746..ef43fce519a1 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -629,7 +629,7 @@ xfs_buf_item_push(
629 * the xfsbufd to get this buffer written. We have to unlock the buffer 629 * the xfsbufd to get this buffer written. We have to unlock the buffer
630 * to allow the xfsbufd to write it, too. 630 * to allow the xfsbufd to write it, too.
631 */ 631 */
632STATIC void 632STATIC bool
633xfs_buf_item_pushbuf( 633xfs_buf_item_pushbuf(
634 struct xfs_log_item *lip) 634 struct xfs_log_item *lip)
635{ 635{
@@ -643,6 +643,7 @@ xfs_buf_item_pushbuf(
643 643
644 xfs_buf_delwri_promote(bp); 644 xfs_buf_delwri_promote(bp);
645 xfs_buf_relse(bp); 645 xfs_buf_relse(bp);
646 return true;
646} 647}
647 648
648STATIC void 649STATIC void
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 9e0e2fa3f2c8..bb3f71d236d2 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -183,13 +183,14 @@ xfs_qm_dqunpin_wait(
183 * search the buffer cache can be a time consuming thing, and AIL lock is a 183 * search the buffer cache can be a time consuming thing, and AIL lock is a
184 * spinlock. 184 * spinlock.
185 */ 185 */
186STATIC void 186STATIC bool
187xfs_qm_dquot_logitem_pushbuf( 187xfs_qm_dquot_logitem_pushbuf(
188 struct xfs_log_item *lip) 188 struct xfs_log_item *lip)
189{ 189{
190 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); 190 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
191 struct xfs_dquot *dqp = qlip->qli_dquot; 191 struct xfs_dquot *dqp = qlip->qli_dquot;
192 struct xfs_buf *bp; 192 struct xfs_buf *bp;
193 bool ret = true;
193 194
194 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 195 ASSERT(XFS_DQ_IS_LOCKED(dqp));
195 196
@@ -201,17 +202,20 @@ xfs_qm_dquot_logitem_pushbuf(
201 if (completion_done(&dqp->q_flush) || 202 if (completion_done(&dqp->q_flush) ||
202 !(lip->li_flags & XFS_LI_IN_AIL)) { 203 !(lip->li_flags & XFS_LI_IN_AIL)) {
203 xfs_dqunlock(dqp); 204 xfs_dqunlock(dqp);
204 return; 205 return true;
205 } 206 }
206 207
207 bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, 208 bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
208 dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); 209 dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
209 xfs_dqunlock(dqp); 210 xfs_dqunlock(dqp);
210 if (!bp) 211 if (!bp)
211 return; 212 return true;
212 if (XFS_BUF_ISDELAYWRITE(bp)) 213 if (XFS_BUF_ISDELAYWRITE(bp))
213 xfs_buf_delwri_promote(bp); 214 xfs_buf_delwri_promote(bp);
215 if (xfs_buf_ispinned(bp))
216 ret = false;
214 xfs_buf_relse(bp); 217 xfs_buf_relse(bp);
218 return ret;
215} 219}
216 220
217/* 221/*
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 588406dc6a35..836ad80d4f2b 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -708,13 +708,14 @@ xfs_inode_item_committed(
708 * marked delayed write. If that's the case, we'll promote it and that will 708 * marked delayed write. If that's the case, we'll promote it and that will
709 * allow the caller to write the buffer by triggering the xfsbufd to run. 709 * allow the caller to write the buffer by triggering the xfsbufd to run.
710 */ 710 */
711STATIC void 711STATIC bool
712xfs_inode_item_pushbuf( 712xfs_inode_item_pushbuf(
713 struct xfs_log_item *lip) 713 struct xfs_log_item *lip)
714{ 714{
715 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 715 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
716 struct xfs_inode *ip = iip->ili_inode; 716 struct xfs_inode *ip = iip->ili_inode;
717 struct xfs_buf *bp; 717 struct xfs_buf *bp;
718 bool ret = true;
718 719
719 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 720 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
720 721
@@ -725,7 +726,7 @@ xfs_inode_item_pushbuf(
725 if (completion_done(&ip->i_flush) || 726 if (completion_done(&ip->i_flush) ||
726 !(lip->li_flags & XFS_LI_IN_AIL)) { 727 !(lip->li_flags & XFS_LI_IN_AIL)) {
727 xfs_iunlock(ip, XFS_ILOCK_SHARED); 728 xfs_iunlock(ip, XFS_ILOCK_SHARED);
728 return; 729 return true;
729 } 730 }
730 731
731 bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, 732 bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
@@ -733,10 +734,13 @@ xfs_inode_item_pushbuf(
733 734
734 xfs_iunlock(ip, XFS_ILOCK_SHARED); 735 xfs_iunlock(ip, XFS_ILOCK_SHARED);
735 if (!bp) 736 if (!bp)
736 return; 737 return true;
737 if (XFS_BUF_ISDELAYWRITE(bp)) 738 if (XFS_BUF_ISDELAYWRITE(bp))
738 xfs_buf_delwri_promote(bp); 739 xfs_buf_delwri_promote(bp);
740 if (xfs_buf_ispinned(bp))
741 ret = false;
739 xfs_buf_relse(bp); 742 xfs_buf_relse(bp);
743 return ret;
740} 744}
741 745
742/* 746/*
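[Editor's note: the buf, dquot and inode log-item conversions above all share one shape: ->iop_pushbuf now returns false only when the backing buffer turned out to be pinned, so the AIL can count the item as stuck and force the log. A minimal sketch of that common pattern, with the locking and lookup details elided:]

#include <stdbool.h>
#include <stdio.h>

struct buf { bool delwri; bool pinned; };

static bool pushbuf(struct buf *bp)
{
	if (!bp)
		return true;		/* nothing to push: not stuck */
	if (bp->delwri)
		printf("promote delwri buffer\n");
	return !bp->pinned;		/* pinned => caller flushes the log */
}

int main(void)
{
	struct buf pinned = { .delwri = true, .pinned = true };

	printf("pushbuf -> %d\n", pushbuf(&pinned));	/* 0: item stuck */
	return 0;
}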
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 1e8a45e74c3e..828662f70d64 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -68,6 +68,8 @@
68#include <linux/ctype.h> 68#include <linux/ctype.h>
69#include <linux/writeback.h> 69#include <linux/writeback.h>
70#include <linux/capability.h> 70#include <linux/capability.h>
71#include <linux/kthread.h>
72#include <linux/freezer.h>
71#include <linux/list_sort.h> 73#include <linux/list_sort.h>
72 74
73#include <asm/page.h> 75#include <asm/page.h>
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2366c54cc4fa..5cf06b85fd9d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1652,24 +1652,13 @@ xfs_init_workqueues(void)
1652 */ 1652 */
1653 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); 1653 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
1654 if (!xfs_syncd_wq) 1654 if (!xfs_syncd_wq)
1655 goto out; 1655 return -ENOMEM;
1656
1657 xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
1658 if (!xfs_ail_wq)
1659 goto out_destroy_syncd;
1660
1661 return 0; 1656 return 0;
1662
1663out_destroy_syncd:
1664 destroy_workqueue(xfs_syncd_wq);
1665out:
1666 return -ENOMEM;
1667} 1657}
1668 1658
1669STATIC void 1659STATIC void
1670xfs_destroy_workqueues(void) 1660xfs_destroy_workqueues(void)
1671{ 1661{
1672 destroy_workqueue(xfs_ail_wq);
1673 destroy_workqueue(xfs_syncd_wq); 1662 destroy_workqueue(xfs_syncd_wq);
1674} 1663}
1675 1664
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 06a9759b6352..53597f4db9b5 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -350,7 +350,7 @@ typedef struct xfs_item_ops {
350 void (*iop_unlock)(xfs_log_item_t *); 350 void (*iop_unlock)(xfs_log_item_t *);
351 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); 351 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
352 void (*iop_push)(xfs_log_item_t *); 352 void (*iop_push)(xfs_log_item_t *);
353 void (*iop_pushbuf)(xfs_log_item_t *); 353 bool (*iop_pushbuf)(xfs_log_item_t *);
354 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); 354 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
355} xfs_item_ops_t; 355} xfs_item_ops_t;
356 356
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index c15aa29fa169..3a1e7ca54c2d 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,8 +28,6 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
32
33#ifdef DEBUG 31#ifdef DEBUG
34/* 32/*
35 * Check that the list is sorted as it should be. 33 * Check that the list is sorted as it should be.
@@ -356,16 +354,10 @@ xfs_ail_delete(
356 xfs_trans_ail_cursor_clear(ailp, lip); 354 xfs_trans_ail_cursor_clear(ailp, lip);
357} 355}
358 356
359/* 357static long
360 * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself 358xfsaild_push(
361 * to run at a later time if there is more work to do to complete the push. 359 struct xfs_ail *ailp)
362 */
363STATIC void
364xfs_ail_worker(
365 struct work_struct *work)
366{ 360{
367 struct xfs_ail *ailp = container_of(to_delayed_work(work),
368 struct xfs_ail, xa_work);
369 xfs_mount_t *mp = ailp->xa_mount; 361 xfs_mount_t *mp = ailp->xa_mount;
370 struct xfs_ail_cursor cur; 362 struct xfs_ail_cursor cur;
371 xfs_log_item_t *lip; 363 xfs_log_item_t *lip;
@@ -427,8 +419,13 @@ xfs_ail_worker(
427 419
428 case XFS_ITEM_PUSHBUF: 420 case XFS_ITEM_PUSHBUF:
429 XFS_STATS_INC(xs_push_ail_pushbuf); 421 XFS_STATS_INC(xs_push_ail_pushbuf);
430 IOP_PUSHBUF(lip); 422
431 ailp->xa_last_pushed_lsn = lsn; 423 if (!IOP_PUSHBUF(lip)) {
424 stuck++;
425 flush_log = 1;
426 } else {
427 ailp->xa_last_pushed_lsn = lsn;
428 }
432 push_xfsbufd = 1; 429 push_xfsbufd = 1;
433 break; 430 break;
434 431
@@ -440,7 +437,6 @@ xfs_ail_worker(
440 437
441 case XFS_ITEM_LOCKED: 438 case XFS_ITEM_LOCKED:
442 XFS_STATS_INC(xs_push_ail_locked); 439 XFS_STATS_INC(xs_push_ail_locked);
443 ailp->xa_last_pushed_lsn = lsn;
444 stuck++; 440 stuck++;
445 break; 441 break;
446 442
@@ -501,20 +497,6 @@ out_done:
501 /* We're past our target or empty, so idle */ 497 /* We're past our target or empty, so idle */
502 ailp->xa_last_pushed_lsn = 0; 498 ailp->xa_last_pushed_lsn = 0;
503 499
504 /*
505 * We clear the XFS_AIL_PUSHING_BIT first before checking
506 * whether the target has changed. If the target has changed,
507 * this pushes the requeue race directly onto the result of the
508 * atomic test/set bit, so we are guaranteed that either the
509 * the pusher that changed the target or ourselves will requeue
510 * the work (but not both).
511 */
512 clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
513 smp_rmb();
514 if (XFS_LSN_CMP(ailp->xa_target, target) == 0 ||
515 test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
516 return;
517
518 tout = 50; 500 tout = 50;
519 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 501 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
520 /* 502 /*
@@ -537,9 +519,30 @@ out_done:
537 tout = 20; 519 tout = 20;
538 } 520 }
539 521
540 /* There is more to do, requeue us. */ 522 return tout;
541 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 523}
542 msecs_to_jiffies(tout)); 524
525static int
526xfsaild(
527 void *data)
528{
529 struct xfs_ail *ailp = data;
530 long tout = 0; /* milliseconds */
531
532 while (!kthread_should_stop()) {
533 if (tout && tout <= 20)
534 __set_current_state(TASK_KILLABLE);
535 else
536 __set_current_state(TASK_INTERRUPTIBLE);
537 schedule_timeout(tout ?
538 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
539
540 try_to_freeze();
541
542 tout = xfsaild_push(ailp);
543 }
544
545 return 0;
543} 546}
544 547
545/* 548/*
@@ -574,8 +577,9 @@ xfs_ail_push(
574 */ 577 */
575 smp_wmb(); 578 smp_wmb();
576 xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); 579 xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn);
577 if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) 580 smp_wmb();
578 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0); 581
582 wake_up_process(ailp->xa_task);
579} 583}
580 584
581/* 585/*
@@ -813,9 +817,18 @@ xfs_trans_ail_init(
813 INIT_LIST_HEAD(&ailp->xa_ail); 817 INIT_LIST_HEAD(&ailp->xa_ail);
814 INIT_LIST_HEAD(&ailp->xa_cursors); 818 INIT_LIST_HEAD(&ailp->xa_cursors);
815 spin_lock_init(&ailp->xa_lock); 819 spin_lock_init(&ailp->xa_lock);
816 INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker); 820
821 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
822 ailp->xa_mount->m_fsname);
823 if (IS_ERR(ailp->xa_task))
824 goto out_free_ailp;
825
817 mp->m_ail = ailp; 826 mp->m_ail = ailp;
818 return 0; 827 return 0;
828
829out_free_ailp:
830 kmem_free(ailp);
831 return ENOMEM;
819} 832}
820 833
821void 834void
@@ -824,6 +837,6 @@ xfs_trans_ail_destroy(
824{ 837{
825 struct xfs_ail *ailp = mp->m_ail; 838 struct xfs_ail *ailp = mp->m_ail;
826 839
827 cancel_delayed_work_sync(&ailp->xa_work); 840 kthread_stop(ailp->xa_task);
828 kmem_free(ailp); 841 kmem_free(ailp);
829} 842}
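[Editor's note: the xfs_trans_ail.c rework replaces the delayed-work requeue dance with a dedicated kthread: sleep for whatever timeout the last push asked for (0 means sleep until xfs_ail_push() wakes us), push, repeat. A userspace sketch of that control flow; kthread, freezer and scheduler details are elided and the push results are made up:]

#include <stdio.h>

static long push_once(int round)
{
	printf("push round %d\n", round);
	return round < 2 ? 20 : 0;	/* work remains twice, then idle */
}

int main(void)
{
	long tout = 0;			/* milliseconds, as in xfsaild() */

	for (int round = 0; round < 4; round++) {
		if (tout)
			printf("sleep %ld ms\n", tout);
		else
			printf("sleep until woken by xfs_ail_push()\n");
		tout = push_once(round);
	}
	return 0;
}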
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 212946b97239..22750b5e4a8f 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -64,23 +64,17 @@ struct xfs_ail_cursor {
64 */ 64 */
65struct xfs_ail { 65struct xfs_ail {
66 struct xfs_mount *xa_mount; 66 struct xfs_mount *xa_mount;
67 struct task_struct *xa_task;
67 struct list_head xa_ail; 68 struct list_head xa_ail;
68 xfs_lsn_t xa_target; 69 xfs_lsn_t xa_target;
69 struct list_head xa_cursors; 70 struct list_head xa_cursors;
70 spinlock_t xa_lock; 71 spinlock_t xa_lock;
71 struct delayed_work xa_work;
72 xfs_lsn_t xa_last_pushed_lsn; 72 xfs_lsn_t xa_last_pushed_lsn;
73 unsigned long xa_flags;
74}; 73};
75 74
76#define XFS_AIL_PUSHING_BIT 0
77
78/* 75/*
79 * From xfs_trans_ail.c 76 * From xfs_trans_ail.c
80 */ 77 */
81
82extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
83
84void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, 78void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
85 struct xfs_ail_cursor *cur, 79 struct xfs_ail_cursor *cur,
86 struct xfs_log_item **log_items, int nr_items, 80 struct xfs_log_item **log_items, int nr_items,