Diffstat (limited to 'fs')
-rw-r--r--  fs/bio-integrity.c | 1
-rw-r--r--  fs/bio.c | 1
-rw-r--r--  fs/block_dev.c | 15
-rw-r--r--  fs/btrfs/Makefile | 3
-rw-r--r--  fs/btrfs/acl.c | 17
-rw-r--r--  fs/btrfs/backref.c | 776
-rw-r--r--  fs/btrfs/backref.h | 62
-rw-r--r--  fs/btrfs/btrfs_inode.h | 17
-rw-r--r--  fs/btrfs/compression.c | 3
-rw-r--r--  fs/btrfs/ctree.c | 10
-rw-r--r--  fs/btrfs/ctree.h | 198
-rw-r--r--  fs/btrfs/delayed-inode.c | 50
-rw-r--r--  fs/btrfs/disk-io.c | 441
-rw-r--r--  fs/btrfs/disk-io.h | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 851
-rw-r--r--  fs/btrfs/extent_io.c | 614
-rw-r--r--  fs/btrfs/extent_io.h | 23
-rw-r--r--  fs/btrfs/file-item.c | 17
-rw-r--r--  fs/btrfs/file.c | 25
-rw-r--r--  fs/btrfs/free-space-cache.c | 926
-rw-r--r--  fs/btrfs/inode-map.c | 6
-rw-r--r--  fs/btrfs/inode.c | 457
-rw-r--r--  fs/btrfs/ioctl.c | 227
-rw-r--r--  fs/btrfs/ioctl.h | 29
-rw-r--r--  fs/btrfs/print-tree.c | 8
-rw-r--r--  fs/btrfs/reada.c | 951
-rw-r--r--  fs/btrfs/relocation.c | 24
-rw-r--r--  fs/btrfs/scrub.c | 591
-rw-r--r--  fs/btrfs/super.c | 298
-rw-r--r--  fs/btrfs/transaction.c | 146
-rw-r--r--  fs/btrfs/tree-log.c | 19
-rw-r--r--  fs/btrfs/volumes.c | 207
-rw-r--r--  fs/btrfs/volumes.h | 18
-rw-r--r--  fs/btrfs/xattr.c | 11
-rw-r--r--  fs/buffer.c | 2
-rw-r--r--  fs/ceph/caps.c | 8
-rw-r--r--  fs/ceph/dir.c | 87
-rw-r--r--  fs/ceph/inode.c | 8
-rw-r--r--  fs/ceph/mds_client.c | 10
-rw-r--r--  fs/ceph/super.c | 4
-rw-r--r--  fs/ceph/super.h | 23
-rw-r--r--  fs/cifs/connect.c | 1
-rw-r--r--  fs/exofs/Kconfig | 2
-rw-r--r--  fs/exofs/ore.c | 1
-rw-r--r--  fs/exofs/super.c | 1
-rw-r--r--  fs/ext4/inode.c | 2
-rw-r--r--  fs/fs-writeback.c | 84
-rw-r--r--  fs/fuse/cuse.c | 1
-rw-r--r--  fs/gfs2/ops_fstype.c | 1
-rw-r--r--  fs/ioprio.c | 1
-rw-r--r--  fs/jffs2/compr.c | 128
-rw-r--r--  fs/jffs2/compr.h | 2
-rw-r--r--  fs/jffs2/fs.c | 2
-rw-r--r--  fs/jffs2/jffs2_fs_sb.h | 6
-rw-r--r--  fs/jffs2/os-linux.h | 2
-rw-r--r--  fs/jffs2/scan.c | 4
-rw-r--r--  fs/jffs2/super.c | 119
-rw-r--r--  fs/jffs2/wbuf.c | 9
-rw-r--r--  fs/jfs/jfs_logmgr.c | 1
-rw-r--r--  fs/logfs/super.c | 1
-rw-r--r--  fs/nfs/callback_xdr.c | 12
-rw-r--r--  fs/nfs/file.c | 9
-rw-r--r--  fs/nfs/nfs4filelayout.c | 8
-rw-r--r--  fs/nfs/nfs4proc.c | 6
-rw-r--r--  fs/nfs/nfs4xdr.c | 2
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 872
-rw-r--r--  fs/nfs/objlayout/objlayout.c | 209
-rw-r--r--  fs/nfs/objlayout/objlayout.h | 48
-rw-r--r--  fs/nfs/pagelist.c | 3
-rw-r--r--  fs/nfs/pnfs.c | 26
-rw-r--r--  fs/nfs/pnfs_dev.c | 1
-rw-r--r--  fs/nfs/write.c | 4
-rw-r--r--  fs/nfsd/nfs4acl.c | 1
-rw-r--r--  fs/nfsd/nfsctl.c | 1
-rw-r--r--  fs/nfsd/nfssvc.c | 3
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 1
-rw-r--r--  fs/proc/vmcore.c | 1
-rw-r--r--  fs/quota/quota.c | 2
-rw-r--r--  fs/squashfs/Kconfig | 22
-rw-r--r--  fs/squashfs/squashfs_fs.h | 7
-rw-r--r--  fs/squashfs/super.c | 2
-rw-r--r--  fs/statfs.c | 2
-rw-r--r--  fs/sync.c | 4
-rw-r--r--  fs/ubifs/budget.c | 2
-rw-r--r--  fs/ubifs/debug.c | 16
-rw-r--r--  fs/ubifs/debug.h | 5
-rw-r--r--  fs/ubifs/recovery.c | 2
-rw-r--r--  fs/ubifs/sb.c | 2
89 files changed, 6256 insertions, 2574 deletions
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 9c5e6b2cd11a..c2183f3917cd 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -22,6 +22,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/mempool.h>
+#include <linux/export.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
diff --git a/fs/bio.c b/fs/bio.c
index 9bfade8a609b..41c93c722244 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -255,7 +255,6 @@ void bio_init(struct bio *bio)
 {
         memset(bio, 0, sizeof(*bio));
         bio->bi_flags = 1 << BIO_UPTODATE;
-        bio->bi_comp_cpu = -1;
         atomic_set(&bio->bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 95f786ec7f08..b07f1da1de4e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -971,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
 
         if (!bdev->bd_disk)
                 return;
-        if (disk_partitionable(bdev->bd_disk))
+        if (disk_part_scan_enabled(bdev->bd_disk))
                 bdev->bd_invalidated = 1;
 }
 
@@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
         struct gendisk *disk;
+        struct module *owner;
         int ret;
         int partno;
         int perm = 0;
@@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
         disk = get_gendisk(bdev->bd_dev, &partno);
         if (!disk)
                 goto out;
+        owner = disk->fops->owner;
 
         disk_block_events(disk);
         mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                         bdev->bd_disk = NULL;
                         mutex_unlock(&bdev->bd_mutex);
                         disk_unblock_events(disk);
-                        module_put(disk->fops->owner);
                         put_disk(disk);
+                        module_put(owner);
                         goto restart;
                 }
         }
@@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                         goto out_unlock_bdev;
                 }
                 /* only one opener holds refs to the module and disk */
-                module_put(disk->fops->owner);
                 put_disk(disk);
+                module_put(owner);
         }
         bdev->bd_openers++;
         if (for_part)
@@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_bdev:
         mutex_unlock(&bdev->bd_mutex);
         disk_unblock_events(disk);
-        module_put(disk->fops->owner);
         put_disk(disk);
+        module_put(owner);
  out:
         bdput(bdev);
 
@@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
         if (!bdev->bd_openers) {
                 struct module *owner = disk->fops->owner;
 
-                put_disk(disk);
-                module_put(owner);
                 disk_put_part(bdev->bd_part);
                 bdev->bd_part = NULL;
                 bdev->bd_disk = NULL;
                 if (bdev != bdev->bd_contains)
                         victim = bdev->bd_contains;
                 bdev->bd_contains = NULL;
+
+                put_disk(disk);
+                module_put(owner);
         }
         mutex_unlock(&bdev->bd_mutex);
         bdput(bdev);
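
The reordering above is about object lifetime: put_disk() can drop the last reference to the gendisk, after which disk->fops must not be dereferenced, so the owner module pointer is read once while the disk is known to be alive and module_put() runs only after put_disk(). A minimal sketch of the pattern, not part of the patch (it only assumes the usual gendisk/module refcounting calls):

/*
 * Illustrative sketch only: cache anything reached through an object
 * before dropping the reference that keeps that object alive.
 */
static void drop_disk_and_module(struct gendisk *disk)
{
	struct module *owner = disk->fops->owner;	/* read while disk is still valid */

	put_disk(disk);		/* may free disk; disk->fops is now off limits */
	module_put(owner);	/* safe: only the cached pointer is used */
}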
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 40e6ac08c21f..c0ddfd29c5e5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
-	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+	   reada.o backref.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a11..89b156d85d63 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
                 if (!value)
                         return ERR_PTR(-ENOMEM);
                 size = __btrfs_getxattr(inode, name, value, size);
-                if (size > 0) {
-                        acl = posix_acl_from_xattr(value, size);
-                        if (IS_ERR(acl)) {
-                                kfree(value);
-                                return acl;
-                        }
-                        set_cached_acl(inode, type, acl);
-                }
-                kfree(value);
+        }
+        if (size > 0) {
+                acl = posix_acl_from_xattr(value, size);
         } else if (size == -ENOENT || size == -ENODATA || size == 0) {
                 /* FIXME, who returns -ENOENT? I think nobody */
                 acl = NULL;
-                set_cached_acl(inode, type, acl);
         } else {
                 acl = ERR_PTR(-EIO);
         }
+        kfree(value);
+
+        if (!IS_ERR(acl))
+                set_cached_acl(inode, type, acl);
 
         return acl;
 }
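
The rewritten btrfs_get_acl() leans on the kernel's ERR_PTR convention: one variable carries either a valid ACL, NULL, or an encoded errno, so the xattr buffer is freed exactly once and the ACL cache is updated from a single place. A condensed, illustrative sketch of that shape (not the patched function itself; the helper name is invented):

/*
 * Illustrative only: errors are encoded in the pointer value, so a single
 * IS_ERR() check at the end decides whether the result may be cached.
 */
static struct posix_acl *lookup_acl_sketch(struct inode *inode, int type,
					   const void *value, int size)
{
	struct posix_acl *acl;

	if (size > 0)
		acl = posix_acl_from_xattr(value, size);	/* ACL or ERR_PTR */
	else if (size == -ENOENT || size == -ENODATA || size == 0)
		acl = NULL;					/* "no ACL" is a valid result */
	else
		acl = ERR_PTR(-EIO);				/* real failure */

	if (!IS_ERR(acl))
		set_cached_acl(inode, type, acl);		/* cache only good results */
	return acl;						/* caller unwraps with PTR_ERR() */
}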
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 000000000000..8855aad3929c
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,776 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "backref.h"
22
23struct __data_ref {
24 struct list_head list;
25 u64 inum;
26 u64 root;
27 u64 extent_data_item_offset;
28};
29
30struct __shared_ref {
31 struct list_head list;
32 u64 disk_byte;
33};
34
35static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path,
37 struct btrfs_key *found_key)
38{
39 int ret;
40 struct btrfs_key key;
41 struct extent_buffer *eb;
42
43 key.type = key_type;
44 key.objectid = inum;
45 key.offset = ioff;
46
47 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
48 if (ret < 0)
49 return ret;
50
51 eb = path->nodes[0];
52 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
53 ret = btrfs_next_leaf(fs_root, path);
54 if (ret)
55 return ret;
56 eb = path->nodes[0];
57 }
58
59 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
60 if (found_key->type != key.type || found_key->objectid != key.objectid)
61 return 1;
62
63 return 0;
64}
65
66/*
67 * this makes the path point to (inum INODE_ITEM ioff)
68 */
69int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
70 struct btrfs_path *path)
71{
72 struct btrfs_key key;
73 return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
74 &key);
75}
76
77static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
78 struct btrfs_path *path,
79 struct btrfs_key *found_key)
80{
81 return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
82 found_key);
83}
84
85/*
86 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
87 * of the path are separated by '/' and the path is guaranteed to be
88 * 0-terminated. the path is only given within the current file system.
89 * Therefore, it never starts with a '/'. the caller is responsible to provide
90 * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
91 * the start point of the resulting string is returned. this pointer is within
92 * dest, normally.
93 * in case the path buffer would overflow, the pointer is decremented further
94 * as if output was written to the buffer, though no more output is actually
95 * generated. that way, the caller can determine how much space would be
96 * required for the path to fit into the buffer. in that case, the returned
97 * value will be smaller than dest. callers must check this!
98 */
99static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
100 struct btrfs_inode_ref *iref,
101 struct extent_buffer *eb_in, u64 parent,
102 char *dest, u32 size)
103{
104 u32 len;
105 int slot;
106 u64 next_inum;
107 int ret;
108 s64 bytes_left = size - 1;
109 struct extent_buffer *eb = eb_in;
110 struct btrfs_key found_key;
111
112 if (bytes_left >= 0)
113 dest[bytes_left] = '\0';
114
115 while (1) {
116 len = btrfs_inode_ref_name_len(eb, iref);
117 bytes_left -= len;
118 if (bytes_left >= 0)
119 read_extent_buffer(eb, dest + bytes_left,
120 (unsigned long)(iref + 1), len);
121 if (eb != eb_in)
122 free_extent_buffer(eb);
123 ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
124 if (ret)
125 break;
126 next_inum = found_key.offset;
127
128 /* regular exit ahead */
129 if (parent == next_inum)
130 break;
131
132 slot = path->slots[0];
133 eb = path->nodes[0];
134 /* make sure we can use eb after releasing the path */
135 if (eb != eb_in)
136 atomic_inc(&eb->refs);
137 btrfs_release_path(path);
138
139 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
140 parent = next_inum;
141 --bytes_left;
142 if (bytes_left >= 0)
143 dest[bytes_left] = '/';
144 }
145
146 btrfs_release_path(path);
147
148 if (ret)
149 return ERR_PTR(ret);
150
151 return dest + bytes_left;
152}
153
154/*
155 * this makes the path point to (logical EXTENT_ITEM *)
156 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
157 * tree blocks and <0 on error.
158 */
159int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
160 struct btrfs_path *path, struct btrfs_key *found_key)
161{
162 int ret;
163 u64 flags;
164 u32 item_size;
165 struct extent_buffer *eb;
166 struct btrfs_extent_item *ei;
167 struct btrfs_key key;
168
169 key.type = BTRFS_EXTENT_ITEM_KEY;
170 key.objectid = logical;
171 key.offset = (u64)-1;
172
173 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
174 if (ret < 0)
175 return ret;
176 ret = btrfs_previous_item(fs_info->extent_root, path,
177 0, BTRFS_EXTENT_ITEM_KEY);
178 if (ret < 0)
179 return ret;
180
181 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
182 if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
183 found_key->objectid > logical ||
184 found_key->objectid + found_key->offset <= logical)
185 return -ENOENT;
186
187 eb = path->nodes[0];
188 item_size = btrfs_item_size_nr(eb, path->slots[0]);
189 BUG_ON(item_size < sizeof(*ei));
190
191 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
192 flags = btrfs_extent_flags(eb, ei);
193
194 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
195 return BTRFS_EXTENT_FLAG_TREE_BLOCK;
196 if (flags & BTRFS_EXTENT_FLAG_DATA)
197 return BTRFS_EXTENT_FLAG_DATA;
198
199 return -EIO;
200}
201
202/*
203 * helper function to iterate extent inline refs. ptr must point to a 0 value
204 * for the first call and may be modified. it is used to track state.
205 * if more refs exist, 0 is returned and the next call to
206 * __get_extent_inline_ref must pass the modified ptr parameter to get the
207 * next ref. after the last ref was processed, 1 is returned.
208 * returns <0 on error
209 */
210static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
211 struct btrfs_extent_item *ei, u32 item_size,
212 struct btrfs_extent_inline_ref **out_eiref,
213 int *out_type)
214{
215 unsigned long end;
216 u64 flags;
217 struct btrfs_tree_block_info *info;
218
219 if (!*ptr) {
220 /* first call */
221 flags = btrfs_extent_flags(eb, ei);
222 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
223 info = (struct btrfs_tree_block_info *)(ei + 1);
224 *out_eiref =
225 (struct btrfs_extent_inline_ref *)(info + 1);
226 } else {
227 *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
228 }
229 *ptr = (unsigned long)*out_eiref;
230 if ((void *)*ptr >= (void *)ei + item_size)
231 return -ENOENT;
232 }
233
234 end = (unsigned long)ei + item_size;
235 *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
236 *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
237
238 *ptr += btrfs_extent_inline_ref_size(*out_type);
239 WARN_ON(*ptr > end);
240 if (*ptr == end)
241 return 1; /* last */
242
243 return 0;
244}
245
246/*
247 * reads the tree block backref for an extent. tree level and root are returned
248 * through out_level and out_root. ptr must point to a 0 value for the first
249 * call and may be modified (see __get_extent_inline_ref comment).
250 * returns 0 if data was provided, 1 if there was no more data to provide or
251 * <0 on error.
252 */
253int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
254 struct btrfs_extent_item *ei, u32 item_size,
255 u64 *out_root, u8 *out_level)
256{
257 int ret;
258 int type;
259 struct btrfs_tree_block_info *info;
260 struct btrfs_extent_inline_ref *eiref;
261
262 if (*ptr == (unsigned long)-1)
263 return 1;
264
265 while (1) {
266 ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
267 &eiref, &type);
268 if (ret < 0)
269 return ret;
270
271 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
272 type == BTRFS_SHARED_BLOCK_REF_KEY)
273 break;
274
275 if (ret == 1)
276 return 1;
277 }
278
279 /* we can treat both ref types equally here */
280 info = (struct btrfs_tree_block_info *)(ei + 1);
281 *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
282 *out_level = btrfs_tree_block_level(eb, info);
283
284 if (ret == 1)
285 *ptr = (unsigned long)-1;
286
287 return 0;
288}
289
290static int __data_list_add(struct list_head *head, u64 inum,
291 u64 extent_data_item_offset, u64 root)
292{
293 struct __data_ref *ref;
294
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{
413 u64 disk_byte;
414 struct btrfs_key key;
415 struct btrfs_file_extent_item *fi;
416 struct extent_buffer *eb;
417 int slot;
418 int nritems;
419 int ret;
420 int found = 0;
421
422 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0);
424 if (!eb)
425 return -EIO;
426
427 /*
428 * from the shared data ref, we only have the leaf but we need
429 * the key. thus, we must look into all items and see that we
430 * find one (some) with a reference to our extent item.
431 */
432 nritems = btrfs_header_nritems(eb);
433 for (slot = 0; slot < nritems; ++slot) {
434 btrfs_item_key_to_cpu(eb, &key, slot);
435 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) {
439 free_extent_buffer(eb);
440 return -EIO;
441 }
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) {
444 if (found)
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459
460 if (!found) {
461 printk(KERN_ERR "btrfs: failed to follow shared data backref "
462 "to parent %llu\n", logical);
463 WARN_ON(1);
464 ret = -EIO;
465 }
466
467 free_extent_buffer(eb);
468 return ret;
469}
470
471/*
472 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops.
476 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path,
479 u64 extent_item_objectid,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx)
482{
483 unsigned long ptr = 0;
484 int last;
485 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d;
497 struct __shared_ref *ref_s;
498
499 eb = path->nodes[0];
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
501 item_size = btrfs_item_size_nr(eb, path->slots[0]);
502
503 /* first we iterate the inline refs, ... */
504 do {
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
506 &eiref, &type);
507 if (last == -ENOENT) {
508 ret = 0;
509 break;
510 }
511 if (last < 0) {
512 ret = last;
513 break;
514 }
515
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524
525 /* ... then we proceed to in-tree references and ... */
526 while (!ret) {
527 ++path->slots[0];
528 if (path->slots[0] > btrfs_header_nritems(eb)) {
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
541 dref = btrfs_item_ptr(eb, path->slots[0],
542 struct btrfs_extent_data_ref);
543 ret = __data_list_add_eb(&data_refs, eb, dref);
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
545 ret = __shared_list_add(&shared_refs, key.offset);
546 }
547 }
548
549 btrfs_release_path(path);
550
551 /*
552 * ... only at the very end we can process the refs we found. this is
553 * because the iterator function we call is allowed to make tree lookups
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret;
582}
583
584int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
585 struct btrfs_path *path,
586 iterate_extent_inodes_t *iterate, void *ctx)
587{
588 int ret;
589 u64 offset;
590 struct btrfs_key found_key;
591
592 ret = extent_from_logical(fs_info, logical, path,
593 &found_key);
594 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
595 ret = -EINVAL;
596 if (ret < 0)
597 return ret;
598
599 offset = logical - found_key.objectid;
600 ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
601 offset, iterate, ctx);
602
603 return ret;
604}
605
606static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
607 struct btrfs_path *path,
608 iterate_irefs_t *iterate, void *ctx)
609{
610 int ret;
611 int slot;
612 u32 cur;
613 u32 len;
614 u32 name_len;
615 u64 parent = 0;
616 int found = 0;
617 struct extent_buffer *eb;
618 struct btrfs_item *item;
619 struct btrfs_inode_ref *iref;
620 struct btrfs_key found_key;
621
622 while (1) {
623 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
624 &found_key);
625 if (ret < 0)
626 break;
627 if (ret) {
628 ret = found ? 0 : -ENOENT;
629 break;
630 }
631 ++found;
632
633 parent = found_key.offset;
634 slot = path->slots[0];
635 eb = path->nodes[0];
636 /* make sure we can use eb after releasing the path */
637 atomic_inc(&eb->refs);
638 btrfs_release_path(path);
639
640 item = btrfs_item_nr(eb, slot);
641 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
642
643 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
644 name_len = btrfs_inode_ref_name_len(eb, iref);
645 /* path must be released before calling iterate()! */
646 ret = iterate(parent, iref, eb, ctx);
647 if (ret) {
648 free_extent_buffer(eb);
649 break;
650 }
651 len = sizeof(*iref) + name_len;
652 iref = (struct btrfs_inode_ref *)((char *)iref + len);
653 }
654 free_extent_buffer(eb);
655 }
656
657 btrfs_release_path(path);
658
659 return ret;
660}
661
662/*
663 * returns 0 if the path could be dumped (probably truncated)
664 * returns <0 in case of an error
665 */
666static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
667 struct extent_buffer *eb, void *ctx)
668{
669 struct inode_fs_paths *ipath = ctx;
670 char *fspath;
671 char *fspath_min;
672 int i = ipath->fspath->elem_cnt;
673 const int s_ptr = sizeof(char *);
674 u32 bytes_left;
675
676 bytes_left = ipath->fspath->bytes_left > s_ptr ?
677 ipath->fspath->bytes_left - s_ptr : 0;
678
679 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
680 fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
681 inum, fspath_min, bytes_left);
682 if (IS_ERR(fspath))
683 return PTR_ERR(fspath);
684
685 if (fspath > fspath_min) {
686 ipath->fspath->val[i] = (u64)fspath;
687 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else {
690 ++ipath->fspath->elem_missed;
691 ipath->fspath->bytes_missing += fspath_min - fspath;
692 ipath->fspath->bytes_left = 0;
693 }
694
695 return 0;
696}
697
698/*
699 * this dumps all file system paths to the inode into the ipath struct, provided
 700 * it has been created large enough. each path is zero-terminated and accessed
701 * from ipath->fspath->val[i].
702 * when it returns, there are ipath->fspath->elem_cnt number of paths available
703 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
 704 * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise,
705 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
706 * have been needed to return all paths.
707 */
708int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
709{
710 return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
711 inode_to_path, ipath);
712}
713
714/*
715 * allocates space to return multiple file system paths for an inode.
716 * total_bytes to allocate are passed, note that space usable for actual path
717 * information will be total_bytes - sizeof(struct inode_fs_paths).
718 * the returned pointer must be freed with free_ipath() in the end.
719 */
720struct btrfs_data_container *init_data_container(u32 total_bytes)
721{
722 struct btrfs_data_container *data;
723 size_t alloc_bytes;
724
725 alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
726 data = kmalloc(alloc_bytes, GFP_NOFS);
727 if (!data)
728 return ERR_PTR(-ENOMEM);
729
730 if (total_bytes >= sizeof(*data)) {
731 data->bytes_left = total_bytes - sizeof(*data);
732 data->bytes_missing = 0;
733 } else {
734 data->bytes_missing = sizeof(*data) - total_bytes;
735 data->bytes_left = 0;
736 }
737
738 data->elem_cnt = 0;
739 data->elem_missed = 0;
740
741 return data;
742}
743
744/*
745 * allocates space to return multiple file system paths for an inode.
746 * total_bytes to allocate are passed, note that space usable for actual path
747 * information will be total_bytes - sizeof(struct inode_fs_paths).
748 * the returned pointer must be freed with free_ipath() in the end.
749 */
750struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
751 struct btrfs_path *path)
752{
753 struct inode_fs_paths *ifp;
754 struct btrfs_data_container *fspath;
755
756 fspath = init_data_container(total_bytes);
757 if (IS_ERR(fspath))
758 return (void *)fspath;
759
760 ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
761 if (!ifp) {
762 kfree(fspath);
763 return ERR_PTR(-ENOMEM);
764 }
765
766 ifp->btrfs_path = path;
767 ifp->fspath = fspath;
768 ifp->fs_root = fs_root;
769
770 return ifp;
771}
772
773void free_ipath(struct inode_fs_paths *ipath)
774{
775 kfree(ipath);
776}
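
backref.c resolves extents back to their users: iterate_inodes_from_logical() looks up the extent item covering a logical address and invokes a callback once per (inode, file offset, root) that references it, while init_ipath()/paths_from_inode() turn an inode's INODE_REF items into '/'-separated paths. A hedged usage sketch of the first entry point (kernel context assumed; the callback name and the message it prints are invented for illustration):

/*
 * Illustrative sketch only - relies on a btrfs_fs_info from the surrounding
 * kernel context; dump_extent_user() is a made-up example callback.
 */
static int dump_extent_user(u64 inum, u64 offset, u64 root, void *ctx)
{
	printk(KERN_INFO "inode %llu (root %llu) references the extent at file offset %llu\n",
	       (unsigned long long)inum, (unsigned long long)root,
	       (unsigned long long)offset);
	return 0;	/* returning non-zero stops the iteration */
}

static int report_extent_users(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_path *path = btrfs_alloc_path();
	int ret;

	if (!path)
		return -ENOMEM;
	ret = iterate_inodes_from_logical(logical, fs_info, path,
					  dump_extent_user, NULL);
	btrfs_free_path(path);	/* the iterator returns the path released */
	return ret;
}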
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 000000000000..92618837cb8f
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_BACKREF__
20#define __BTRFS_BACKREF__
21
22#include "ioctl.h"
23
24struct inode_fs_paths {
25 struct btrfs_path *btrfs_path;
26 struct btrfs_root *fs_root;
27 struct btrfs_data_container *fspath;
28};
29
30typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
31 void *ctx);
32typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
33 struct extent_buffer *eb, void *ctx);
34
35int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
36 struct btrfs_path *path);
37
38int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
39 struct btrfs_path *path, struct btrfs_key *found_key);
40
41int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
42 struct btrfs_extent_item *ei, u32 item_size,
43 u64 *out_root, u8 *out_level);
44
45int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
46 struct btrfs_path *path,
47 u64 extent_item_objectid,
48 u64 extent_offset,
49 iterate_extent_inodes_t *iterate, void *ctx);
50
51int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
52 struct btrfs_path *path,
53 iterate_extent_inodes_t *iterate, void *ctx);
54
55int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
56
57struct btrfs_data_container *init_data_container(u32 total_bytes);
58struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
59 struct btrfs_path *path);
60void free_ipath(struct inode_fs_paths *ipath);
61
62#endif
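
The header pairs each iterator with a callback typedef, and the path-resolution side is driven through an inode_fs_paths container the caller sizes up front; elem_missed/bytes_missing report how much bigger it would have needed to be. A hedged sketch of resolving all names of an inode (kernel context assumed; the 4 KiB container size is an arbitrary example value):

/*
 * Illustrative sketch only.  init_ipath() allocates the btrfs_data_container
 * separately (via init_data_container), so it is freed here explicitly;
 * free_ipath() above only releases the inode_fs_paths wrapper.
 */
static int print_inode_paths(struct btrfs_root *fs_root, u64 inum)
{
	struct btrfs_path *path = btrfs_alloc_path();
	struct inode_fs_paths *ipath;
	u32 i;
	int ret;

	if (!path)
		return -ENOMEM;
	ipath = init_ipath(4096, fs_root, path);
	if (IS_ERR(ipath)) {
		btrfs_free_path(path);
		return PTR_ERR(ipath);
	}

	ret = paths_from_inode(inum, ipath);
	for (i = 0; !ret && i < ipath->fspath->elem_cnt; ++i)
		printk(KERN_INFO "path %u: %s\n", i,
		       (char *)(unsigned long)ipath->fspath->val[i]);

	kfree(ipath->fspath);
	free_ipath(ipath);
	btrfs_free_path(path);
	return ret;
}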
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd6..5a5d325a3935 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
          */
         u64 delalloc_bytes;
 
-        /* total number of bytes that may be used for this inode for
-         * delalloc
-         */
-        u64 reserved_bytes;
-
         /*
          * the size of the file stored in the metadata on disk. data=ordered
          * means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
          */
         u64 disk_i_size;
 
-        /* flags field from the on disk inode */
-        u32 flags;
-
         /*
          * if this is a directory then index_cnt is the counter for the index
          * number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
         u64 last_unlink_trans;
 
         /*
+         * Number of bytes outstanding that are going to need csums. This is
+         * used in ENOSPC accounting.
+         */
+        u64 csum_bytes;
+
+        /* flags field from the on disk inode */
+        u32 flags;
+
+        /*
          * Counters to keep track of the number of extent item's we may use due
          * to delalloc and such. outstanding_extents is the number of extent
          * items we think we'll end up using, and reserved_extents is the number
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f1734..14f1c5a0b2d2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
 static inline int compressed_bio_size(struct btrfs_root *root,
                                       unsigned long disk_size)
 {
-        u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+        u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
         return sizeof(struct compressed_bio) +
                 ((disk_size + root->sectorsize - 1) / root->sectorsize) *
                 csum_size;
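
Besides switching to the super_copy pointer, the helper's arithmetic is worth spelling out: one checksum of csum_size bytes is reserved per sector of compressed data, rounded up, on top of the struct itself. A worked example with assumed values (4096-byte sectors, 4-byte crc32c checksums):

/*
 * Worked example, not part of the patch.  For disk_size = 20000:
 *   sectors = (20000 + 4096 - 1) / 4096 = 5        (round up)
 *   size    = sizeof(struct compressed_bio) + 5 * 4 checksum bytes
 */
static inline unsigned long example_compressed_bio_size(unsigned long struct_size,
							 unsigned long disk_size,
							 unsigned long sectorsize,
							 unsigned int csum_size)
{
	return struct_size + ((disk_size + sectorsize - 1) / sectorsize) * csum_size;
}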
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8d..0fe615e4ea38 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -902,9 +902,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
-        if (level < BTRFS_MAX_LEVEL - 1)
+        if (level < BTRFS_MAX_LEVEL - 1) {
                 parent = path->nodes[level + 1];
                 pslot = path->slots[level + 1];
+        }
 
         /*
          * deal with the case where there is only one pointer in the root
@@ -1107,9 +1108,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
         mid = path->nodes[level];
         WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
-        if (level < BTRFS_MAX_LEVEL - 1)
+        if (level < BTRFS_MAX_LEVEL - 1) {
                 parent = path->nodes[level + 1];
                 pslot = path->slots[level + 1];
+        }
 
         if (!parent)
                 return 1;
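
Both hunks fix the same C pitfall: without braces only the first statement after the if is conditional, so pslot was assigned from path->slots[level + 1] even at the top level, where that index reads past the end of the slots array. An illustration of the difference (not btrfs code, just the bug shape):

/* Before: despite the indentation, the second assignment always runs. */
if (level < BTRFS_MAX_LEVEL - 1)
	parent = path->nodes[level + 1];
	pslot = path->slots[level + 1];		/* unconditional; out of bounds at the top level */

/* After: braces make both assignments conditional. */
if (level < BTRFS_MAX_LEVEL - 1) {
	parent = path->nodes[level + 1];
	pslot = path->slots[level + 1];
}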
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f49..b9ba59ff9292 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
30#include <linux/kobject.h> 30#include <linux/kobject.h>
31#include <trace/events/btrfs.h> 31#include <trace/events/btrfs.h>
32#include <asm/kmap_types.h> 32#include <asm/kmap_types.h>
33#include <linux/pagemap.h>
33#include "extent_io.h" 34#include "extent_io.h"
34#include "extent_map.h" 35#include "extent_map.h"
35#include "async-thread.h" 36#include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
360#define BTRFS_LABEL_SIZE 256 361#define BTRFS_LABEL_SIZE 256
361 362
362/* 363/*
364 * just in case we somehow lose the roots and are not able to mount,
365 * we store an array of the roots from previous transactions
366 * in the super.
367 */
368#define BTRFS_NUM_BACKUP_ROOTS 4
369struct btrfs_root_backup {
370 __le64 tree_root;
371 __le64 tree_root_gen;
372
373 __le64 chunk_root;
374 __le64 chunk_root_gen;
375
376 __le64 extent_root;
377 __le64 extent_root_gen;
378
379 __le64 fs_root;
380 __le64 fs_root_gen;
381
382 __le64 dev_root;
383 __le64 dev_root_gen;
384
385 __le64 csum_root;
386 __le64 csum_root_gen;
387
388 __le64 total_bytes;
389 __le64 bytes_used;
390 __le64 num_devices;
391 /* future */
392 __le64 unsed_64[4];
393
394 u8 tree_root_level;
395 u8 chunk_root_level;
396 u8 extent_root_level;
397 u8 fs_root_level;
398 u8 dev_root_level;
399 u8 csum_root_level;
400 /* future and to align */
401 u8 unused_8[10];
402} __attribute__ ((__packed__));
403
404/*
363 * the super block basically lists the main trees of the FS 405 * the super block basically lists the main trees of the FS
364 * it currently lacks any block count etc etc 406 * it currently lacks any block count etc etc
365 */ 407 */
@@ -405,6 +447,7 @@ struct btrfs_super_block {
405 /* future expansion */ 447 /* future expansion */
406 __le64 reserved[31]; 448 __le64 reserved[31];
407 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; 449 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
450 struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
408} __attribute__ ((__packed__)); 451} __attribute__ ((__packed__));
409 452
410/* 453/*
@@ -772,14 +815,8 @@ struct btrfs_space_info {
772struct btrfs_block_rsv { 815struct btrfs_block_rsv {
773 u64 size; 816 u64 size;
774 u64 reserved; 817 u64 reserved;
775 u64 freed[2];
776 struct btrfs_space_info *space_info; 818 struct btrfs_space_info *space_info;
777 struct list_head list;
778 spinlock_t lock; 819 spinlock_t lock;
779 atomic_t usage;
780 unsigned int priority:8;
781 unsigned int durable:1;
782 unsigned int refill_used:1;
783 unsigned int full:1; 820 unsigned int full:1;
784}; 821};
785 822
@@ -840,10 +877,10 @@ struct btrfs_block_group_cache {
840 spinlock_t lock; 877 spinlock_t lock;
841 u64 pinned; 878 u64 pinned;
842 u64 reserved; 879 u64 reserved;
843 u64 reserved_pinned;
844 u64 bytes_super; 880 u64 bytes_super;
845 u64 flags; 881 u64 flags;
846 u64 sectorsize; 882 u64 sectorsize;
883 u64 cache_generation;
847 unsigned int ro:1; 884 unsigned int ro:1;
848 unsigned int dirty:1; 885 unsigned int dirty:1;
849 unsigned int iref:1; 886 unsigned int iref:1;
@@ -899,6 +936,10 @@ struct btrfs_fs_info {
899 spinlock_t block_group_cache_lock; 936 spinlock_t block_group_cache_lock;
900 struct rb_root block_group_cache_tree; 937 struct rb_root block_group_cache_tree;
901 938
939 /* keep track of unallocated space */
940 spinlock_t free_chunk_lock;
941 u64 free_chunk_space;
942
902 struct extent_io_tree freed_extents[2]; 943 struct extent_io_tree freed_extents[2];
903 struct extent_io_tree *pinned_extents; 944 struct extent_io_tree *pinned_extents;
904 945
@@ -916,14 +957,11 @@ struct btrfs_fs_info {
916 struct btrfs_block_rsv trans_block_rsv; 957 struct btrfs_block_rsv trans_block_rsv;
917 /* block reservation for chunk tree */ 958 /* block reservation for chunk tree */
918 struct btrfs_block_rsv chunk_block_rsv; 959 struct btrfs_block_rsv chunk_block_rsv;
960 /* block reservation for delayed operations */
961 struct btrfs_block_rsv delayed_block_rsv;
919 962
920 struct btrfs_block_rsv empty_block_rsv; 963 struct btrfs_block_rsv empty_block_rsv;
921 964
922 /* list of block reservations that cross multiple transactions */
923 struct list_head durable_block_rsv_list;
924
925 struct mutex durable_block_rsv_mutex;
926
927 u64 generation; 965 u64 generation;
928 u64 last_trans_committed; 966 u64 last_trans_committed;
929 967
@@ -942,8 +980,8 @@ struct btrfs_fs_info {
942 wait_queue_head_t transaction_blocked_wait; 980 wait_queue_head_t transaction_blocked_wait;
943 wait_queue_head_t async_submit_wait; 981 wait_queue_head_t async_submit_wait;
944 982
945 struct btrfs_super_block super_copy; 983 struct btrfs_super_block *super_copy;
946 struct btrfs_super_block super_for_commit; 984 struct btrfs_super_block *super_for_commit;
947 struct block_device *__bdev; 985 struct block_device *__bdev;
948 struct super_block *sb; 986 struct super_block *sb;
949 struct inode *btree_inode; 987 struct inode *btree_inode;
@@ -1036,6 +1074,7 @@ struct btrfs_fs_info {
1036 struct btrfs_workers endio_freespace_worker; 1074 struct btrfs_workers endio_freespace_worker;
1037 struct btrfs_workers submit_workers; 1075 struct btrfs_workers submit_workers;
1038 struct btrfs_workers caching_workers; 1076 struct btrfs_workers caching_workers;
1077 struct btrfs_workers readahead_workers;
1039 1078
1040 /* 1079 /*
1041 * fixup workers take dirty pages that didn't properly go through 1080 * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1158,13 @@ struct btrfs_fs_info {
1119 u64 fs_state; 1158 u64 fs_state;
1120 1159
1121 struct btrfs_delayed_root *delayed_root; 1160 struct btrfs_delayed_root *delayed_root;
1161
1162 /* readahead tree */
1163 spinlock_t reada_lock;
1164 struct radix_tree_root reada_tree;
1165
1166 /* next backup root to be overwritten */
1167 int backup_root_index;
1122}; 1168};
1123 1169
1124/* 1170/*
@@ -1363,6 +1409,7 @@ struct btrfs_ioctl_defrag_range_args {
1363#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) 1409#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1364#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1410#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1365#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 1411#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1412#define BTRFS_MOUNT_RECOVERY (1 << 18)
1366 1413
1367#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1414#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1368#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1415#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1978,6 +2025,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
1978 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; 2025 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
1979} 2026}
1980 2027
2028/* struct btrfs_root_backup */
2029BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
2030 tree_root, 64);
2031BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
2032 tree_root_gen, 64);
2033BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
2034 tree_root_level, 8);
2035
2036BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
2037 chunk_root, 64);
2038BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
2039 chunk_root_gen, 64);
2040BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
2041 chunk_root_level, 8);
2042
2043BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
2044 extent_root, 64);
2045BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
2046 extent_root_gen, 64);
2047BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
2048 extent_root_level, 8);
2049
2050BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
2051 fs_root, 64);
2052BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
2053 fs_root_gen, 64);
2054BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
2055 fs_root_level, 8);
2056
2057BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
2058 dev_root, 64);
2059BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
2060 dev_root_gen, 64);
2061BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
2062 dev_root_level, 8);
2063
2064BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
2065 csum_root, 64);
2066BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
2067 csum_root_gen, 64);
2068BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
2069 csum_root_level, 8);
2070BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
2071 total_bytes, 64);
2072BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
2073 bytes_used, 64);
2074BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
2075 num_devices, 64);
2076
1981/* struct btrfs_super_block */ 2077/* struct btrfs_super_block */
1982 2078
1983BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 2079BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2129,6 +2225,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2129 (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); 2225 (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
2130} 2226}
2131 2227
2228static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
2229{
2230 return mapping_gfp_mask(mapping) & ~__GFP_FS;
2231}
2232
2132/* extent-tree.c */ 2233/* extent-tree.c */
2133static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 2234static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2134 unsigned num_items) 2235 unsigned num_items)
@@ -2137,6 +2238,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2137 3 * num_items; 2238 3 * num_items;
2138} 2239}
2139 2240
2241/*
2242 * Doing a truncate won't result in new nodes or leaves, just what we need for
2243 * COW.
2244 */
2245static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
2246 unsigned num_items)
2247{
2248 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2249 num_items;
2250}
2251
2140void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 2252void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
2141int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2253int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2142 struct btrfs_root *root, unsigned long count); 2254 struct btrfs_root *root, unsigned long count);
@@ -2146,6 +2258,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
2146 u64 num_bytes, u64 *refs, u64 *flags); 2258 u64 num_bytes, u64 *refs, u64 *flags);
2147int btrfs_pin_extent(struct btrfs_root *root, 2259int btrfs_pin_extent(struct btrfs_root *root,
2148 u64 bytenr, u64 num, int reserved); 2260 u64 bytenr, u64 num, int reserved);
2261int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2262 struct btrfs_root *root,
2263 u64 bytenr, u64 num_bytes);
2149int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2264int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2150 struct btrfs_root *root, 2265 struct btrfs_root *root,
2151 u64 objectid, u64 offset, u64 bytenr); 2266 u64 objectid, u64 offset, u64 bytenr);
@@ -2196,8 +2311,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
2196 u64 root_objectid, u64 owner, u64 offset); 2311 u64 root_objectid, u64 owner, u64 offset);
2197 2312
2198int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2313int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2199int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 2314int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
2200 u64 num_bytes, int reserve, int sinfo); 2315 u64 start, u64 len);
2201int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 2316int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
2202 struct btrfs_root *root); 2317 struct btrfs_root *root);
2203int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2318int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2240,25 +2355,23 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2240struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 2355struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2241void btrfs_free_block_rsv(struct btrfs_root *root, 2356void btrfs_free_block_rsv(struct btrfs_root *root,
2242 struct btrfs_block_rsv *rsv); 2357 struct btrfs_block_rsv *rsv);
2243void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, 2358int btrfs_block_rsv_add(struct btrfs_root *root,
2244 struct btrfs_block_rsv *rsv);
2245int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2246 struct btrfs_root *root,
2247 struct btrfs_block_rsv *block_rsv, 2359 struct btrfs_block_rsv *block_rsv,
2248 u64 num_bytes); 2360 u64 num_bytes);
2249int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 2361int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2250 struct btrfs_root *root, 2362 struct btrfs_block_rsv *block_rsv,
2363 u64 num_bytes);
2364int btrfs_block_rsv_check(struct btrfs_root *root,
2365 struct btrfs_block_rsv *block_rsv, int min_factor);
2366int btrfs_block_rsv_refill(struct btrfs_root *root,
2251 struct btrfs_block_rsv *block_rsv, 2367 struct btrfs_block_rsv *block_rsv,
2252 u64 min_reserved, int min_factor); 2368 u64 min_reserved);
2253int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 2369int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2254 struct btrfs_block_rsv *dst_rsv, 2370 struct btrfs_block_rsv *dst_rsv,
2255 u64 num_bytes); 2371 u64 num_bytes);
2256void btrfs_block_rsv_release(struct btrfs_root *root, 2372void btrfs_block_rsv_release(struct btrfs_root *root,
2257 struct btrfs_block_rsv *block_rsv, 2373 struct btrfs_block_rsv *block_rsv,
2258 u64 num_bytes); 2374 u64 num_bytes);
2259int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
2260 struct btrfs_root *root,
2261 struct btrfs_block_rsv *rsv);
2262int btrfs_set_block_group_ro(struct btrfs_root *root, 2375int btrfs_set_block_group_ro(struct btrfs_root *root,
2263 struct btrfs_block_group_cache *cache); 2376 struct btrfs_block_group_cache *cache);
2264int btrfs_set_block_group_rw(struct btrfs_root *root, 2377int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2379,6 +2492,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2379 smp_mb(); 2492 smp_mb();
2380 return fs_info->closing; 2493 return fs_info->closing;
2381} 2494}
2495static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2496{
2497 kfree(fs_info->delayed_root);
2498 kfree(fs_info->extent_root);
2499 kfree(fs_info->tree_root);
2500 kfree(fs_info->chunk_root);
2501 kfree(fs_info->dev_root);
2502 kfree(fs_info->csum_root);
2503 kfree(fs_info->super_copy);
2504 kfree(fs_info->super_for_commit);
2505 kfree(fs_info);
2506}
2382 2507
2383/* root-item.c */ 2508/* root-item.c */
2384int btrfs_find_root_ref(struct btrfs_root *tree_root, 2509int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2579,11 +2704,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2579int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2704int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2580int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2705int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2581int btrfs_orphan_cleanup(struct btrfs_root *root); 2706int btrfs_orphan_cleanup(struct btrfs_root *root);
2582void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2583 struct btrfs_pending_snapshot *pending,
2584 u64 *bytes_to_reserve);
2585void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2586 struct btrfs_pending_snapshot *pending);
2587void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2707void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2588 struct btrfs_root *root); 2708 struct btrfs_root *root);
2589int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); 2709int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@ -2697,4 +2817,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
2697int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 2817int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2698 struct btrfs_scrub_progress *progress); 2818 struct btrfs_scrub_progress *progress);
2699 2819
2820/* reada.c */
2821struct reada_control {
2822 struct btrfs_root *root; /* tree to prefetch */
2823 struct btrfs_key key_start;
2824 struct btrfs_key key_end; /* exclusive */
2825 atomic_t elems;
2826 struct kref refcnt;
2827 wait_queue_head_t wait;
2828};
2829struct reada_control *btrfs_reada_add(struct btrfs_root *root,
2830 struct btrfs_key *start, struct btrfs_key *end);
2831int btrfs_reada_wait(void *handle);
2832void btrfs_reada_detach(void *handle);
2833int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
2834 u64 start, int err);
2835
2700#endif 2836#endif
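
Among the ctree.h changes, the reservation helpers make the space math explicit: btrfs_calc_trans_metadata_size() charges three tree paths per item (headroom for splits on top of the COW itself), while the new btrfs_calc_trunc_metadata_size() charges a single path, since a truncate only COWs existing nodes. A worked example under assumed 4 KiB leaf/node sizes and BTRFS_MAX_LEVEL = 8:

/*
 * Worked example, not part of the patch.
 *   one path  = leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)
 *             = 4096 + 4096 * 7 = 32768 bytes
 *   trans size, 1 item = 32768 * 3 = 98304 bytes
 *   trunc size, 1 item = 32768 bytes
 */
static inline unsigned long long example_trunc_metadata_size(unsigned int leafsize,
							     unsigned int nodesize,
							     unsigned int max_level,
							     unsigned int num_items)
{
	return (unsigned long long)(leafsize + nodesize * (max_level - 1)) * num_items;
}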
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index ae4d9cd10961..3a1b939c9ae2 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
591 return 0; 591 return 0;
592 592
593 src_rsv = trans->block_rsv; 593 src_rsv = trans->block_rsv;
594 dst_rsv = &root->fs_info->global_block_rsv; 594 dst_rsv = &root->fs_info->delayed_block_rsv;
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
609 if (!item->bytes_reserved) 609 if (!item->bytes_reserved)
610 return; 610 return;
611 611
612 rsv = &root->fs_info->global_block_rsv; 612 rsv = &root->fs_info->delayed_block_rsv;
613 btrfs_block_rsv_release(root, rsv, 613 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 614 item->bytes_reserved);
615} 615}
@@ -624,13 +624,36 @@ static int btrfs_delayed_inode_reserve_metadata(
624 u64 num_bytes; 624 u64 num_bytes;
625 int ret; 625 int ret;
626 626
627 if (!trans->bytes_reserved)
628 return 0;
629
630 src_rsv = trans->block_rsv; 627 src_rsv = trans->block_rsv;
631 dst_rsv = &root->fs_info->global_block_rsv; 628 dst_rsv = &root->fs_info->delayed_block_rsv;
632 629
633 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 630 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
631
632 /*
633 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
634 * which doesn't reserve space for speed. This is a problem since we
635 * still need to reserve space for this update, so try to reserve the
636 * space.
637 *
638 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
639 * we're accounted for.
640 */
641 if (!trans->bytes_reserved &&
642 src_rsv != &root->fs_info->delalloc_block_rsv) {
643 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
644 /*
645 * Since we're under a transaction reserve_metadata_bytes could
646 * try to commit the transaction which will make it return
647 * EAGAIN to make us stop the transaction we have, so return
648 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
649 */
650 if (ret == -EAGAIN)
651 ret = -ENOSPC;
652 if (!ret)
653 node->bytes_reserved = num_bytes;
654 return ret;
655 }
656
634 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 657 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
635 if (!ret) 658 if (!ret)
636 node->bytes_reserved = num_bytes; 659 node->bytes_reserved = num_bytes;
@@ -646,7 +669,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
646 if (!node->bytes_reserved) 669 if (!node->bytes_reserved)
647 return; 670 return;
648 671
649 rsv = &root->fs_info->global_block_rsv; 672 rsv = &root->fs_info->delayed_block_rsv;
650 btrfs_block_rsv_release(root, rsv, 673 btrfs_block_rsv_release(root, rsv,
651 node->bytes_reserved); 674 node->bytes_reserved);
652 node->bytes_reserved = 0; 675 node->bytes_reserved = 0;
@@ -1026,7 +1049,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1026 path->leave_spinning = 1; 1049 path->leave_spinning = 1;
1027 1050
1028 block_rsv = trans->block_rsv; 1051 block_rsv = trans->block_rsv;
1029 trans->block_rsv = &root->fs_info->global_block_rsv; 1052 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1030 1053
1031 delayed_root = btrfs_get_delayed_root(root); 1054 delayed_root = btrfs_get_delayed_root(root);
1032 1055
@@ -1069,7 +1092,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1069 path->leave_spinning = 1; 1092 path->leave_spinning = 1;
1070 1093
1071 block_rsv = trans->block_rsv; 1094 block_rsv = trans->block_rsv;
1072 trans->block_rsv = &node->root->fs_info->global_block_rsv; 1095 trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
1073 1096
1074 ret = btrfs_insert_delayed_items(trans, path, node->root, node); 1097 ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1075 if (!ret) 1098 if (!ret)
@@ -1149,7 +1172,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1149 goto free_path; 1172 goto free_path;
1150 1173
1151 block_rsv = trans->block_rsv; 1174 block_rsv = trans->block_rsv;
1152 trans->block_rsv = &root->fs_info->global_block_rsv; 1175 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1153 1176
1154 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); 1177 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
1155 if (!ret) 1178 if (!ret)
@@ -1686,11 +1709,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1686 } 1709 }
1687 1710
1688 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); 1711 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
1689 /* 1712 if (ret)
1690 * we must reserve enough space when we start a new transaction, 1713 goto release_node;
1691 * so reserving metadata failure is impossible
1692 */
1693 BUG_ON(ret);
1694 1714
1695 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1715 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1696 delayed_node->inode_dirty = 1; 1716 delayed_node->inode_dirty = 1;
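
The delayed-inode changes above stop pulling metadata reservations from the global reserve, and they add a fallback for btrfs_dirty_inode(): when the joined transaction carries no pre-reserved bytes, the code reserves directly with a no-flush call and translates -EAGAIN into -ENOSPC, because the caller already holds a transaction and cannot let reserve_metadata_bytes() commit it. A hedged sketch of that pattern in isolation, using only helpers visible in the hunk; reserve_one_item_noflush() is a hypothetical wrapper, not part of the patch.

/*
 * Sketch: reserve space for one delayed inode update without flushing,
 * and report -ENOSPC rather than -EAGAIN since the transaction we hold
 * cannot be restarted from here.
 */
static int reserve_one_item_noflush(struct btrfs_root *root,
				    struct btrfs_block_rsv *dst_rsv,
				    u64 *bytes_reserved)
{
	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
	int ret;

	ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
	if (ret == -EAGAIN)
		ret = -ENOSPC;	/* tell btrfs_dirty_inode it is really out of space */
	if (!ret)
		*bytes_reserved = num_bytes;
	return ret;
}
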
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07ea91879a91..102c176fc29c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, 256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
257 int verify) 257 int verify)
258{ 258{
259 u16 csum_size = 259 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
260 btrfs_super_csum_size(&root->fs_info->super_copy);
261 char *result = NULL; 260 char *result = NULL;
262 unsigned long len; 261 unsigned long len;
263 unsigned long cur_len; 262 unsigned long cur_len;
@@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
367 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); 366 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
368 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 367 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
369 while (1) { 368 while (1) {
370 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 369 ret = read_extent_buffer_pages(io_tree, eb, start,
370 WAIT_COMPLETE,
371 btree_get_extent, mirror_num); 371 btree_get_extent, mirror_num);
372 if (!ret && 372 if (!ret &&
373 !verify_parent_transid(io_tree, eb, parent_transid)) 373 !verify_parent_transid(io_tree, eb, parent_transid))
@@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
609 end = eb->start + end - 1; 609 end = eb->start + end - 1;
610err: 610err:
611 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
612 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
613 btree_readahead_hook(root, eb, eb->start, ret);
614 }
615
611 free_extent_buffer(eb); 616 free_extent_buffer(eb);
612out: 617out:
613 return ret; 618 return ret;
614} 619}
615 620
621static int btree_io_failed_hook(struct bio *failed_bio,
622 struct page *page, u64 start, u64 end,
623 u64 mirror_num, struct extent_state *state)
624{
625 struct extent_io_tree *tree;
626 unsigned long len;
627 struct extent_buffer *eb;
628 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
629
630 tree = &BTRFS_I(page->mapping->host)->io_tree;
631 if (page->private == EXTENT_PAGE_PRIVATE)
632 goto out;
633 if (!page->private)
634 goto out;
635
636 len = page->private >> 2;
637 WARN_ON(len == 0);
638
639 eb = alloc_extent_buffer(tree, start, len, page);
640 if (eb == NULL)
641 goto out;
642
643 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
644 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
645 btree_readahead_hook(root, eb, eb->start, -EIO);
646 }
647 free_extent_buffer(eb);
648
649out:
650 return -EIO; /* we fixed nothing */
651}
652
616static void end_workqueue_bio(struct bio *bio, int err) 653static void end_workqueue_bio(struct bio *bio, int err)
617{ 654{
618 struct end_io_wq *end_io_wq = bio->bi_private; 655 struct end_io_wq *end_io_wq = bio->bi_private;
@@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page)
908{ 945{
909 struct extent_io_tree *tree; 946 struct extent_io_tree *tree;
910 tree = &BTRFS_I(page->mapping->host)->io_tree; 947 tree = &BTRFS_I(page->mapping->host)->io_tree;
911 return extent_read_full_page(tree, page, btree_get_extent); 948 return extent_read_full_page(tree, page, btree_get_extent, 0);
912} 949}
913 950
914static int btree_releasepage(struct page *page, gfp_t gfp_flags) 951static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
974 if (!buf) 1011 if (!buf)
975 return 0; 1012 return 0;
976 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1013 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
977 buf, 0, 0, btree_get_extent, 0); 1014 buf, 0, WAIT_NONE, btree_get_extent, 0);
978 free_extent_buffer(buf); 1015 free_extent_buffer(buf);
979 return ret; 1016 return ret;
980} 1017}
981 1018
1019int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1020 int mirror_num, struct extent_buffer **eb)
1021{
1022 struct extent_buffer *buf = NULL;
1023 struct inode *btree_inode = root->fs_info->btree_inode;
1024 struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1025 int ret;
1026
1027 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1028 if (!buf)
1029 return 0;
1030
1031 set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
1032
1033 ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
1034 btree_get_extent, mirror_num);
1035 if (ret) {
1036 free_extent_buffer(buf);
1037 return ret;
1038 }
1039
1040 if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
1041 free_extent_buffer(buf);
1042 return -EIO;
1043 } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
1044 *eb = buf;
1045 } else {
1046 free_extent_buffer(buf);
1047 }
1048 return 0;
1049}
1050
982struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1051struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
983 u64 bytenr, u32 blocksize) 1052 u64 bytenr, u32 blocksize)
984{ 1053{
@@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1135 1204
1136 generation = btrfs_root_generation(&root->root_item); 1205 generation = btrfs_root_generation(&root->root_item);
1137 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1206 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1207 root->commit_root = NULL;
1138 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1208 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1139 blocksize, generation); 1209 blocksize, generation);
1140 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { 1210 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1141 free_extent_buffer(root->node); 1211 free_extent_buffer(root->node);
1212 root->node = NULL;
1142 return -EIO; 1213 return -EIO;
1143 } 1214 }
1144 root->commit_root = btrfs_root_node(root); 1215 root->commit_root = btrfs_root_node(root);
@@ -1577,6 +1648,235 @@ sleep:
1577 return 0; 1648 return 0;
1578} 1649}
1579 1650
1651/*
1652 * this will find the highest generation in the array of
1653 * root backups. The index of the highest array is returned,
1654 * or -1 if we can't find anything.
1655 *
1656 * We check to make sure the array is valid by comparing the
1657 * generation of the latest root in the array with the generation
1658 * in the super block. If they don't match we pitch it.
1659 */
1660static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
1661{
1662 u64 cur;
1663 int newest_index = -1;
1664 struct btrfs_root_backup *root_backup;
1665 int i;
1666
1667 for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1668 root_backup = info->super_copy->super_roots + i;
1669 cur = btrfs_backup_tree_root_gen(root_backup);
1670 if (cur == newest_gen)
1671 newest_index = i;
1672 }
1673
1674 /* check to see if we actually wrapped around */
1675 if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
1676 root_backup = info->super_copy->super_roots;
1677 cur = btrfs_backup_tree_root_gen(root_backup);
1678 if (cur == newest_gen)
1679 newest_index = 0;
1680 }
1681 return newest_index;
1682}
1683
1684
1685/*
1686 * find the oldest backup so we know where to store new entries
1687 * in the backup array. This will set the backup_root_index
1688 * field in the fs_info struct
1689 */
1690static void find_oldest_super_backup(struct btrfs_fs_info *info,
1691 u64 newest_gen)
1692{
1693 int newest_index = -1;
1694
1695 newest_index = find_newest_super_backup(info, newest_gen);
1696 /* if there was garbage in there, just move along */
1697 if (newest_index == -1) {
1698 info->backup_root_index = 0;
1699 } else {
1700 info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
1701 }
1702}
1703
1704/*
1705 * copy all the root pointers into the super backup array.
1706 * this will bump the backup pointer by one when it is
1707 * done
1708 */
1709static void backup_super_roots(struct btrfs_fs_info *info)
1710{
1711 int next_backup;
1712 struct btrfs_root_backup *root_backup;
1713 int last_backup;
1714
1715 next_backup = info->backup_root_index;
1716 last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
1717 BTRFS_NUM_BACKUP_ROOTS;
1718
1719 /*
1720 * just overwrite the last backup if we're at the same generation
1721 * this happens only at umount
1722 */
1723 root_backup = info->super_for_commit->super_roots + last_backup;
1724 if (btrfs_backup_tree_root_gen(root_backup) ==
1725 btrfs_header_generation(info->tree_root->node))
1726 next_backup = last_backup;
1727
1728 root_backup = info->super_for_commit->super_roots + next_backup;
1729
1730 /*
1731 * make sure all of our padding and empty slots get zero filled
1732 * regardless of which ones we use today
1733 */
1734 memset(root_backup, 0, sizeof(*root_backup));
1735
1736 info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1737
1738 btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1739 btrfs_set_backup_tree_root_gen(root_backup,
1740 btrfs_header_generation(info->tree_root->node));
1741
1742 btrfs_set_backup_tree_root_level(root_backup,
1743 btrfs_header_level(info->tree_root->node));
1744
1745 btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1746 btrfs_set_backup_chunk_root_gen(root_backup,
1747 btrfs_header_generation(info->chunk_root->node));
1748 btrfs_set_backup_chunk_root_level(root_backup,
1749 btrfs_header_level(info->chunk_root->node));
1750
1751 btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
1752 btrfs_set_backup_extent_root_gen(root_backup,
1753 btrfs_header_generation(info->extent_root->node));
1754 btrfs_set_backup_extent_root_level(root_backup,
1755 btrfs_header_level(info->extent_root->node));
1756
1757 /*
1758 * we might commit during log recovery, which happens before we set
1759 * the fs_root. Make sure it is valid before we fill it in.
1760 */
1761 if (info->fs_root && info->fs_root->node) {
1762 btrfs_set_backup_fs_root(root_backup,
1763 info->fs_root->node->start);
1764 btrfs_set_backup_fs_root_gen(root_backup,
1765 btrfs_header_generation(info->fs_root->node));
1766 btrfs_set_backup_fs_root_level(root_backup,
1767 btrfs_header_level(info->fs_root->node));
1768 }
1769
1770 btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1771 btrfs_set_backup_dev_root_gen(root_backup,
1772 btrfs_header_generation(info->dev_root->node));
1773 btrfs_set_backup_dev_root_level(root_backup,
1774 btrfs_header_level(info->dev_root->node));
1775
1776 btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
1777 btrfs_set_backup_csum_root_gen(root_backup,
1778 btrfs_header_generation(info->csum_root->node));
1779 btrfs_set_backup_csum_root_level(root_backup,
1780 btrfs_header_level(info->csum_root->node));
1781
1782 btrfs_set_backup_total_bytes(root_backup,
1783 btrfs_super_total_bytes(info->super_copy));
1784 btrfs_set_backup_bytes_used(root_backup,
1785 btrfs_super_bytes_used(info->super_copy));
1786 btrfs_set_backup_num_devices(root_backup,
1787 btrfs_super_num_devices(info->super_copy));
1788
1789 /*
1790 * if we don't copy this out to the super_copy, it won't get remembered
1791 * for the next commit
1792 */
1793 memcpy(&info->super_copy->super_roots,
1794 &info->super_for_commit->super_roots,
1795 sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1796}
1797
1798/*
1799 * this copies info out of the root backup array and back into
1800 * the in-memory super block. It is meant to help iterate through
1801 * the array, so you send it the number of backups you've already
1802 * tried and the last backup index you used.
1803 *
1804 * this returns -1 when it has tried all the backups
1805 */
1806static noinline int next_root_backup(struct btrfs_fs_info *info,
1807 struct btrfs_super_block *super,
1808 int *num_backups_tried, int *backup_index)
1809{
1810 struct btrfs_root_backup *root_backup;
1811 int newest = *backup_index;
1812
1813 if (*num_backups_tried == 0) {
1814 u64 gen = btrfs_super_generation(super);
1815
1816 newest = find_newest_super_backup(info, gen);
1817 if (newest == -1)
1818 return -1;
1819
1820 *backup_index = newest;
1821 *num_backups_tried = 1;
1822 } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
1823 /* we've tried all the backups, all done */
1824 return -1;
1825 } else {
1826 /* jump to the next oldest backup */
1827 newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
1828 BTRFS_NUM_BACKUP_ROOTS;
1829 *backup_index = newest;
1830 *num_backups_tried += 1;
1831 }
1832 root_backup = super->super_roots + newest;
1833
1834 btrfs_set_super_generation(super,
1835 btrfs_backup_tree_root_gen(root_backup));
1836 btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1837 btrfs_set_super_root_level(super,
1838 btrfs_backup_tree_root_level(root_backup));
1839 btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1840
1841 /*
1842 * fixme: the total bytes and num_devices need to match or we should
1843 * need a fsck
1844 */
1845 btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
1846 btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
1847 return 0;
1848}
1849
1850/* helper to cleanup tree roots */
1851static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1852{
1853 free_extent_buffer(info->tree_root->node);
1854 free_extent_buffer(info->tree_root->commit_root);
1855 free_extent_buffer(info->dev_root->node);
1856 free_extent_buffer(info->dev_root->commit_root);
1857 free_extent_buffer(info->extent_root->node);
1858 free_extent_buffer(info->extent_root->commit_root);
1859 free_extent_buffer(info->csum_root->node);
1860 free_extent_buffer(info->csum_root->commit_root);
1861
1862 info->tree_root->node = NULL;
1863 info->tree_root->commit_root = NULL;
1864 info->dev_root->node = NULL;
1865 info->dev_root->commit_root = NULL;
1866 info->extent_root->node = NULL;
1867 info->extent_root->commit_root = NULL;
1868 info->csum_root->node = NULL;
1869 info->csum_root->commit_root = NULL;
1870
1871 if (chunk_root) {
1872 free_extent_buffer(info->chunk_root->node);
1873 free_extent_buffer(info->chunk_root->commit_root);
1874 info->chunk_root->node = NULL;
1875 info->chunk_root->commit_root = NULL;
1876 }
1877}
1878
1879
1580struct btrfs_root *open_ctree(struct super_block *sb, 1880struct btrfs_root *open_ctree(struct super_block *sb,
1581 struct btrfs_fs_devices *fs_devices, 1881 struct btrfs_fs_devices *fs_devices,
1582 char *options) 1882 char *options)
@@ -1604,6 +1904,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1604 1904
1605 int ret; 1905 int ret;
1606 int err = -EINVAL; 1906 int err = -EINVAL;
1907 int num_backups_tried = 0;
1908 int backup_index = 0;
1607 1909
1608 struct btrfs_super_block *disk_super; 1910 struct btrfs_super_block *disk_super;
1609 1911
@@ -1648,6 +1950,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1648 spin_lock_init(&fs_info->fs_roots_radix_lock); 1950 spin_lock_init(&fs_info->fs_roots_radix_lock);
1649 spin_lock_init(&fs_info->delayed_iput_lock); 1951 spin_lock_init(&fs_info->delayed_iput_lock);
1650 spin_lock_init(&fs_info->defrag_inodes_lock); 1952 spin_lock_init(&fs_info->defrag_inodes_lock);
1953 spin_lock_init(&fs_info->free_chunk_lock);
1651 mutex_init(&fs_info->reloc_mutex); 1954 mutex_init(&fs_info->reloc_mutex);
1652 1955
1653 init_completion(&fs_info->kobj_unregister); 1956 init_completion(&fs_info->kobj_unregister);
@@ -1665,8 +1968,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1665 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 1968 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1666 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 1969 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1667 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 1970 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1668 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); 1971 btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
1669 mutex_init(&fs_info->durable_block_rsv_mutex);
1670 atomic_set(&fs_info->nr_async_submits, 0); 1972 atomic_set(&fs_info->nr_async_submits, 0);
1671 atomic_set(&fs_info->async_delalloc_pages, 0); 1973 atomic_set(&fs_info->async_delalloc_pages, 0);
1672 atomic_set(&fs_info->async_submit_draining, 0); 1974 atomic_set(&fs_info->async_submit_draining, 0);
@@ -1677,6 +1979,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1677 fs_info->metadata_ratio = 0; 1979 fs_info->metadata_ratio = 0;
1678 fs_info->defrag_inodes = RB_ROOT; 1980 fs_info->defrag_inodes = RB_ROOT;
1679 fs_info->trans_no_join = 0; 1981 fs_info->trans_no_join = 0;
1982 fs_info->free_chunk_space = 0;
1983
1984 /* readahead state */
1985 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
1986 spin_lock_init(&fs_info->reada_lock);
1680 1987
1681 fs_info->thread_pool_size = min_t(unsigned long, 1988 fs_info->thread_pool_size = min_t(unsigned long,
1682 num_online_cpus() + 2, 8); 1989 num_online_cpus() + 2, 8);
@@ -1766,14 +2073,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1766 goto fail_alloc; 2073 goto fail_alloc;
1767 } 2074 }
1768 2075
1769 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 2076 memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
1770 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 2077 memcpy(fs_info->super_for_commit, fs_info->super_copy,
1771 sizeof(fs_info->super_for_commit)); 2078 sizeof(*fs_info->super_for_commit));
1772 brelse(bh); 2079 brelse(bh);
1773 2080
1774 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); 2081 memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
1775 2082
1776 disk_super = &fs_info->super_copy; 2083 disk_super = fs_info->super_copy;
1777 if (!btrfs_super_root(disk_super)) 2084 if (!btrfs_super_root(disk_super))
1778 goto fail_alloc; 2085 goto fail_alloc;
1779 2086
@@ -1783,6 +2090,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1783 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 2090 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1784 2091
1785 /* 2092 /*
2093 * run through our array of backup supers and setup
2094 * our ring pointer to the oldest one
2095 */
2096 generation = btrfs_super_generation(disk_super);
2097 find_oldest_super_backup(fs_info, generation);
2098
2099 /*
1786 * In the long term, we'll store the compression type in the super 2100 * In the long term, we'll store the compression type in the super
1787 * block, and it'll be used for per file compression control. 2101 * block, and it'll be used for per file compression control.
1788 */ 2102 */
@@ -1870,6 +2184,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1870 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", 2184 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
1871 fs_info->thread_pool_size, 2185 fs_info->thread_pool_size,
1872 &fs_info->generic_worker); 2186 &fs_info->generic_worker);
2187 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2188 fs_info->thread_pool_size,
2189 &fs_info->generic_worker);
1873 2190
1874 /* 2191 /*
1875 * endios are largely parallel and should have a very 2192 * endios are largely parallel and should have a very
@@ -1880,6 +2197,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1880 2197
1881 fs_info->endio_write_workers.idle_thresh = 2; 2198 fs_info->endio_write_workers.idle_thresh = 2;
1882 fs_info->endio_meta_write_workers.idle_thresh = 2; 2199 fs_info->endio_meta_write_workers.idle_thresh = 2;
2200 fs_info->readahead_workers.idle_thresh = 2;
1883 2201
1884 btrfs_start_workers(&fs_info->workers, 1); 2202 btrfs_start_workers(&fs_info->workers, 1);
1885 btrfs_start_workers(&fs_info->generic_worker, 1); 2203 btrfs_start_workers(&fs_info->generic_worker, 1);
@@ -1893,6 +2211,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1893 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 2211 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1894 btrfs_start_workers(&fs_info->delayed_workers, 1); 2212 btrfs_start_workers(&fs_info->delayed_workers, 1);
1895 btrfs_start_workers(&fs_info->caching_workers, 1); 2213 btrfs_start_workers(&fs_info->caching_workers, 1);
2214 btrfs_start_workers(&fs_info->readahead_workers, 1);
1896 2215
1897 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 2216 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1898 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 2217 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1939,7 +2258,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1939 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2258 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
1940 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", 2259 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
1941 sb->s_id); 2260 sb->s_id);
1942 goto fail_chunk_root; 2261 goto fail_tree_roots;
1943 } 2262 }
1944 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); 2263 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1945 chunk_root->commit_root = btrfs_root_node(chunk_root); 2264 chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1954,11 +2273,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1954 if (ret) { 2273 if (ret) {
1955 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2274 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1956 sb->s_id); 2275 sb->s_id);
1957 goto fail_chunk_root; 2276 goto fail_tree_roots;
1958 } 2277 }
1959 2278
1960 btrfs_close_extra_devices(fs_devices); 2279 btrfs_close_extra_devices(fs_devices);
1961 2280
2281retry_root_backup:
1962 blocksize = btrfs_level_size(tree_root, 2282 blocksize = btrfs_level_size(tree_root,
1963 btrfs_super_root_level(disk_super)); 2283 btrfs_super_root_level(disk_super));
1964 generation = btrfs_super_generation(disk_super); 2284 generation = btrfs_super_generation(disk_super);
@@ -1966,32 +2286,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1966 tree_root->node = read_tree_block(tree_root, 2286 tree_root->node = read_tree_block(tree_root,
1967 btrfs_super_root(disk_super), 2287 btrfs_super_root(disk_super),
1968 blocksize, generation); 2288 blocksize, generation);
1969 if (!tree_root->node) 2289 if (!tree_root->node ||
1970 goto fail_chunk_root; 2290 !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1971 if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1972 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", 2291 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
1973 sb->s_id); 2292 sb->s_id);
1974 goto fail_tree_root; 2293
2294 goto recovery_tree_root;
1975 } 2295 }
2296
1976 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2297 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1977 tree_root->commit_root = btrfs_root_node(tree_root); 2298 tree_root->commit_root = btrfs_root_node(tree_root);
1978 2299
1979 ret = find_and_setup_root(tree_root, fs_info, 2300 ret = find_and_setup_root(tree_root, fs_info,
1980 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2301 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1981 if (ret) 2302 if (ret)
1982 goto fail_tree_root; 2303 goto recovery_tree_root;
1983 extent_root->track_dirty = 1; 2304 extent_root->track_dirty = 1;
1984 2305
1985 ret = find_and_setup_root(tree_root, fs_info, 2306 ret = find_and_setup_root(tree_root, fs_info,
1986 BTRFS_DEV_TREE_OBJECTID, dev_root); 2307 BTRFS_DEV_TREE_OBJECTID, dev_root);
1987 if (ret) 2308 if (ret)
1988 goto fail_extent_root; 2309 goto recovery_tree_root;
1989 dev_root->track_dirty = 1; 2310 dev_root->track_dirty = 1;
1990 2311
1991 ret = find_and_setup_root(tree_root, fs_info, 2312 ret = find_and_setup_root(tree_root, fs_info,
1992 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2313 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1993 if (ret) 2314 if (ret)
1994 goto fail_dev_root; 2315 goto recovery_tree_root;
1995 2316
1996 csum_root->track_dirty = 1; 2317 csum_root->track_dirty = 1;
1997 2318
@@ -2124,22 +2445,13 @@ fail_cleaner:
2124 2445
2125fail_block_groups: 2446fail_block_groups:
2126 btrfs_free_block_groups(fs_info); 2447 btrfs_free_block_groups(fs_info);
2127 free_extent_buffer(csum_root->node); 2448
2128 free_extent_buffer(csum_root->commit_root); 2449fail_tree_roots:
2129fail_dev_root: 2450 free_root_pointers(fs_info, 1);
2130 free_extent_buffer(dev_root->node); 2451
2131 free_extent_buffer(dev_root->commit_root);
2132fail_extent_root:
2133 free_extent_buffer(extent_root->node);
2134 free_extent_buffer(extent_root->commit_root);
2135fail_tree_root:
2136 free_extent_buffer(tree_root->node);
2137 free_extent_buffer(tree_root->commit_root);
2138fail_chunk_root:
2139 free_extent_buffer(chunk_root->node);
2140 free_extent_buffer(chunk_root->commit_root);
2141fail_sb_buffer: 2452fail_sb_buffer:
2142 btrfs_stop_workers(&fs_info->generic_worker); 2453 btrfs_stop_workers(&fs_info->generic_worker);
2454 btrfs_stop_workers(&fs_info->readahead_workers);
2143 btrfs_stop_workers(&fs_info->fixup_workers); 2455 btrfs_stop_workers(&fs_info->fixup_workers);
2144 btrfs_stop_workers(&fs_info->delalloc_workers); 2456 btrfs_stop_workers(&fs_info->delalloc_workers);
2145 btrfs_stop_workers(&fs_info->workers); 2457 btrfs_stop_workers(&fs_info->workers);
@@ -2152,7 +2464,6 @@ fail_sb_buffer:
2152 btrfs_stop_workers(&fs_info->delayed_workers); 2464 btrfs_stop_workers(&fs_info->delayed_workers);
2153 btrfs_stop_workers(&fs_info->caching_workers); 2465 btrfs_stop_workers(&fs_info->caching_workers);
2154fail_alloc: 2466fail_alloc:
2155 kfree(fs_info->delayed_root);
2156fail_iput: 2467fail_iput:
2157 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2468 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2158 iput(fs_info->btree_inode); 2469 iput(fs_info->btree_inode);
@@ -2164,13 +2475,27 @@ fail_bdi:
2164fail_srcu: 2475fail_srcu:
2165 cleanup_srcu_struct(&fs_info->subvol_srcu); 2476 cleanup_srcu_struct(&fs_info->subvol_srcu);
2166fail: 2477fail:
2167 kfree(extent_root); 2478 free_fs_info(fs_info);
2168 kfree(tree_root);
2169 kfree(fs_info);
2170 kfree(chunk_root);
2171 kfree(dev_root);
2172 kfree(csum_root);
2173 return ERR_PTR(err); 2479 return ERR_PTR(err);
2480
2481recovery_tree_root:
2482
2483 if (!btrfs_test_opt(tree_root, RECOVERY))
2484 goto fail_tree_roots;
2485
2486 free_root_pointers(fs_info, 0);
2487
2488 /* don't use the log in recovery mode, it won't be valid */
2489 btrfs_set_super_log_root(disk_super, 0);
2490
2491 /* we can't trust the free space cache either */
2492 btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2493
2494 ret = next_root_backup(fs_info, fs_info->super_copy,
2495 &num_backups_tried, &backup_index);
2496 if (ret == -1)
2497 goto fail_block_groups;
2498 goto retry_root_backup;
2174} 2499}
2175 2500
2176static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2501static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2338,10 +2663,11 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2338 int total_errors = 0; 2663 int total_errors = 0;
2339 u64 flags; 2664 u64 flags;
2340 2665
2341 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 2666 max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
2342 do_barriers = !btrfs_test_opt(root, NOBARRIER); 2667 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2668 backup_super_roots(root->fs_info);
2343 2669
2344 sb = &root->fs_info->super_for_commit; 2670 sb = root->fs_info->super_for_commit;
2345 dev_item = &sb->dev_item; 2671 dev_item = &sb->dev_item;
2346 2672
2347 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2673 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
@@ -2545,8 +2871,6 @@ int close_ctree(struct btrfs_root *root)
2545 /* clear out the rbtree of defraggable inodes */ 2871 /* clear out the rbtree of defraggable inodes */
2546 btrfs_run_defrag_inodes(root->fs_info); 2872 btrfs_run_defrag_inodes(root->fs_info);
2547 2873
2548 btrfs_put_block_group_cache(fs_info);
2549
2550 /* 2874 /*
2551 * Here come 2 situations when btrfs is broken to flip readonly: 2875 * Here come 2 situations when btrfs is broken to flip readonly:
2552 * 2876 *
@@ -2572,6 +2896,8 @@ int close_ctree(struct btrfs_root *root)
2572 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2896 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2573 } 2897 }
2574 2898
2899 btrfs_put_block_group_cache(fs_info);
2900
2575 kthread_stop(root->fs_info->transaction_kthread); 2901 kthread_stop(root->fs_info->transaction_kthread);
2576 kthread_stop(root->fs_info->cleaner_kthread); 2902 kthread_stop(root->fs_info->cleaner_kthread);
2577 2903
@@ -2603,7 +2929,6 @@ int close_ctree(struct btrfs_root *root)
2603 del_fs_roots(fs_info); 2929 del_fs_roots(fs_info);
2604 2930
2605 iput(fs_info->btree_inode); 2931 iput(fs_info->btree_inode);
2606 kfree(fs_info->delayed_root);
2607 2932
2608 btrfs_stop_workers(&fs_info->generic_worker); 2933 btrfs_stop_workers(&fs_info->generic_worker);
2609 btrfs_stop_workers(&fs_info->fixup_workers); 2934 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2617,6 +2942,7 @@ int close_ctree(struct btrfs_root *root)
2617 btrfs_stop_workers(&fs_info->submit_workers); 2942 btrfs_stop_workers(&fs_info->submit_workers);
2618 btrfs_stop_workers(&fs_info->delayed_workers); 2943 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers); 2944 btrfs_stop_workers(&fs_info->caching_workers);
2945 btrfs_stop_workers(&fs_info->readahead_workers);
2620 2946
2621 btrfs_close_devices(fs_info->fs_devices); 2947 btrfs_close_devices(fs_info->fs_devices);
2622 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2948 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2624,12 +2950,7 @@ int close_ctree(struct btrfs_root *root)
2624 bdi_destroy(&fs_info->bdi); 2950 bdi_destroy(&fs_info->bdi);
2625 cleanup_srcu_struct(&fs_info->subvol_srcu); 2951 cleanup_srcu_struct(&fs_info->subvol_srcu);
2626 2952
2627 kfree(fs_info->extent_root); 2953 free_fs_info(fs_info);
2628 kfree(fs_info->tree_root);
2629 kfree(fs_info->chunk_root);
2630 kfree(fs_info->dev_root);
2631 kfree(fs_info->csum_root);
2632 kfree(fs_info);
2633 2954
2634 return 0; 2955 return 0;
2635} 2956}
@@ -2735,7 +3056,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2735 return ret; 3056 return ret;
2736} 3057}
2737 3058
2738int btree_lock_page_hook(struct page *page) 3059static int btree_lock_page_hook(struct page *page, void *data,
3060 void (*flush_fn)(void *))
2739{ 3061{
2740 struct inode *inode = page->mapping->host; 3062 struct inode *inode = page->mapping->host;
2741 struct btrfs_root *root = BTRFS_I(inode)->root; 3063 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +3074,10 @@ int btree_lock_page_hook(struct page *page)
2752 if (!eb) 3074 if (!eb)
2753 goto out; 3075 goto out;
2754 3076
2755 btrfs_tree_lock(eb); 3077 if (!btrfs_try_tree_write_lock(eb)) {
3078 flush_fn(data);
3079 btrfs_tree_lock(eb);
3080 }
2756 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3081 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2757 3082
2758 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3083 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +3092,10 @@ int btree_lock_page_hook(struct page *page)
2767 btrfs_tree_unlock(eb); 3092 btrfs_tree_unlock(eb);
2768 free_extent_buffer(eb); 3093 free_extent_buffer(eb);
2769out: 3094out:
2770 lock_page(page); 3095 if (!trylock_page(page)) {
3096 flush_fn(data);
3097 lock_page(page);
3098 }
2771 return 0; 3099 return 0;
2772} 3100}
2773 3101
@@ -3123,6 +3451,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3123static struct extent_io_ops btree_extent_io_ops = { 3451static struct extent_io_ops btree_extent_io_ops = {
3124 .write_cache_pages_lock_hook = btree_lock_page_hook, 3452 .write_cache_pages_lock_hook = btree_lock_page_hook,
3125 .readpage_end_io_hook = btree_readpage_end_io_hook, 3453 .readpage_end_io_hook = btree_readpage_end_io_hook,
3454 .readpage_io_failed_hook = btree_io_failed_hook,
3126 .submit_bio_hook = btree_submit_bio_hook, 3455 .submit_bio_hook = btree_submit_bio_hook,
3127 /* note we're sharing with inode.c for the merge bio hook */ 3456 /* note we're sharing with inode.c for the merge bio hook */
3128 .merge_bio_hook = btrfs_merge_bio_hook, 3457 .merge_bio_hook = btrfs_merge_bio_hook,
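
The backup-root code added to disk-io.c above treats super_roots as a small ring buffer: find_newest_super_backup() picks the slot whose generation matches the super block, new backups go one slot past the newest, and next_root_backup() walks backwards one slot per recovery attempt. The index arithmetic is the easy part to get wrong, so here is a self-contained user-space model of just that ring walk; the slot count is assumed to be 4 (the kernel constant is BTRFS_NUM_BACKUP_ROOTS) and nothing else here is kernel code.

#include <stdio.h>

#define NUM_BACKUP_ROOTS 4	/* assumed value of BTRFS_NUM_BACKUP_ROOTS */

/* slot the next backup is written into, given the newest valid slot */
static int next_backup_slot(int newest_index)
{
	return (newest_index + 1) % NUM_BACKUP_ROOTS;
}

/* previous (older) slot, as used when retrying progressively older roots */
static int older_backup_slot(int index)
{
	return (index + NUM_BACKUP_ROOTS - 1) % NUM_BACKUP_ROOTS;
}

int main(void)
{
	int newest = 2;	/* pretend slot 2 matches the super generation */
	int slot = newest;
	int attempt;

	printf("next backup is written into slot %d\n", next_backup_slot(newest));
	for (attempt = 0; attempt < NUM_BACKUP_ROOTS; attempt++) {
		printf("recovery attempt %d reads slot %d\n", attempt + 1, slot);
		slot = older_backup_slot(slot);
	}
	return 0;
}
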
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67f..c99d0a8f13fa 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid); 40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid); 42 u64 parent_transid);
43int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
44 int mirror_num, struct extent_buffer **eb);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 45struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
83 struct btrfs_fs_info *fs_info); 85 struct btrfs_fs_info *fs_info);
84int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
85 struct btrfs_root *root); 87 struct btrfs_root *root);
86int btree_lock_page_hook(struct page *page);
87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_init_lockdep(void); 90void btrfs_init_lockdep(void);
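
The new disk-io.h prototype exports the per-block flavour of readahead: reada_tree_block_flagged() reads a single tree block from a specific mirror without sleeping on the page lock, and only hands back the buffer if it is up to date and not flagged corrupt. A small caller sketch assuming just that prototype; read_one_mirror() is a made-up wrapper for illustration.

/*
 * Illustrative caller: try one mirror of one tree block.  On success
 * the returned buffer must eventually be dropped with
 * free_extent_buffer(); NULL means the read failed, hit a corrupt
 * block, or simply has not completed yet.
 */
static struct extent_buffer *read_one_mirror(struct btrfs_root *root,
					     u64 bytenr, u32 blocksize,
					     int mirror_num)
{
	struct extent_buffer *eb = NULL;

	if (reada_tree_block_flagged(root, bytenr, blocksize,
				     mirror_num, &eb))
		return NULL;
	return eb;	/* may still be NULL if the I/O is in flight */
}
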
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a2462f..9879bd474632 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h>
26#include "compat.h" 27#include "compat.h"
27#include "hash.h" 28#include "hash.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -52,6 +53,21 @@ enum {
52 CHUNK_ALLOC_LIMITED = 2, 53 CHUNK_ALLOC_LIMITED = 2,
53}; 54};
54 55
56/*
57 * Control how reservations are dealt with.
58 *
59 * RESERVE_FREE - freeing a reservation.
60 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
61 * ENOSPC accounting
62 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
63 * bytes_may_use as the ENOSPC accounting is done elsewhere
64 */
65enum {
66 RESERVE_FREE = 0,
67 RESERVE_ALLOC = 1,
68 RESERVE_ALLOC_NO_ACCOUNT = 2,
69};
70
55static int update_block_group(struct btrfs_trans_handle *trans, 71static int update_block_group(struct btrfs_trans_handle *trans,
56 struct btrfs_root *root, 72 struct btrfs_root *root,
57 u64 bytenr, u64 num_bytes, int alloc); 73 u64 bytenr, u64 num_bytes, int alloc);
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
81 struct btrfs_key *key); 97 struct btrfs_key *key);
82static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 98static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
83 int dump_block_groups); 99 int dump_block_groups);
100static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
101 u64 num_bytes, int reserve);
84 102
85static noinline int 103static noinline int
86block_group_cache_done(struct btrfs_block_group_cache *cache) 104block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104 if (atomic_dec_and_test(&cache->count)) { 122 if (atomic_dec_and_test(&cache->count)) {
105 WARN_ON(cache->pinned > 0); 123 WARN_ON(cache->pinned > 0);
106 WARN_ON(cache->reserved > 0); 124 WARN_ON(cache->reserved > 0);
107 WARN_ON(cache->reserved_pinned > 0);
108 kfree(cache->free_space_ctl); 125 kfree(cache->free_space_ctl);
109 kfree(cache); 126 kfree(cache);
110 } 127 }
@@ -465,7 +482,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
465 * we likely hold important locks. 482 * we likely hold important locks.
466 */ 483 */
467 if (trans && (!trans->transaction->in_commit) && 484 if (trans && (!trans->transaction->in_commit) &&
468 (root && root != root->fs_info->tree_root)) { 485 (root && root != root->fs_info->tree_root) &&
486 btrfs_test_opt(root, SPACE_CACHE)) {
469 spin_lock(&cache->lock); 487 spin_lock(&cache->lock);
470 if (cache->cached != BTRFS_CACHE_NO) { 488 if (cache->cached != BTRFS_CACHE_NO) {
471 spin_unlock(&cache->lock); 489 spin_unlock(&cache->lock);
@@ -1770,18 +1788,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1770{ 1788{
1771 int ret; 1789 int ret;
1772 u64 discarded_bytes = 0; 1790 u64 discarded_bytes = 0;
1773 struct btrfs_multi_bio *multi = NULL; 1791 struct btrfs_bio *bbio = NULL;
1774 1792
1775 1793
1776 /* Tell the block device(s) that the sectors can be discarded */ 1794 /* Tell the block device(s) that the sectors can be discarded */
1777 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1795 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1778 bytenr, &num_bytes, &multi, 0); 1796 bytenr, &num_bytes, &bbio, 0);
1779 if (!ret) { 1797 if (!ret) {
1780 struct btrfs_bio_stripe *stripe = multi->stripes; 1798 struct btrfs_bio_stripe *stripe = bbio->stripes;
1781 int i; 1799 int i;
1782 1800
1783 1801
1784 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1802 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1785 if (!stripe->dev->can_discard) 1803 if (!stripe->dev->can_discard)
1786 continue; 1804 continue;
1787 1805
@@ -1800,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1800 */ 1818 */
1801 ret = 0; 1819 ret = 0;
1802 } 1820 }
1803 kfree(multi); 1821 kfree(bbio);
1804 } 1822 }
1805 1823
1806 if (actual_bytes) 1824 if (actual_bytes)
@@ -2700,6 +2718,13 @@ again:
2700 goto again; 2718 goto again;
2701 } 2719 }
2702 2720
2721 /* We've already setup this transaction, go ahead and exit */
2722 if (block_group->cache_generation == trans->transid &&
2723 i_size_read(inode)) {
2724 dcs = BTRFS_DC_SETUP;
2725 goto out_put;
2726 }
2727
2703 /* 2728 /*
2704 * We want to set the generation to 0, that way if anything goes wrong 2729 * We want to set the generation to 0, that way if anything goes wrong
2705 * from here on out we know not to trust this cache when we load up next 2730 * from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2774,15 @@ again:
2749 if (!ret) 2774 if (!ret)
2750 dcs = BTRFS_DC_SETUP; 2775 dcs = BTRFS_DC_SETUP;
2751 btrfs_free_reserved_data_space(inode, num_pages); 2776 btrfs_free_reserved_data_space(inode, num_pages);
2777
2752out_put: 2778out_put:
2753 iput(inode); 2779 iput(inode);
2754out_free: 2780out_free:
2755 btrfs_release_path(path); 2781 btrfs_release_path(path);
2756out: 2782out:
2757 spin_lock(&block_group->lock); 2783 spin_lock(&block_group->lock);
2784 if (!ret)
2785 block_group->cache_generation = trans->transid;
2758 block_group->disk_cache_state = dcs; 2786 block_group->disk_cache_state = dcs;
2759 spin_unlock(&block_group->lock); 2787 spin_unlock(&block_group->lock);
2760 2788
@@ -3122,16 +3150,13 @@ commit_trans:
3122 return -ENOSPC; 3150 return -ENOSPC;
3123 } 3151 }
3124 data_sinfo->bytes_may_use += bytes; 3152 data_sinfo->bytes_may_use += bytes;
3125 BTRFS_I(inode)->reserved_bytes += bytes;
3126 spin_unlock(&data_sinfo->lock); 3153 spin_unlock(&data_sinfo->lock);
3127 3154
3128 return 0; 3155 return 0;
3129} 3156}
3130 3157
3131/* 3158/*
3132 * called when we are clearing an delalloc extent from the 3159 * Called if we need to clear a data reservation for this inode.
3133 * inode's io_tree or there was an error for whatever reason
3134 * after calling btrfs_check_data_free_space
3135 */ 3160 */
3136void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3161void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3137{ 3162{
@@ -3144,7 +3169,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3144 data_sinfo = BTRFS_I(inode)->space_info; 3169 data_sinfo = BTRFS_I(inode)->space_info;
3145 spin_lock(&data_sinfo->lock); 3170 spin_lock(&data_sinfo->lock);
3146 data_sinfo->bytes_may_use -= bytes; 3171 data_sinfo->bytes_may_use -= bytes;
3147 BTRFS_I(inode)->reserved_bytes -= bytes;
3148 spin_unlock(&data_sinfo->lock); 3172 spin_unlock(&data_sinfo->lock);
3149} 3173}
3150 3174
@@ -3165,6 +3189,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3165 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3189 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3166 int force) 3190 int force)
3167{ 3191{
3192 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3168 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3193 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3169 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3194 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3170 u64 thresh; 3195 u64 thresh;
@@ -3173,11 +3198,18 @@ static int should_alloc_chunk(struct btrfs_root *root,
3173 return 1; 3198 return 1;
3174 3199
3175 /* 3200 /*
3201 * We need to take into account the global rsv because for all intents
3202 * and purposes it's used space. Don't worry about locking the
3203 * global_rsv, it doesn't change except when the transaction commits.
3204 */
3205 num_allocated += global_rsv->size;
3206
3207 /*
3176 * in limited mode, we want to have some free space up to 3208 * in limited mode, we want to have some free space up to
3177 * about 1% of the FS size. 3209 * about 1% of the FS size.
3178 */ 3210 */
3179 if (force == CHUNK_ALLOC_LIMITED) { 3211 if (force == CHUNK_ALLOC_LIMITED) {
3180 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3212 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3181 thresh = max_t(u64, 64 * 1024 * 1024, 3213 thresh = max_t(u64, 64 * 1024 * 1024,
3182 div_factor_fine(thresh, 1)); 3214 div_factor_fine(thresh, 1));
3183 3215
@@ -3199,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3199 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) 3231 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3200 return 0; 3232 return 0;
3201 3233
3202 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3234 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3203 3235
3204 /* 256MB or 5% of the FS */ 3236 /* 256MB or 5% of the FS */
3205 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3237 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
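
With the hunks above, should_alloc_chunk() counts the global block reserve as allocated space and keeps two free-space targets: in CHUNK_ALLOC_LIMITED mode roughly 1% of the filesystem but at least 64MB, and on the normal path roughly 5% but at least 256MB. A self-contained user-space model of that threshold arithmetic; div_factor_fine() is re-implemented here as num * factor / 100, which is an assumption about the kernel helper rather than a copy of it.

#include <stdio.h>
#include <stdint.h>

/* assumed to mirror the kernel's div_factor_fine(): num * factor / 100 */
static uint64_t div_factor_fine(uint64_t num, int factor)
{
	return num * factor / 100;
}

static uint64_t max_u64(uint64_t a, uint64_t b)
{
	return a > b ? a : b;
}

int main(void)
{
	uint64_t total_bytes = 500ULL << 30;	/* pretend 500GB filesystem */

	/* CHUNK_ALLOC_LIMITED: at least 64MB, or about 1% of the fs */
	uint64_t limited = max_u64(64ULL << 20,
				   div_factor_fine(total_bytes, 1));

	/* normal path: at least 256MB, or about 5% of the fs */
	uint64_t normal = max_u64(256ULL << 20,
				  div_factor_fine(total_bytes, 5));

	printf("limited-mode threshold: %llu MB\n",
	       (unsigned long long)(limited >> 20));
	printf("normal threshold:       %llu MB\n",
	       (unsigned long long)(normal >> 20));
	return 0;
}
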
@@ -3302,24 +3334,26 @@ out:
3302/* 3334/*
3303 * shrink metadata reservation for delalloc 3335 * shrink metadata reservation for delalloc
3304 */ 3336 */
3305static int shrink_delalloc(struct btrfs_trans_handle *trans, 3337static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3306 struct btrfs_root *root, u64 to_reclaim, int sync) 3338 bool wait_ordered)
3307{ 3339{
3308 struct btrfs_block_rsv *block_rsv; 3340 struct btrfs_block_rsv *block_rsv;
3309 struct btrfs_space_info *space_info; 3341 struct btrfs_space_info *space_info;
3342 struct btrfs_trans_handle *trans;
3310 u64 reserved; 3343 u64 reserved;
3311 u64 max_reclaim; 3344 u64 max_reclaim;
3312 u64 reclaimed = 0; 3345 u64 reclaimed = 0;
3313 long time_left; 3346 long time_left;
3314 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3347 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3315 int loops = 0; 3348 int loops = 0;
3316 unsigned long progress; 3349 unsigned long progress;
3317 3350
3351 trans = (struct btrfs_trans_handle *)current->journal_info;
3318 block_rsv = &root->fs_info->delalloc_block_rsv; 3352 block_rsv = &root->fs_info->delalloc_block_rsv;
3319 space_info = block_rsv->space_info; 3353 space_info = block_rsv->space_info;
3320 3354
3321 smp_mb(); 3355 smp_mb();
3322 reserved = space_info->bytes_reserved; 3356 reserved = space_info->bytes_may_use;
3323 progress = space_info->reservation_progress; 3357 progress = space_info->reservation_progress;
3324 3358
3325 if (reserved == 0) 3359 if (reserved == 0)
@@ -3334,18 +3368,20 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3334 } 3368 }
3335 3369
3336 max_reclaim = min(reserved, to_reclaim); 3370 max_reclaim = min(reserved, to_reclaim);
3337 3371 nr_pages = max_t(unsigned long, nr_pages,
3372 max_reclaim >> PAGE_CACHE_SHIFT);
3338 while (loops < 1024) { 3373 while (loops < 1024) {
3339 /* have the flusher threads jump in and do some IO */ 3374 /* have the flusher threads jump in and do some IO */
3340 smp_mb(); 3375 smp_mb();
3341 nr_pages = min_t(unsigned long, nr_pages, 3376 nr_pages = min_t(unsigned long, nr_pages,
3342 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); 3377 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3343 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3378 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3379 WB_REASON_FS_FREE_SPACE);
3344 3380
3345 spin_lock(&space_info->lock); 3381 spin_lock(&space_info->lock);
3346 if (reserved > space_info->bytes_reserved) 3382 if (reserved > space_info->bytes_may_use)
3347 reclaimed += reserved - space_info->bytes_reserved; 3383 reclaimed += reserved - space_info->bytes_may_use;
3348 reserved = space_info->bytes_reserved; 3384 reserved = space_info->bytes_may_use;
3349 spin_unlock(&space_info->lock); 3385 spin_unlock(&space_info->lock);
3350 3386
3351 loops++; 3387 loops++;
@@ -3356,11 +3392,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 if (trans && trans->transaction->blocked) 3392 if (trans && trans->transaction->blocked)
3357 return -EAGAIN; 3393 return -EAGAIN;
3358 3394
3359 time_left = schedule_timeout_interruptible(1); 3395 if (wait_ordered && !trans) {
3396 btrfs_wait_ordered_extents(root, 0, 0);
3397 } else {
3398 time_left = schedule_timeout_interruptible(1);
3360 3399
3361 /* We were interrupted, exit */ 3400 /* We were interrupted, exit */
3362 if (time_left) 3401 if (time_left)
3363 break; 3402 break;
3403 }
3364 3404
3365 /* we've kicked the IO a few times, if anything has been freed, 3405 /* we've kicked the IO a few times, if anything has been freed,
3366 * exit. There is no sense in looping here for a long time 3406 * exit. There is no sense in looping here for a long time
@@ -3375,34 +3415,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3375 } 3415 }
3376 3416
3377 } 3417 }
3378 if (reclaimed >= to_reclaim && !trans) 3418
3379 btrfs_wait_ordered_extents(root, 0, 0);
3380 return reclaimed >= to_reclaim; 3419 return reclaimed >= to_reclaim;
3381} 3420}
3382 3421
3383/* 3422/**
3384 * Retries tells us how many times we've called reserve_metadata_bytes. The 3423 * may_commit_transaction - possibly commit the transaction if it's ok to
3385 * idea is if this is the first call (retries == 0) then we will add to our 3424 * @root - the root we're allocating for
3386 * reserved count if we can't make the allocation in order to hold our place 3425 * @bytes - the number of bytes we want to reserve
3387 * while we go and try and free up space. That way for retries > 1 we don't try 3426 * @force - force the commit
3388 * and add space, we just check to see if the amount of unused space is >= the
3389 * total space, meaning that our reservation is valid.
3390 * 3427 *
3391 * However if we don't intend to retry this reservation, pass -1 as retries so 3428 * This will check to make sure that committing the transaction will actually
3392 * that it short circuits this logic. 3429 * get us somewhere and then commit the transaction if it does. Otherwise it
3430 * will return -ENOSPC.
3393 */ 3431 */
3394static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, 3432static int may_commit_transaction(struct btrfs_root *root,
3395 struct btrfs_root *root, 3433 struct btrfs_space_info *space_info,
3434 u64 bytes, int force)
3435{
3436 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3437 struct btrfs_trans_handle *trans;
3438
3439 trans = (struct btrfs_trans_handle *)current->journal_info;
3440 if (trans)
3441 return -EAGAIN;
3442
3443 if (force)
3444 goto commit;
3445
3446 /* See if there is enough pinned space to make this reservation */
3447 spin_lock(&space_info->lock);
3448 if (space_info->bytes_pinned >= bytes) {
3449 spin_unlock(&space_info->lock);
3450 goto commit;
3451 }
3452 spin_unlock(&space_info->lock);
3453
3454 /*
3455 * See if there is some space in the delayed insertion reservation for
3456 * this reservation.
3457 */
3458 if (space_info != delayed_rsv->space_info)
3459 return -ENOSPC;
3460
3461 spin_lock(&delayed_rsv->lock);
3462 if (delayed_rsv->size < bytes) {
3463 spin_unlock(&delayed_rsv->lock);
3464 return -ENOSPC;
3465 }
3466 spin_unlock(&delayed_rsv->lock);
3467
3468commit:
3469 trans = btrfs_join_transaction(root);
3470 if (IS_ERR(trans))
3471 return -ENOSPC;
3472
3473 return btrfs_commit_transaction(trans, root);
3474}
3475
3476/**
3477 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3478 * @root - the root we're allocating for
3479 * @block_rsv - the block_rsv we're allocating for
3480 * @orig_bytes - the number of bytes we want
3481 * @flush - whether or not we can flush to make our reservation
3482 *
3483 * This will reserve orig_bytes number of bytes from the space info associated
3484 * with the block_rsv. If there is not enough space it will make an attempt to
3485 * flush out space to make room. It will do this by flushing delalloc if
3486 * possible or committing the transaction. If flush is 0 then no attempts to
3487 * regain reservations will be made and this will fail if there is not enough
3488 * space already.
3489 */
3490static int reserve_metadata_bytes(struct btrfs_root *root,
3396 struct btrfs_block_rsv *block_rsv, 3491 struct btrfs_block_rsv *block_rsv,
3397 u64 orig_bytes, int flush) 3492 u64 orig_bytes, int flush)
3398{ 3493{
3399 struct btrfs_space_info *space_info = block_rsv->space_info; 3494 struct btrfs_space_info *space_info = block_rsv->space_info;
3400 u64 unused; 3495 u64 used;
3401 u64 num_bytes = orig_bytes; 3496 u64 num_bytes = orig_bytes;
3402 int retries = 0; 3497 int retries = 0;
3403 int ret = 0; 3498 int ret = 0;
3404 bool committed = false; 3499 bool committed = false;
3405 bool flushing = false; 3500 bool flushing = false;
3501 bool wait_ordered = false;
3406 3502
3407again: 3503again:
3408 ret = 0; 3504 ret = 0;
@@ -3419,7 +3515,7 @@ again:
3419 * deadlock since we are waiting for the flusher to finish, but 3515 * deadlock since we are waiting for the flusher to finish, but
3420 * hold the current transaction open. 3516 * hold the current transaction open.
3421 */ 3517 */
3422 if (trans) 3518 if (current->journal_info)
3423 return -EAGAIN; 3519 return -EAGAIN;
3424 ret = wait_event_interruptible(space_info->wait, 3520 ret = wait_event_interruptible(space_info->wait,
3425 !space_info->flush); 3521 !space_info->flush);
@@ -3431,9 +3527,9 @@ again:
3431 } 3527 }
3432 3528
3433 ret = -ENOSPC; 3529 ret = -ENOSPC;
3434 unused = space_info->bytes_used + space_info->bytes_reserved + 3530 used = space_info->bytes_used + space_info->bytes_reserved +
3435 space_info->bytes_pinned + space_info->bytes_readonly + 3531 space_info->bytes_pinned + space_info->bytes_readonly +
3436 space_info->bytes_may_use; 3532 space_info->bytes_may_use;
3437 3533
3438 /* 3534 /*
3439 * The idea here is that we've not already over-reserved the block group 3535 * The idea here is that we've not already over-reserved the block group
@@ -3442,10 +3538,9 @@ again:
3442 * lets start flushing stuff first and then come back and try to make 3538 * lets start flushing stuff first and then come back and try to make
3443 * our reservation. 3539 * our reservation.
3444 */ 3540 */
3445 if (unused <= space_info->total_bytes) { 3541 if (used <= space_info->total_bytes) {
3446 unused = space_info->total_bytes - unused; 3542 if (used + orig_bytes <= space_info->total_bytes) {
3447 if (unused >= num_bytes) { 3543 space_info->bytes_may_use += orig_bytes;
3448 space_info->bytes_reserved += orig_bytes;
3449 ret = 0; 3544 ret = 0;
3450 } else { 3545 } else {
3451 /* 3546 /*
@@ -3461,10 +3556,64 @@ again:
3461 * amount plus the amount of bytes that we need for this 3556 * amount plus the amount of bytes that we need for this
3462 * reservation. 3557 * reservation.
3463 */ 3558 */
3464 num_bytes = unused - space_info->total_bytes + 3559 wait_ordered = true;
3560 num_bytes = used - space_info->total_bytes +
3465 (orig_bytes * (retries + 1)); 3561 (orig_bytes * (retries + 1));
3466 } 3562 }
3467 3563
3564 if (ret) {
3565 u64 profile = btrfs_get_alloc_profile(root, 0);
3566 u64 avail;
3567
3568 /*
3569 * If we have a lot of space that's pinned, don't bother doing
3570 * the overcommit dance yet and just commit the transaction.
3571 */
3572 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3573 do_div(avail, 10);
3574 if (space_info->bytes_pinned >= avail && flush && !committed) {
3575 space_info->flush = 1;
3576 flushing = true;
3577 spin_unlock(&space_info->lock);
3578 ret = may_commit_transaction(root, space_info,
3579 orig_bytes, 1);
3580 if (ret)
3581 goto out;
3582 committed = true;
3583 goto again;
3584 }
3585
3586 spin_lock(&root->fs_info->free_chunk_lock);
3587 avail = root->fs_info->free_chunk_space;
3588
3589 /*
3590 * If we have dup, raid1 or raid10 then only half of the free
3591 * space is actually useable.
3592 */
3593 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3594 BTRFS_BLOCK_GROUP_RAID1 |
3595 BTRFS_BLOCK_GROUP_RAID10))
3596 avail >>= 1;
3597
3598 /*
3599 * If we aren't flushing don't let us overcommit too much, say
3600 * 1/8th of the space. If we can flush, let it overcommit up to
3601 * 1/2 of the space.
3602 */
3603 if (flush)
3604 avail >>= 3;
3605 else
3606 avail >>= 1;
3607 spin_unlock(&root->fs_info->free_chunk_lock);
3608
3609 if (used + num_bytes < space_info->total_bytes + avail) {
3610 space_info->bytes_may_use += orig_bytes;
3611 ret = 0;
3612 } else {
3613 wait_ordered = true;
3614 }
3615 }
3616
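The added block above sizes the overcommit allowance from the unallocated chunk space: mirrored profiles (DUP/RAID1/RAID10) only get to count half of it, and the result is then shifted again depending on the flush flag, exactly as in the hunk. A small standalone model of that arithmetic; the profile flags here are made-up stand-ins, not the real BTRFS_BLOCK_GROUP_* values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MODEL_DUP    (1ULL << 0)    /* stand-ins for BTRFS_BLOCK_GROUP_DUP etc. */
#define MODEL_RAID1  (1ULL << 1)
#define MODEL_RAID10 (1ULL << 2)

/* How much unallocated chunk space may back a metadata overcommit. */
uint64_t overcommit_allowance(uint64_t free_chunk_space, uint64_t profile, bool flush)
{
        uint64_t avail = free_chunk_space;

        /* mirrored profiles store every byte twice, so only half is usable */
        if (profile & (MODEL_DUP | MODEL_RAID1 | MODEL_RAID10))
                avail >>= 1;

        /* further cap, keyed on the flush flag as in the hunk above */
        if (flush)
                avail >>= 3;
        else
                avail >>= 1;
        return avail;
}

int main(void)
{
        uint64_t a = overcommit_allowance(8ULL << 30, MODEL_RAID1, true);
        printf("%llu bytes of overcommit headroom\n", (unsigned long long)a);
        return 0;
}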
3468 /* 3617 /*
3469 * Couldn't make our reservation, save our place so while we're trying 3618 * Couldn't make our reservation, save our place so while we're trying
3470 * to reclaim space we can actually use it instead of somebody else 3619 * to reclaim space we can actually use it instead of somebody else
@@ -3484,7 +3633,7 @@ again:
3484 * We do synchronous shrinking since we don't actually unreserve 3633 * We do synchronous shrinking since we don't actually unreserve
3485 * metadata until after the IO is completed. 3634 * metadata until after the IO is completed.
3486 */ 3635 */
3487 ret = shrink_delalloc(trans, root, num_bytes, 1); 3636 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3488 if (ret < 0) 3637 if (ret < 0)
3489 goto out; 3638 goto out;
3490 3639
@@ -3496,35 +3645,17 @@ again:
3496 * so go back around and try again. 3645 * so go back around and try again.
3497 */ 3646 */
3498 if (retries < 2) { 3647 if (retries < 2) {
3648 wait_ordered = true;
3499 retries++; 3649 retries++;
3500 goto again; 3650 goto again;
3501 } 3651 }
3502 3652
3503 /*
3504 * Not enough space to be reclaimed, don't bother committing the
3505 * transaction.
3506 */
3507 spin_lock(&space_info->lock);
3508 if (space_info->bytes_pinned < orig_bytes)
3509 ret = -ENOSPC;
3510 spin_unlock(&space_info->lock);
3511 if (ret)
3512 goto out;
3513
3514 ret = -EAGAIN;
3515 if (trans)
3516 goto out;
3517
3518 ret = -ENOSPC; 3653 ret = -ENOSPC;
3519 if (committed) 3654 if (committed)
3520 goto out; 3655 goto out;
3521 3656
3522 trans = btrfs_join_transaction(root); 3657 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3523 if (IS_ERR(trans))
3524 goto out;
3525 ret = btrfs_commit_transaction(trans, root);
3526 if (!ret) { 3658 if (!ret) {
3527 trans = NULL;
3528 committed = true; 3659 committed = true;
3529 goto again; 3660 goto again;
3530 } 3661 }
@@ -3542,10 +3673,12 @@ out:
3542static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, 3673static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3543 struct btrfs_root *root) 3674 struct btrfs_root *root)
3544{ 3675{
3545 struct btrfs_block_rsv *block_rsv; 3676 struct btrfs_block_rsv *block_rsv = NULL;
3546 if (root->ref_cows) 3677
3678 if (root->ref_cows || root == root->fs_info->csum_root)
3547 block_rsv = trans->block_rsv; 3679 block_rsv = trans->block_rsv;
3548 else 3680
3681 if (!block_rsv)
3549 block_rsv = root->block_rsv; 3682 block_rsv = root->block_rsv;
3550 3683
3551 if (!block_rsv) 3684 if (!block_rsv)
@@ -3616,7 +3749,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3616 } 3749 }
3617 if (num_bytes) { 3750 if (num_bytes) {
3618 spin_lock(&space_info->lock); 3751 spin_lock(&space_info->lock);
3619 space_info->bytes_reserved -= num_bytes; 3752 space_info->bytes_may_use -= num_bytes;
3620 space_info->reservation_progress++; 3753 space_info->reservation_progress++;
3621 spin_unlock(&space_info->lock); 3754 spin_unlock(&space_info->lock);
3622 } 3755 }
@@ -3640,9 +3773,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3640{ 3773{
3641 memset(rsv, 0, sizeof(*rsv)); 3774 memset(rsv, 0, sizeof(*rsv));
3642 spin_lock_init(&rsv->lock); 3775 spin_lock_init(&rsv->lock);
3643 atomic_set(&rsv->usage, 1);
3644 rsv->priority = 6;
3645 INIT_LIST_HEAD(&rsv->list);
3646} 3776}
3647 3777
3648struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 3778struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3663,38 +3793,38 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3663void btrfs_free_block_rsv(struct btrfs_root *root, 3793void btrfs_free_block_rsv(struct btrfs_root *root,
3664 struct btrfs_block_rsv *rsv) 3794 struct btrfs_block_rsv *rsv)
3665{ 3795{
3666 if (rsv && atomic_dec_and_test(&rsv->usage)) { 3796 btrfs_block_rsv_release(root, rsv, (u64)-1);
3667 btrfs_block_rsv_release(root, rsv, (u64)-1); 3797 kfree(rsv);
3668 if (!rsv->durable)
3669 kfree(rsv);
3670 }
3671} 3798}
3672 3799
3673/* 3800int btrfs_block_rsv_add(struct btrfs_root *root,
3674 * make the block_rsv struct be able to capture freed space. 3801 struct btrfs_block_rsv *block_rsv,
3675 * the captured space will re-add to the the block_rsv struct 3802 u64 num_bytes)
3676 * after transaction commit
3677 */
3678void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3679 struct btrfs_block_rsv *block_rsv)
3680{ 3803{
3681 block_rsv->durable = 1; 3804 int ret;
3682 mutex_lock(&fs_info->durable_block_rsv_mutex); 3805
3683 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); 3806 if (num_bytes == 0)
3684 mutex_unlock(&fs_info->durable_block_rsv_mutex); 3807 return 0;
3808
3809 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
3810 if (!ret) {
3811 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3812 return 0;
3813 }
3814
3815 return ret;
3685} 3816}
3686 3817
3687int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3818int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3688 struct btrfs_root *root, 3819 struct btrfs_block_rsv *block_rsv,
3689 struct btrfs_block_rsv *block_rsv, 3820 u64 num_bytes)
3690 u64 num_bytes)
3691{ 3821{
3692 int ret; 3822 int ret;
3693 3823
3694 if (num_bytes == 0) 3824 if (num_bytes == 0)
3695 return 0; 3825 return 0;
3696 3826
3697 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); 3827 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
3698 if (!ret) { 3828 if (!ret) {
3699 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3829 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3700 return 0; 3830 return 0;
@@ -3703,55 +3833,52 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3703 return ret; 3833 return ret;
3704} 3834}
3705 3835
3706int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 3836int btrfs_block_rsv_check(struct btrfs_root *root,
3707 struct btrfs_root *root, 3837 struct btrfs_block_rsv *block_rsv, int min_factor)
3708 struct btrfs_block_rsv *block_rsv,
3709 u64 min_reserved, int min_factor)
3710{ 3838{
3711 u64 num_bytes = 0; 3839 u64 num_bytes = 0;
3712 int commit_trans = 0;
3713 int ret = -ENOSPC; 3840 int ret = -ENOSPC;
3714 3841
3715 if (!block_rsv) 3842 if (!block_rsv)
3716 return 0; 3843 return 0;
3717 3844
3718 spin_lock(&block_rsv->lock); 3845 spin_lock(&block_rsv->lock);
3719 if (min_factor > 0) 3846 num_bytes = div_factor(block_rsv->size, min_factor);
3720 num_bytes = div_factor(block_rsv->size, min_factor); 3847 if (block_rsv->reserved >= num_bytes)
3721 if (min_reserved > num_bytes) 3848 ret = 0;
3722 num_bytes = min_reserved; 3849 spin_unlock(&block_rsv->lock);
3723 3850
3724 if (block_rsv->reserved >= num_bytes) { 3851 return ret;
3852}
3853
3854int btrfs_block_rsv_refill(struct btrfs_root *root,
3855 struct btrfs_block_rsv *block_rsv,
3856 u64 min_reserved)
3857{
3858 u64 num_bytes = 0;
3859 int ret = -ENOSPC;
3860
3861 if (!block_rsv)
3862 return 0;
3863
3864 spin_lock(&block_rsv->lock);
3865 num_bytes = min_reserved;
3866 if (block_rsv->reserved >= num_bytes)
3725 ret = 0; 3867 ret = 0;
3726 } else { 3868 else
3727 num_bytes -= block_rsv->reserved; 3869 num_bytes -= block_rsv->reserved;
3728 if (block_rsv->durable &&
3729 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3730 commit_trans = 1;
3731 }
3732 spin_unlock(&block_rsv->lock); 3870 spin_unlock(&block_rsv->lock);
3871
3733 if (!ret) 3872 if (!ret)
3734 return 0; 3873 return 0;
3735 3874
3736 if (block_rsv->refill_used) { 3875 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
3737 ret = reserve_metadata_bytes(trans, root, block_rsv, 3876 if (!ret) {
3738 num_bytes, 0); 3877 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3739 if (!ret) {
3740 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3741 return 0;
3742 }
3743 }
3744
3745 if (commit_trans) {
3746 if (trans)
3747 return -EAGAIN;
3748 trans = btrfs_join_transaction(root);
3749 BUG_ON(IS_ERR(trans));
3750 ret = btrfs_commit_transaction(trans, root);
3751 return 0; 3878 return 0;
3752 } 3879 }
3753 3880
3754 return -ENOSPC; 3881 return ret;
3755} 3882}
3756 3883
3757int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3884int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
@@ -3783,7 +3910,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3783 u64 num_bytes; 3910 u64 num_bytes;
3784 u64 meta_used; 3911 u64 meta_used;
3785 u64 data_used; 3912 u64 data_used;
3786 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3913 int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3787 3914
3788 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3915 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3789 spin_lock(&sinfo->lock); 3916 spin_lock(&sinfo->lock);
@@ -3827,12 +3954,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3827 if (sinfo->total_bytes > num_bytes) { 3954 if (sinfo->total_bytes > num_bytes) {
3828 num_bytes = sinfo->total_bytes - num_bytes; 3955 num_bytes = sinfo->total_bytes - num_bytes;
3829 block_rsv->reserved += num_bytes; 3956 block_rsv->reserved += num_bytes;
3830 sinfo->bytes_reserved += num_bytes; 3957 sinfo->bytes_may_use += num_bytes;
3831 } 3958 }
3832 3959
3833 if (block_rsv->reserved >= block_rsv->size) { 3960 if (block_rsv->reserved >= block_rsv->size) {
3834 num_bytes = block_rsv->reserved - block_rsv->size; 3961 num_bytes = block_rsv->reserved - block_rsv->size;
3835 sinfo->bytes_reserved -= num_bytes; 3962 sinfo->bytes_may_use -= num_bytes;
3836 sinfo->reservation_progress++; 3963 sinfo->reservation_progress++;
3837 block_rsv->reserved = block_rsv->size; 3964 block_rsv->reserved = block_rsv->size;
3838 block_rsv->full = 1; 3965 block_rsv->full = 1;
@@ -3848,16 +3975,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3848 3975
3849 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3976 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3850 fs_info->chunk_block_rsv.space_info = space_info; 3977 fs_info->chunk_block_rsv.space_info = space_info;
3851 fs_info->chunk_block_rsv.priority = 10;
3852 3978
3853 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 3979 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3854 fs_info->global_block_rsv.space_info = space_info; 3980 fs_info->global_block_rsv.space_info = space_info;
3855 fs_info->global_block_rsv.priority = 10;
3856 fs_info->global_block_rsv.refill_used = 1;
3857 fs_info->delalloc_block_rsv.space_info = space_info; 3981 fs_info->delalloc_block_rsv.space_info = space_info;
3858 fs_info->trans_block_rsv.space_info = space_info; 3982 fs_info->trans_block_rsv.space_info = space_info;
3859 fs_info->empty_block_rsv.space_info = space_info; 3983 fs_info->empty_block_rsv.space_info = space_info;
3860 fs_info->empty_block_rsv.priority = 10; 3984 fs_info->delayed_block_rsv.space_info = space_info;
3861 3985
3862 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 3986 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3863 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 3987 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3865,10 +3989,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3865 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 3989 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3866 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 3990 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3867 3991
3868 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3869
3870 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3871
3872 update_global_block_rsv(fs_info); 3992 update_global_block_rsv(fs_info);
3873} 3993}
3874 3994
@@ -3881,37 +4001,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3881 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4001 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3882 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4002 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3883 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4003 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3884} 4004 WARN_ON(fs_info->delayed_block_rsv.size > 0);
3885 4005 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
3886int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3887 struct btrfs_root *root,
3888 struct btrfs_block_rsv *rsv)
3889{
3890 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3891 u64 num_bytes;
3892 int ret;
3893
3894 /*
3895 * Truncate should be freeing data, but give us 2 items just in case it
3896 * needs to use some space. We may want to be smarter about this in the
3897 * future.
3898 */
3899 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3900
3901 /* We already have enough bytes, just return */
3902 if (rsv->reserved >= num_bytes)
3903 return 0;
3904
3905 num_bytes -= rsv->reserved;
3906
3907 /*
3908 * You should have reserved enough space before hand to do this, so this
3909 * should not fail.
3910 */
3911 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3912 BUG_ON(ret);
3913
3914 return 0;
3915} 4006}
3916 4007
3917void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4008void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3920,9 +4011,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3920 if (!trans->bytes_reserved) 4011 if (!trans->bytes_reserved)
3921 return; 4012 return;
3922 4013
3923 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); 4014 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
3924 btrfs_block_rsv_release(root, trans->block_rsv,
3925 trans->bytes_reserved);
3926 trans->bytes_reserved = 0; 4015 trans->bytes_reserved = 0;
3927} 4016}
3928 4017
@@ -3964,11 +4053,19 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3964 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4053 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3965} 4054}
3966 4055
4056/**
4057 * drop_outstanding_extent - drop an outstanding extent
4058 * @inode: the inode we're dropping the extent for
4059 *
4060 * This is called when we are freeing up an outstanding extent, either
4061 * after an error or after an extent is written. This will return the number of
4062 * reserved extents that need to be freed. This must be called with
4063 * BTRFS_I(inode)->lock held.
4064 */
3967static unsigned drop_outstanding_extent(struct inode *inode) 4065static unsigned drop_outstanding_extent(struct inode *inode)
3968{ 4066{
3969 unsigned dropped_extents = 0; 4067 unsigned dropped_extents = 0;
3970 4068
3971 spin_lock(&BTRFS_I(inode)->lock);
3972 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4069 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3973 BTRFS_I(inode)->outstanding_extents--; 4070 BTRFS_I(inode)->outstanding_extents--;
3974 4071
@@ -3978,19 +4075,70 @@ static unsigned drop_outstanding_extent(struct inode *inode)
3978 */ 4075 */
3979 if (BTRFS_I(inode)->outstanding_extents >= 4076 if (BTRFS_I(inode)->outstanding_extents >=
3980 BTRFS_I(inode)->reserved_extents) 4077 BTRFS_I(inode)->reserved_extents)
3981 goto out; 4078 return 0;
3982 4079
3983 dropped_extents = BTRFS_I(inode)->reserved_extents - 4080 dropped_extents = BTRFS_I(inode)->reserved_extents -
3984 BTRFS_I(inode)->outstanding_extents; 4081 BTRFS_I(inode)->outstanding_extents;
3985 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4082 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3986out:
3987 spin_unlock(&BTRFS_I(inode)->lock);
3988 return dropped_extents; 4083 return dropped_extents;
3989} 4084}
3990 4085
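The bookkeeping above pairs outstanding extents (writes still in flight) with reserved extents (metadata reservations backing them); once an extent completes, any reservations no longer backed by outstanding extents can be handed back. A toy model of the same counter math:

#include <stdio.h>

/*
 * After one outstanding extent completes, release any reserved extents
 * that are no longer backed by outstanding ones.  Returns how many
 * reservations can be dropped; the caller frees the matching metadata.
 */
unsigned drop_one_outstanding(unsigned *outstanding, unsigned *reserved)
{
        unsigned dropped = 0;

        (*outstanding)--;
        if (*reserved > *outstanding) {
                dropped = *reserved - *outstanding;
                *reserved -= dropped;
        }
        return dropped;
}

int main(void)
{
        unsigned outstanding = 3, reserved = 5;
        printf("dropped %u\n", drop_one_outstanding(&outstanding, &reserved));
        return 0;
}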
3991static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 4086/**
4087 * calc_csum_metadata_size - return the amount of metadata space that must be
4088 * reserved/freed for the given bytes.
4089 * @inode: the inode we're manipulating
4090 * @num_bytes: the number of bytes in question
4091 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4092 *
4093 * This adjusts the number of csum_bytes in the inode and then returns the
4094 * correct amount of metadata that must either be reserved or freed. We
4095 * calculate how many checksums we can fit into one leaf and then divide the
4096 * number of bytes that will need to be checksummed by this value to figure out
4097 * how many checksums will be required. If we are adding bytes then the number
4098 * may go up and we will return the number of additional bytes that must be
4099 * reserved. If it is going down we will return the number of bytes that must
4100 * be freed.
4101 *
4102 * This must be called with BTRFS_I(inode)->lock held.
4103 */
4104static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4105 int reserve)
3992{ 4106{
3993 return num_bytes >>= 3; 4107 struct btrfs_root *root = BTRFS_I(inode)->root;
4108 u64 csum_size;
4109 int num_csums_per_leaf;
4110 int num_csums;
4111 int old_csums;
4112
4113 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4114 BTRFS_I(inode)->csum_bytes == 0)
4115 return 0;
4116
4117 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4118 if (reserve)
4119 BTRFS_I(inode)->csum_bytes += num_bytes;
4120 else
4121 BTRFS_I(inode)->csum_bytes -= num_bytes;
4122 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4123 num_csums_per_leaf = (int)div64_u64(csum_size,
4124 sizeof(struct btrfs_csum_item) +
4125 sizeof(struct btrfs_disk_key));
4126 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4127 num_csums = num_csums + num_csums_per_leaf - 1;
4128 num_csums = num_csums / num_csums_per_leaf;
4129
4130 old_csums = old_csums + num_csums_per_leaf - 1;
4131 old_csums = old_csums / num_csums_per_leaf;
4132
4133 /* No change, no need to reserve more */
4134 if (old_csums == num_csums)
4135 return 0;
4136
4137 if (reserve)
4138 return btrfs_calc_trans_metadata_size(root,
4139 num_csums - old_csums);
4140
4141 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
3994} 4142}
3995 4143
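calc_csum_metadata_size() boils down to ceiling division: checksums are one per sector, a fixed number of them fit in a leaf, and only a change in the number of leaves changes the reservation. A simplified standalone version of that arithmetic (the csums-per-leaf and per-leaf cost figures here are illustrative parameters, not the kernel's actual constants):

#include <stdint.h>
#include <stdio.h>

/* ceil(a / b) for positive b */
uint64_t div_ceil(uint64_t a, uint64_t b) { return (a + b - 1) / b; }

/*
 * How many metadata bytes must be reserved (when adding csum_bytes) or
 * may be freed (when releasing) once the number of checksum leaves
 * changes.
 */
uint64_t csum_reservation_delta(uint64_t old_csum_bytes, uint64_t new_csum_bytes,
                                uint64_t sectorsize, uint64_t csums_per_leaf,
                                uint64_t bytes_per_leaf)
{
        uint64_t old_leaves = div_ceil(old_csum_bytes / sectorsize, csums_per_leaf);
        uint64_t new_leaves = div_ceil(new_csum_bytes / sectorsize, csums_per_leaf);

        if (old_leaves == new_leaves)
                return 0;               /* same number of leaves, nothing to do */
        if (new_leaves > old_leaves)
                return (new_leaves - old_leaves) * bytes_per_leaf;
        return (old_leaves - new_leaves) * bytes_per_leaf;
}

int main(void)
{
        /* e.g. 1MiB of new data on 4KiB sectors, ~500 csums per leaf (made up) */
        printf("%llu\n", (unsigned long long)
               csum_reservation_delta(0, 1 << 20, 4096, 500, 16384));
        return 0;
}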
3996int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4144int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
@@ -3999,9 +4147,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3999 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4147 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4000 u64 to_reserve = 0; 4148 u64 to_reserve = 0;
4001 unsigned nr_extents = 0; 4149 unsigned nr_extents = 0;
4150 int flush = 1;
4002 int ret; 4151 int ret;
4003 4152
4004 if (btrfs_transaction_in_commit(root->fs_info)) 4153 if (btrfs_is_free_space_inode(root, inode))
4154 flush = 0;
4155
4156 if (flush && btrfs_transaction_in_commit(root->fs_info))
4005 schedule_timeout(1); 4157 schedule_timeout(1);
4006 4158
4007 num_bytes = ALIGN(num_bytes, root->sectorsize); 4159 num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4017,18 +4169,29 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4017 4169
4018 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4170 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4019 } 4171 }
4172 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4020 spin_unlock(&BTRFS_I(inode)->lock); 4173 spin_unlock(&BTRFS_I(inode)->lock);
4021 4174
4022 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4175 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4023 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4024 if (ret) { 4176 if (ret) {
4177 u64 to_free = 0;
4025 unsigned dropped; 4178 unsigned dropped;
4179
4180 spin_lock(&BTRFS_I(inode)->lock);
4181 dropped = drop_outstanding_extent(inode);
4182 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4183 spin_unlock(&BTRFS_I(inode)->lock);
4184 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4185
4026 /* 4186 /*
4027 * We don't need the return value since our reservation failed, 4187 * Somebody could have come in and twiddled with the
4028 * we just need to clean up our counter. 4188 * reservation, so if we have to free more than we would have
4189 * reserved from this reservation go ahead and release those
4190 * bytes.
4029 */ 4191 */
4030 dropped = drop_outstanding_extent(inode); 4192 to_free -= to_reserve;
4031 WARN_ON(dropped > 1); 4193 if (to_free)
4194 btrfs_block_rsv_release(root, block_rsv, to_free);
4032 return ret; 4195 return ret;
4033 } 4196 }
4034 4197
@@ -4037,6 +4200,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4037 return 0; 4200 return 0;
4038} 4201}
4039 4202
4203/**
4204 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4205 * @inode: the inode to release the reservation for
4206 * @num_bytes: the number of bytes we're releasing
4207 *
4208 * This will release the metadata reservation for an inode. This can be called
4209 * once we complete IO for a given set of bytes to release their metadata
4210 * reservations.
4211 */
4040void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4212void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4041{ 4213{
4042 struct btrfs_root *root = BTRFS_I(inode)->root; 4214 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4044,9 +4216,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4044 unsigned dropped; 4216 unsigned dropped;
4045 4217
4046 num_bytes = ALIGN(num_bytes, root->sectorsize); 4218 num_bytes = ALIGN(num_bytes, root->sectorsize);
4219 spin_lock(&BTRFS_I(inode)->lock);
4047 dropped = drop_outstanding_extent(inode); 4220 dropped = drop_outstanding_extent(inode);
4048 4221
4049 to_free = calc_csum_metadata_size(inode, num_bytes); 4222 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4223 spin_unlock(&BTRFS_I(inode)->lock);
4050 if (dropped > 0) 4224 if (dropped > 0)
4051 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4225 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4052 4226
@@ -4054,6 +4228,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4054 to_free); 4228 to_free);
4055} 4229}
4056 4230
4231/**
4232 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4233 * @inode: inode we're writing to
4234 * @num_bytes: the number of bytes we want to allocate
4235 *
4236 * This will do the following things
4237 *
4238 * o reserve space in the data space info for num_bytes
4239 * o reserve space in the metadata space info based on number of outstanding
4240 * extents and how many csums will be needed
4241 * o add to the inode's ->delalloc_bytes
4242 * o add it to the fs_info's delalloc inodes list.
4243 *
4244 * This will return 0 for success and -ENOSPC if there is no space left.
4245 */
4057int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4246int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4058{ 4247{
4059 int ret; 4248 int ret;
@@ -4071,6 +4260,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4071 return 0; 4260 return 0;
4072} 4261}
4073 4262
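The kernel-doc above describes btrfs_delalloc_reserve_space() as a pairing of a data reservation with a metadata reservation. A hedged sketch of that pattern in isolation; the model_* helpers are hypothetical and only show why a failure of the second reservation has to unwind the first so the counters stay balanced:

#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-ins for the data/metadata reservation helpers */
int model_reserve(uint64_t *pool, uint64_t bytes)
{
        if (*pool < bytes)
                return -1;              /* -ENOSPC */
        *pool -= bytes;
        return 0;
}

/* Reserve data then metadata; roll the data part back if metadata fails. */
int model_delalloc_reserve(uint64_t *data_free, uint64_t *meta_free,
                           uint64_t data_bytes, uint64_t meta_bytes)
{
        if (model_reserve(data_free, data_bytes))
                return -1;
        if (model_reserve(meta_free, meta_bytes)) {
                *data_free += data_bytes;       /* undo the data reservation */
                return -1;
        }
        return 0;
}

int main(void)
{
        uint64_t data_free = 1 << 20, meta_free = 1 << 16;
        printf("%d\n", model_delalloc_reserve(&data_free, &meta_free, 4096, 8192));
        return 0;
}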
4263/**
4264 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4265 * @inode: inode we're releasing space for
4266 * @num_bytes: the number of bytes we want to free up
4267 *
4268 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
4269 * called in the case that we don't need the metadata AND data reservations
4270 * anymore, for example if there is an error or we insert an inline extent.
4271 *
4272 * This function will release the metadata space that was not used and will
4273 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4274 * list if there are no delalloc bytes left.
4275 */
4074void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 4276void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4075{ 4277{
4076 btrfs_delalloc_release_metadata(inode, num_bytes); 4278 btrfs_delalloc_release_metadata(inode, num_bytes);
@@ -4090,12 +4292,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4090 4292
4091 /* block accounting for super block */ 4293 /* block accounting for super block */
4092 spin_lock(&info->delalloc_lock); 4294 spin_lock(&info->delalloc_lock);
4093 old_val = btrfs_super_bytes_used(&info->super_copy); 4295 old_val = btrfs_super_bytes_used(info->super_copy);
4094 if (alloc) 4296 if (alloc)
4095 old_val += num_bytes; 4297 old_val += num_bytes;
4096 else 4298 else
4097 old_val -= num_bytes; 4299 old_val -= num_bytes;
4098 btrfs_set_super_bytes_used(&info->super_copy, old_val); 4300 btrfs_set_super_bytes_used(info->super_copy, old_val);
4099 spin_unlock(&info->delalloc_lock); 4301 spin_unlock(&info->delalloc_lock);
4100 4302
4101 while (total) { 4303 while (total) {
@@ -4123,7 +4325,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4123 spin_lock(&cache->space_info->lock); 4325 spin_lock(&cache->space_info->lock);
4124 spin_lock(&cache->lock); 4326 spin_lock(&cache->lock);
4125 4327
4126 if (btrfs_super_cache_generation(&info->super_copy) != 0 && 4328 if (btrfs_test_opt(root, SPACE_CACHE) &&
4127 cache->disk_cache_state < BTRFS_DC_CLEAR) 4329 cache->disk_cache_state < BTRFS_DC_CLEAR)
4128 cache->disk_cache_state = BTRFS_DC_CLEAR; 4330 cache->disk_cache_state = BTRFS_DC_CLEAR;
4129 4331
@@ -4135,7 +4337,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4135 btrfs_set_block_group_used(&cache->item, old_val); 4337 btrfs_set_block_group_used(&cache->item, old_val);
4136 cache->reserved -= num_bytes; 4338 cache->reserved -= num_bytes;
4137 cache->space_info->bytes_reserved -= num_bytes; 4339 cache->space_info->bytes_reserved -= num_bytes;
4138 cache->space_info->reservation_progress++;
4139 cache->space_info->bytes_used += num_bytes; 4340 cache->space_info->bytes_used += num_bytes;
4140 cache->space_info->disk_used += num_bytes * factor; 4341 cache->space_info->disk_used += num_bytes * factor;
4141 spin_unlock(&cache->lock); 4342 spin_unlock(&cache->lock);
@@ -4187,7 +4388,6 @@ static int pin_down_extent(struct btrfs_root *root,
4187 if (reserved) { 4388 if (reserved) {
4188 cache->reserved -= num_bytes; 4389 cache->reserved -= num_bytes;
4189 cache->space_info->bytes_reserved -= num_bytes; 4390 cache->space_info->bytes_reserved -= num_bytes;
4190 cache->space_info->reservation_progress++;
4191 } 4391 }
4192 spin_unlock(&cache->lock); 4392 spin_unlock(&cache->lock);
4193 spin_unlock(&cache->space_info->lock); 4393 spin_unlock(&cache->space_info->lock);
@@ -4215,45 +4415,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
4215} 4415}
4216 4416
4217/* 4417/*
4218 * update size of reserved extents. this function may return -EAGAIN 4418 * this function must be called within transaction
4219 * if 'reserve' is true or 'sinfo' is false. 4419 */
4420int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4421 struct btrfs_root *root,
4422 u64 bytenr, u64 num_bytes)
4423{
4424 struct btrfs_block_group_cache *cache;
4425
4426 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4427 BUG_ON(!cache);
4428
4429 /*
4430 * pull in the free space cache (if any) so that our pin
4431 * removes the free space from the cache. We have load_only set
4432 * to one because the slow code to read in the free extents does check
4433 * the pinned extents.
4434 */
4435 cache_block_group(cache, trans, root, 1);
4436
4437 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4438
4439 /* remove us from the free space cache (if we're there at all) */
4440 btrfs_remove_free_space(cache, bytenr, num_bytes);
4441 btrfs_put_block_group(cache);
4442 return 0;
4443}
4444
4445/**
4446 * btrfs_update_reserved_bytes - update the block_group and space info counters
4447 * @cache: The cache we are manipulating
4448 * @num_bytes: The number of bytes in question
4449 * @reserve: One of the reservation enums
4450 *
4451 * This is called by the allocator when it reserves space, or by somebody who is
4452 * freeing space that was never actually used on disk. For example if you
4453 * reserve some space for a new leaf in transaction A and before transaction A
4454 * commits you free that leaf, you call this with reserve set to 0 in order to
4455 * clear the reservation.
4456 *
4457 * Metadata reservations should use RESERVE_ALLOC so we do the proper
4458 * ENOSPC accounting. For data we handle the reservation through clearing the
4459 * delalloc bits in the io_tree. We have to do this since we could end up
4460 * allocating less disk space for the amount of data we have reserved in the
4461 * case of compression.
4462 *
4463 * If this is a reservation and the block group has become read only we cannot
4464 * make the reservation and return -EAGAIN, otherwise this function always
4465 * succeeds.
4220 */ 4466 */
4221int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 4467static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4222 u64 num_bytes, int reserve, int sinfo) 4468 u64 num_bytes, int reserve)
4223{ 4469{
4470 struct btrfs_space_info *space_info = cache->space_info;
4224 int ret = 0; 4471 int ret = 0;
4225 if (sinfo) { 4472 spin_lock(&space_info->lock);
4226 struct btrfs_space_info *space_info = cache->space_info; 4473 spin_lock(&cache->lock);
4227 spin_lock(&space_info->lock); 4474 if (reserve != RESERVE_FREE) {
4228 spin_lock(&cache->lock);
4229 if (reserve) {
4230 if (cache->ro) {
4231 ret = -EAGAIN;
4232 } else {
4233 cache->reserved += num_bytes;
4234 space_info->bytes_reserved += num_bytes;
4235 }
4236 } else {
4237 if (cache->ro)
4238 space_info->bytes_readonly += num_bytes;
4239 cache->reserved -= num_bytes;
4240 space_info->bytes_reserved -= num_bytes;
4241 space_info->reservation_progress++;
4242 }
4243 spin_unlock(&cache->lock);
4244 spin_unlock(&space_info->lock);
4245 } else {
4246 spin_lock(&cache->lock);
4247 if (cache->ro) { 4475 if (cache->ro) {
4248 ret = -EAGAIN; 4476 ret = -EAGAIN;
4249 } else { 4477 } else {
4250 if (reserve) 4478 cache->reserved += num_bytes;
4251 cache->reserved += num_bytes; 4479 space_info->bytes_reserved += num_bytes;
4252 else 4480 if (reserve == RESERVE_ALLOC) {
4253 cache->reserved -= num_bytes; 4481 BUG_ON(space_info->bytes_may_use < num_bytes);
4482 space_info->bytes_may_use -= num_bytes;
4483 }
4254 } 4484 }
4255 spin_unlock(&cache->lock); 4485 } else {
4486 if (cache->ro)
4487 space_info->bytes_readonly += num_bytes;
4488 cache->reserved -= num_bytes;
4489 space_info->bytes_reserved -= num_bytes;
4490 space_info->reservation_progress++;
4256 } 4491 }
4492 spin_unlock(&cache->lock);
4493 spin_unlock(&space_info->lock);
4257 return ret; 4494 return ret;
4258} 4495}
4259 4496
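btrfs_update_reserved_bytes() now distinguishes three cases via the reservation enum: a normal allocation moves bytes from bytes_may_use to bytes_reserved, an allocation without ENOSPC accounting only bumps bytes_reserved, and a free undoes the hard reservation (crediting bytes_readonly for read-only groups). A simplified userspace model of those counter moves, with stand-in enum values and without the locking:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum reserve_type { MODEL_RESERVE_FREE, MODEL_RESERVE_ALLOC,
                    MODEL_RESERVE_ALLOC_NO_ACCOUNT };   /* stand-ins */

struct sinfo { uint64_t bytes_reserved, bytes_may_use, bytes_readonly; };
struct bgroup { uint64_t reserved; bool ro; struct sinfo *sinfo; };

/* Returns 0 on success, -1 standing in for -EAGAIN on a read-only group. */
int update_reserved(struct bgroup *bg, uint64_t bytes, enum reserve_type how)
{
        struct sinfo *si = bg->sinfo;

        if (how != MODEL_RESERVE_FREE) {
                if (bg->ro)
                        return -1;              /* can't reserve in a ro group */
                bg->reserved += bytes;
                si->bytes_reserved += bytes;
                if (how == MODEL_RESERVE_ALLOC)
                        si->bytes_may_use -= bytes;   /* soft -> hard reservation */
        } else {
                if (bg->ro)
                        si->bytes_readonly += bytes;
                bg->reserved -= bytes;
                si->bytes_reserved -= bytes;
        }
        return 0;
}

int main(void)
{
        struct sinfo si = { 0, 4096, 0 };
        struct bgroup bg = { 0, false, &si };

        update_reserved(&bg, 4096, MODEL_RESERVE_ALLOC);
        printf("reserved=%llu may_use=%llu\n",
               (unsigned long long)si.bytes_reserved,
               (unsigned long long)si.bytes_may_use);
        return 0;
}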
@@ -4319,13 +4556,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4319 spin_lock(&cache->lock); 4556 spin_lock(&cache->lock);
4320 cache->pinned -= len; 4557 cache->pinned -= len;
4321 cache->space_info->bytes_pinned -= len; 4558 cache->space_info->bytes_pinned -= len;
4322 if (cache->ro) { 4559 if (cache->ro)
4323 cache->space_info->bytes_readonly += len; 4560 cache->space_info->bytes_readonly += len;
4324 } else if (cache->reserved_pinned > 0) {
4325 len = min(len, cache->reserved_pinned);
4326 cache->reserved_pinned -= len;
4327 cache->space_info->bytes_reserved += len;
4328 }
4329 spin_unlock(&cache->lock); 4561 spin_unlock(&cache->lock);
4330 spin_unlock(&cache->space_info->lock); 4562 spin_unlock(&cache->space_info->lock);
4331 } 4563 }
@@ -4340,11 +4572,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4340{ 4572{
4341 struct btrfs_fs_info *fs_info = root->fs_info; 4573 struct btrfs_fs_info *fs_info = root->fs_info;
4342 struct extent_io_tree *unpin; 4574 struct extent_io_tree *unpin;
4343 struct btrfs_block_rsv *block_rsv;
4344 struct btrfs_block_rsv *next_rsv;
4345 u64 start; 4575 u64 start;
4346 u64 end; 4576 u64 end;
4347 int idx;
4348 int ret; 4577 int ret;
4349 4578
4350 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4579 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4367,30 +4596,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4367 cond_resched(); 4596 cond_resched();
4368 } 4597 }
4369 4598
4370 mutex_lock(&fs_info->durable_block_rsv_mutex);
4371 list_for_each_entry_safe(block_rsv, next_rsv,
4372 &fs_info->durable_block_rsv_list, list) {
4373
4374 idx = trans->transid & 0x1;
4375 if (block_rsv->freed[idx] > 0) {
4376 block_rsv_add_bytes(block_rsv,
4377 block_rsv->freed[idx], 0);
4378 block_rsv->freed[idx] = 0;
4379 }
4380 if (atomic_read(&block_rsv->usage) == 0) {
4381 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4382
4383 if (block_rsv->freed[0] == 0 &&
4384 block_rsv->freed[1] == 0) {
4385 list_del_init(&block_rsv->list);
4386 kfree(block_rsv);
4387 }
4388 } else {
4389 btrfs_block_rsv_release(root, block_rsv, 0);
4390 }
4391 }
4392 mutex_unlock(&fs_info->durable_block_rsv_mutex);
4393
4394 return 0; 4599 return 0;
4395} 4600}
4396 4601
@@ -4668,7 +4873,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4668 struct extent_buffer *buf, 4873 struct extent_buffer *buf,
4669 u64 parent, int last_ref) 4874 u64 parent, int last_ref)
4670{ 4875{
4671 struct btrfs_block_rsv *block_rsv;
4672 struct btrfs_block_group_cache *cache = NULL; 4876 struct btrfs_block_group_cache *cache = NULL;
4673 int ret; 4877 int ret;
4674 4878
@@ -4683,64 +4887,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4683 if (!last_ref) 4887 if (!last_ref)
4684 return; 4888 return;
4685 4889
4686 block_rsv = get_block_rsv(trans, root);
4687 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 4890 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4688 if (block_rsv->space_info != cache->space_info)
4689 goto out;
4690 4891
4691 if (btrfs_header_generation(buf) == trans->transid) { 4892 if (btrfs_header_generation(buf) == trans->transid) {
4692 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4893 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4693 ret = check_ref_cleanup(trans, root, buf->start); 4894 ret = check_ref_cleanup(trans, root, buf->start);
4694 if (!ret) 4895 if (!ret)
4695 goto pin; 4896 goto out;
4696 } 4897 }
4697 4898
4698 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 4899 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4699 pin_down_extent(root, cache, buf->start, buf->len, 1); 4900 pin_down_extent(root, cache, buf->start, buf->len, 1);
4700 goto pin; 4901 goto out;
4701 } 4902 }
4702 4903
4703 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4904 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4704 4905
4705 btrfs_add_free_space(cache, buf->start, buf->len); 4906 btrfs_add_free_space(cache, buf->start, buf->len);
4706 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); 4907 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4707 if (ret == -EAGAIN) {
4708 /* block group became read-only */
4709 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4710 goto out;
4711 }
4712
4713 ret = 1;
4714 spin_lock(&block_rsv->lock);
4715 if (block_rsv->reserved < block_rsv->size) {
4716 block_rsv->reserved += buf->len;
4717 ret = 0;
4718 }
4719 spin_unlock(&block_rsv->lock);
4720
4721 if (ret) {
4722 spin_lock(&cache->space_info->lock);
4723 cache->space_info->bytes_reserved -= buf->len;
4724 cache->space_info->reservation_progress++;
4725 spin_unlock(&cache->space_info->lock);
4726 }
4727 goto out;
4728 }
4729pin:
4730 if (block_rsv->durable && !cache->ro) {
4731 ret = 0;
4732 spin_lock(&cache->lock);
4733 if (!cache->ro) {
4734 cache->reserved_pinned += buf->len;
4735 ret = 1;
4736 }
4737 spin_unlock(&cache->lock);
4738
4739 if (ret) {
4740 spin_lock(&block_rsv->lock);
4741 block_rsv->freed[trans->transid & 0x1] += buf->len;
4742 spin_unlock(&block_rsv->lock);
4743 }
4744 } 4908 }
4745out: 4909out:
4746 /* 4910 /*
@@ -4883,10 +5047,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4883 int last_ptr_loop = 0; 5047 int last_ptr_loop = 0;
4884 int loop = 0; 5048 int loop = 0;
4885 int index = 0; 5049 int index = 0;
5050 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5051 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
4886 bool found_uncached_bg = false; 5052 bool found_uncached_bg = false;
4887 bool failed_cluster_refill = false; 5053 bool failed_cluster_refill = false;
4888 bool failed_alloc = false; 5054 bool failed_alloc = false;
4889 bool use_cluster = true; 5055 bool use_cluster = true;
5056 bool have_caching_bg = false;
4890 u64 ideal_cache_percent = 0; 5057 u64 ideal_cache_percent = 0;
4891 u64 ideal_cache_offset = 0; 5058 u64 ideal_cache_offset = 0;
4892 5059
@@ -4969,6 +5136,7 @@ ideal_cache:
4969 } 5136 }
4970 } 5137 }
4971search: 5138search:
5139 have_caching_bg = false;
4972 down_read(&space_info->groups_sem); 5140 down_read(&space_info->groups_sem);
4973 list_for_each_entry(block_group, &space_info->block_groups[index], 5141 list_for_each_entry(block_group, &space_info->block_groups[index],
4974 list) { 5142 list) {
@@ -5177,6 +5345,8 @@ refill_cluster:
5177 failed_alloc = true; 5345 failed_alloc = true;
5178 goto have_block_group; 5346 goto have_block_group;
5179 } else if (!offset) { 5347 } else if (!offset) {
5348 if (!cached)
5349 have_caching_bg = true;
5180 goto loop; 5350 goto loop;
5181 } 5351 }
5182checks: 5352checks:
@@ -5202,8 +5372,8 @@ checks:
5202 search_start - offset); 5372 search_start - offset);
5203 BUG_ON(offset > search_start); 5373 BUG_ON(offset > search_start);
5204 5374
5205 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, 5375 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
5206 (data & BTRFS_BLOCK_GROUP_DATA)); 5376 alloc_type);
5207 if (ret == -EAGAIN) { 5377 if (ret == -EAGAIN) {
5208 btrfs_add_free_space(block_group, offset, num_bytes); 5378 btrfs_add_free_space(block_group, offset, num_bytes);
5209 goto loop; 5379 goto loop;
@@ -5227,6 +5397,9 @@ loop:
5227 } 5397 }
5228 up_read(&space_info->groups_sem); 5398 up_read(&space_info->groups_sem);
5229 5399
5400 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5401 goto search;
5402
5230 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 5403 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5231 goto search; 5404 goto search;
5232 5405
@@ -5325,7 +5498,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5325 int index = 0; 5498 int index = 0;
5326 5499
5327 spin_lock(&info->lock); 5500 spin_lock(&info->lock);
5328 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5501 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5502 (unsigned long long)info->flags,
5329 (unsigned long long)(info->total_bytes - info->bytes_used - 5503 (unsigned long long)(info->total_bytes - info->bytes_used -
5330 info->bytes_pinned - info->bytes_reserved - 5504 info->bytes_pinned - info->bytes_reserved -
5331 info->bytes_readonly), 5505 info->bytes_readonly),
@@ -5411,7 +5585,8 @@ again:
5411 return ret; 5585 return ret;
5412} 5586}
5413 5587
5414int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 5588static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5589 u64 start, u64 len, int pin)
5415{ 5590{
5416 struct btrfs_block_group_cache *cache; 5591 struct btrfs_block_group_cache *cache;
5417 int ret = 0; 5592 int ret = 0;
@@ -5426,8 +5601,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5426 if (btrfs_test_opt(root, DISCARD)) 5601 if (btrfs_test_opt(root, DISCARD))
5427 ret = btrfs_discard_extent(root, start, len, NULL); 5602 ret = btrfs_discard_extent(root, start, len, NULL);
5428 5603
5429 btrfs_add_free_space(cache, start, len); 5604 if (pin)
5430 btrfs_update_reserved_bytes(cache, len, 0, 1); 5605 pin_down_extent(root, cache, start, len, 1);
5606 else {
5607 btrfs_add_free_space(cache, start, len);
5608 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5609 }
5431 btrfs_put_block_group(cache); 5610 btrfs_put_block_group(cache);
5432 5611
5433 trace_btrfs_reserved_extent_free(root, start, len); 5612 trace_btrfs_reserved_extent_free(root, start, len);
@@ -5435,6 +5614,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5435 return ret; 5614 return ret;
5436} 5615}
5437 5616
5617int btrfs_free_reserved_extent(struct btrfs_root *root,
5618 u64 start, u64 len)
5619{
5620 return __btrfs_free_reserved_extent(root, start, len, 0);
5621}
5622
5623int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5624 u64 start, u64 len)
5625{
5626 return __btrfs_free_reserved_extent(root, start, len, 1);
5627}
5628
5438static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5629static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5439 struct btrfs_root *root, 5630 struct btrfs_root *root,
5440 u64 parent, u64 root_objectid, 5631 u64 parent, u64 root_objectid,
@@ -5630,7 +5821,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5630 put_caching_control(caching_ctl); 5821 put_caching_control(caching_ctl);
5631 } 5822 }
5632 5823
5633 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); 5824 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5825 RESERVE_ALLOC_NO_ACCOUNT);
5634 BUG_ON(ret); 5826 BUG_ON(ret);
5635 btrfs_put_block_group(block_group); 5827 btrfs_put_block_group(block_group);
5636 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5828 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5687,8 +5879,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5687 block_rsv = get_block_rsv(trans, root); 5879 block_rsv = get_block_rsv(trans, root);
5688 5880
5689 if (block_rsv->size == 0) { 5881 if (block_rsv->size == 0) {
5690 ret = reserve_metadata_bytes(trans, root, block_rsv, 5882 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5691 blocksize, 0);
5692 /* 5883 /*
5693 * If we couldn't reserve metadata bytes try and use some from 5884 * If we couldn't reserve metadata bytes try and use some from
5694 * the global reserve. 5885 * the global reserve.
@@ -5708,13 +5899,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5708 if (!ret) 5899 if (!ret)
5709 return block_rsv; 5900 return block_rsv;
5710 if (ret) { 5901 if (ret) {
5711 WARN_ON(1); 5902 static DEFINE_RATELIMIT_STATE(_rs,
5712 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 5903 DEFAULT_RATELIMIT_INTERVAL,
5713 0); 5904 /*DEFAULT_RATELIMIT_BURST*/ 2);
5905 if (__ratelimit(&_rs)) {
5906 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5907 WARN_ON(1);
5908 }
5909 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5714 if (!ret) { 5910 if (!ret) {
5715 spin_lock(&block_rsv->lock);
5716 block_rsv->size += blocksize;
5717 spin_unlock(&block_rsv->lock);
5718 return block_rsv; 5911 return block_rsv;
5719 } else if (ret && block_rsv != global_rsv) { 5912 } else if (ret && block_rsv != global_rsv) {
5720 ret = block_rsv_use_bytes(global_rsv, blocksize); 5913 ret = block_rsv_use_bytes(global_rsv, blocksize);
@@ -6592,12 +6785,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6592 cache->bytes_super - btrfs_block_group_used(&cache->item); 6785 cache->bytes_super - btrfs_block_group_used(&cache->item);
6593 6786
6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6787 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6595 sinfo->bytes_may_use + sinfo->bytes_readonly + 6788 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <= 6789 min_allocable_bytes <= sinfo->total_bytes) {
6597 sinfo->total_bytes) {
6598 sinfo->bytes_readonly += num_bytes; 6790 sinfo->bytes_readonly += num_bytes;
6599 sinfo->bytes_reserved += cache->reserved_pinned;
6600 cache->reserved_pinned = 0;
6601 cache->ro = 1; 6791 cache->ro = 1;
6602 ret = 0; 6792 ret = 0;
6603 } 6793 }
@@ -6964,7 +7154,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6964 struct btrfs_space_info, 7154 struct btrfs_space_info,
6965 list); 7155 list);
6966 if (space_info->bytes_pinned > 0 || 7156 if (space_info->bytes_pinned > 0 ||
6967 space_info->bytes_reserved > 0) { 7157 space_info->bytes_reserved > 0 ||
7158 space_info->bytes_may_use > 0) {
6968 WARN_ON(1); 7159 WARN_ON(1);
6969 dump_space_info(space_info, 0, 0); 7160 dump_space_info(space_info, 0, 0);
6970 } 7161 }
@@ -7006,14 +7197,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7006 return -ENOMEM; 7197 return -ENOMEM;
7007 path->reada = 1; 7198 path->reada = 1;
7008 7199
7009 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 7200 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7010 if (cache_gen != 0 && 7201 if (btrfs_test_opt(root, SPACE_CACHE) &&
7011 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) 7202 btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7012 need_clear = 1; 7203 need_clear = 1;
7013 if (btrfs_test_opt(root, CLEAR_CACHE)) 7204 if (btrfs_test_opt(root, CLEAR_CACHE))
7014 need_clear = 1; 7205 need_clear = 1;
7015 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
7016 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7017 7206
7018 while (1) { 7207 while (1) {
7019 ret = find_first_block_group(root, path, &key); 7208 ret = find_first_block_group(root, path, &key);
@@ -7252,7 +7441,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7252 goto out; 7441 goto out;
7253 } 7442 }
7254 7443
7255 inode = lookup_free_space_inode(root, block_group, path); 7444 inode = lookup_free_space_inode(tree_root, block_group, path);
7256 if (!IS_ERR(inode)) { 7445 if (!IS_ERR(inode)) {
7257 ret = btrfs_orphan_add(trans, inode); 7446 ret = btrfs_orphan_add(trans, inode);
7258 BUG_ON(ret); 7447 BUG_ON(ret);
@@ -7268,7 +7457,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7268 spin_unlock(&block_group->lock); 7457 spin_unlock(&block_group->lock);
7269 } 7458 }
7270 /* One for our lookup ref */ 7459 /* One for our lookup ref */
7271 iput(inode); 7460 btrfs_add_delayed_iput(inode);
7272 } 7461 }
7273 7462
7274 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 7463 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7339,7 +7528,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7339 int mixed = 0; 7528 int mixed = 0;
7340 int ret; 7529 int ret;
7341 7530
7342 disk_super = &fs_info->super_copy; 7531 disk_super = fs_info->super_copy;
7343 if (!btrfs_super_root(disk_super)) 7532 if (!btrfs_super_root(disk_super))
7344 return 1; 7533 return 1;
7345 7534
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f1..1f87c4d0e7a0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
17#include "compat.h" 17#include "compat.h"
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h"
20 21
21static struct kmem_cache *extent_state_cache; 22static struct kmem_cache *extent_state_cache;
22static struct kmem_cache *extent_buffer_cache; 23static struct kmem_cache *extent_buffer_cache;
@@ -894,6 +895,194 @@ search_again:
894 goto again; 895 goto again;
895} 896}
896 897
898/**
899 * convert_extent_bit - convert all bits in a given range from one bit to another
900 * @tree: the io tree to search
901 * @start: the start offset in bytes
902 * @end: the end offset in bytes (inclusive)
903 * @bits: the bits to set in this range
904 * @clear_bits: the bits to clear in this range
905 * @mask: the allocation mask
906 *
907 * This will go through and set bits for the given range. If any states exist
908 * already in this range they are set with the given bit and cleared of the
909 * clear_bits. This is only meant to be used by things that are mergeable, ie
910 * converting from say DELALLOC to DIRTY. This is not meant to be used with
911 * boundary bits like LOCK.
912 */
913int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914 int bits, int clear_bits, gfp_t mask)
915{
916 struct extent_state *state;
917 struct extent_state *prealloc = NULL;
918 struct rb_node *node;
919 int err = 0;
920 u64 last_start;
921 u64 last_end;
922
923again:
924 if (!prealloc && (mask & __GFP_WAIT)) {
925 prealloc = alloc_extent_state(mask);
926 if (!prealloc)
927 return -ENOMEM;
928 }
929
930 spin_lock(&tree->lock);
931 /*
932 * this search will find all the extents that end after
933 * our range starts.
934 */
935 node = tree_search(tree, start);
936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc)
939 return -ENOMEM;
940 err = insert_state(tree, prealloc, start, end, &bits);
941 prealloc = NULL;
942 BUG_ON(err == -EEXIST);
943 goto out;
944 }
945 state = rb_entry(node, struct extent_state, rb_node);
946hit_next:
947 last_start = state->start;
948 last_end = state->end;
949
950 /*
951 * | ---- desired range ---- |
952 * | state |
953 *
954 * Just lock what we found and keep going
955 */
956 if (state->start == start && state->end <= end) {
957 struct rb_node *next_node;
958
959 set_state_bits(tree, state, &bits);
960 clear_state_bit(tree, state, &clear_bits, 0);
961
962 merge_state(tree, state);
963 if (last_end == (u64)-1)
964 goto out;
965
966 start = last_end + 1;
967 next_node = rb_next(&state->rb_node);
968 if (next_node && start < end && prealloc && !need_resched()) {
969 state = rb_entry(next_node, struct extent_state,
970 rb_node);
971 if (state->start == start)
972 goto hit_next;
973 }
974 goto search_again;
975 }
976
977 /*
978 * | ---- desired range ---- |
979 * | state |
980 * or
981 * | ------------- state -------------- |
982 *
983 * We need to split the extent we found, and may flip bits on
984 * second half.
985 *
986 * If the extent we found extends past our
987 * range, we just split and search again. It'll get split
988 * again the next time though.
989 *
990 * If the extent we found is inside our range, we set the
991 * desired bit on it.
992 */
993 if (state->start < start) {
994 prealloc = alloc_extent_state_atomic(prealloc);
995 if (!prealloc)
996 return -ENOMEM;
997 err = split_state(tree, state, prealloc, start);
998 BUG_ON(err == -EEXIST);
999 prealloc = NULL;
1000 if (err)
1001 goto out;
1002 if (state->end <= end) {
1003 set_state_bits(tree, state, &bits);
1004 clear_state_bit(tree, state, &clear_bits, 0);
1005 merge_state(tree, state);
1006 if (last_end == (u64)-1)
1007 goto out;
1008 start = last_end + 1;
1009 }
1010 goto search_again;
1011 }
1012 /*
1013 * | ---- desired range ---- |
1014 * | state | or | state |
1015 *
1016 * There's a hole, we need to insert something in it and
1017 * ignore the extent we found.
1018 */
1019 if (state->start > start) {
1020 u64 this_end;
1021 if (end < last_start)
1022 this_end = end;
1023 else
1024 this_end = last_start - 1;
1025
1026 prealloc = alloc_extent_state_atomic(prealloc);
1027 if (!prealloc)
1028 return -ENOMEM;
1029
1030 /*
1031 * Avoid freeing 'prealloc' if it can be merged with
1032 * the later extent.
1033 */
1034 err = insert_state(tree, prealloc, start, this_end,
1035 &bits);
1036 BUG_ON(err == -EEXIST);
1037 if (err) {
1038 free_extent_state(prealloc);
1039 prealloc = NULL;
1040 goto out;
1041 }
1042 prealloc = NULL;
1043 start = this_end + 1;
1044 goto search_again;
1045 }
1046 /*
1047 * | ---- desired range ---- |
1048 * | state |
1049 * We need to split the extent, and set the bit
1050 * on the first half
1051 */
1052 if (state->start <= end && state->end > end) {
1053 prealloc = alloc_extent_state_atomic(prealloc);
1054 if (!prealloc)
1055 return -ENOMEM;
1056
1057 err = split_state(tree, state, prealloc, end + 1);
1058 BUG_ON(err == -EEXIST);
1059
1060 set_state_bits(tree, prealloc, &bits);
1061 clear_state_bit(tree, prealloc, &clear_bits, 0);
1062
1063 merge_state(tree, prealloc);
1064 prealloc = NULL;
1065 goto out;
1066 }
1067
1068 goto search_again;
1069
1070out:
1071 spin_unlock(&tree->lock);
1072 if (prealloc)
1073 free_extent_state(prealloc);
1074
1075 return err;
1076
1077search_again:
1078 if (start > end)
1079 goto out;
1080 spin_unlock(&tree->lock);
1081 if (mask & __GFP_WAIT)
1082 cond_resched();
1083 goto again;
1084}
1085
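convert_extent_bit() applies a set mask and a clear mask over a byte range in one pass; the complexity above is all about splitting and merging extent_state records so the range boundaries line up. A deliberately tiny model that ignores the tree and works on fixed-size units shows the intended net effect (the bit values are stand-ins):

#include <stdio.h>

#define N_UNITS 16

/* Every unit in [start, end] gains set_bits and loses clear_bits. */
void convert_range(unsigned char state[N_UNITS], int start, int end,
                   unsigned char set_bits, unsigned char clear_bits)
{
        for (int i = start; i <= end && i < N_UNITS; i++)
                state[i] = (unsigned char)((state[i] | set_bits) & ~clear_bits);
}

int main(void)
{
        unsigned char state[N_UNITS] = { 0 };
        unsigned char DELALLOC = 0x1, DIRTY = 0x2;    /* stand-in bit values */

        convert_range(state, 0, 7, DELALLOC, 0);      /* mark delalloc */
        convert_range(state, 0, 7, DIRTY, DELALLOC);  /* convert to dirty */
        printf("unit 0 bits: %#x\n", state[0]);
        return 0;
}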
897/* wrappers around set/clear extent bit */ 1086/* wrappers around set/clear extent bit */
898int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1087int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
899 gfp_t mask) 1088 gfp_t mask)
@@ -919,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
919 struct extent_state **cached_state, gfp_t mask) 1108 struct extent_state **cached_state, gfp_t mask)
920{ 1109{
921 return set_extent_bit(tree, start, end, 1110 return set_extent_bit(tree, start, end,
922 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 1111 EXTENT_DELALLOC | EXTENT_UPTODATE,
923 0, NULL, cached_state, mask); 1112 0, NULL, cached_state, mask);
924} 1113}
925 1114
@@ -1599,6 +1788,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
1599 return 0; 1788 return 0;
1600} 1789}
1601 1790
1791/*
1792 * When IO fails, either with EIO or csum verification fails, we
1793 * try other mirrors that might have a good copy of the data. This
1794 * io_failure_record is used to record state as we go through all the
1795 * mirrors. If another mirror has good data, the page is set up to date
1796 * and things continue. If a good mirror can't be found, the original
1797 * bio end_io callback is called to indicate things have failed.
1798 */
1799struct io_failure_record {
1800 struct page *page;
1801 u64 start;
1802 u64 len;
1803 u64 logical;
1804 unsigned long bio_flags;
1805 int this_mirror;
1806 int failed_mirror;
1807 int in_validation;
1808};
1809
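io_failure_record remembers which mirror produced the bad data (failed_mirror) and which copy is being tried next (this_mirror); retries walk the remaining copies until one verifies or all are exhausted. A toy model of that mirror-selection loop (mirror numbers are 1-based, as in the struct above):

#include <stdio.h>

/*
 * Pick the next mirror to try after a failed read: walk mirrors 1..n in
 * order, skipping the one that already failed.  Returns 0 when every
 * copy has been exhausted.
 */
int next_mirror(int num_copies, int failed_mirror, int last_tried)
{
        for (int m = last_tried + 1; m <= num_copies; m++) {
                if (m != failed_mirror)
                        return m;
        }
        return 0;
}

int main(void)
{
        printf("%d\n", next_mirror(2, 1, 0));   /* prints 2 */
        return 0;
}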
1810static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1811 int did_repair)
1812{
1813 int ret;
1814 int err = 0;
1815 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1816
1817 set_state_private(failure_tree, rec->start, 0);
1818 ret = clear_extent_bits(failure_tree, rec->start,
1819 rec->start + rec->len - 1,
1820 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1821 if (ret)
1822 err = ret;
1823
1824 if (did_repair) {
1825 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1826 rec->start + rec->len - 1,
1827 EXTENT_DAMAGED, GFP_NOFS);
1828 if (ret && !err)
1829 err = ret;
1830 }
1831
1832 kfree(rec);
1833 return err;
1834}
1835
1836static void repair_io_failure_callback(struct bio *bio, int err)
1837{
1838 complete(bio->bi_private);
1839}
1840
1841/*
1842 * this bypasses the standard btrfs submit functions deliberately, as
1843 * the standard behavior is to write all copies in a raid setup. here we only
1844 * want to write the one bad copy. so we do the mapping for ourselves and issue
1845 * submit_bio directly.
1846 * to avoid any synchronization issues, wait for the data after writing, which
1847 * actually prevents the read that triggered the error from finishing.
1848 * currently, there can be no more than two copies of every data bit. thus,
1849 * exactly one rewrite is required.
1850 */
1851int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1852 u64 length, u64 logical, struct page *page,
1853 int mirror_num)
1854{
1855 struct bio *bio;
1856 struct btrfs_device *dev;
1857 DECLARE_COMPLETION_ONSTACK(compl);
1858 u64 map_length = 0;
1859 u64 sector;
1860 struct btrfs_bio *bbio = NULL;
1861 int ret;
1862
1863 BUG_ON(!mirror_num);
1864
1865 bio = bio_alloc(GFP_NOFS, 1);
1866 if (!bio)
1867 return -EIO;
1868 bio->bi_private = &compl;
1869 bio->bi_end_io = repair_io_failure_callback;
1870 bio->bi_size = 0;
1871 map_length = length;
1872
1873 ret = btrfs_map_block(map_tree, WRITE, logical,
1874 &map_length, &bbio, mirror_num);
1875 if (ret) {
1876 bio_put(bio);
1877 return -EIO;
1878 }
1879 BUG_ON(mirror_num != bbio->mirror_num);
1880 sector = bbio->stripes[mirror_num-1].physical >> 9;
1881 bio->bi_sector = sector;
1882 dev = bbio->stripes[mirror_num-1].dev;
1883 kfree(bbio);
1884 if (!dev || !dev->bdev || !dev->writeable) {
1885 bio_put(bio);
1886 return -EIO;
1887 }
1888 bio->bi_bdev = dev->bdev;
1889 bio_add_page(bio, page, length, start-page_offset(page));
1890 submit_bio(WRITE_SYNC, bio);
1891 wait_for_completion(&compl);
1892
1893 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1894 /* try to remap that extent elsewhere? */
1895 bio_put(bio);
1896 return -EIO;
1897 }
1898
1899 printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1900 "sector %llu)\n", page->mapping->host->i_ino, start,
1901 dev->name, sector);
1902
1903 bio_put(bio);
1904 return 0;
1905}
1906
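/*
 * [Editor's sketch, not part of this patch] A minimal caller of
 * repair_io_failure() above: rewrite one page-sized chunk on the mirror
 * that returned bad data, synchronously. 'rewrite_one_page' and its
 * arguments are placeholders; only repair_io_failure() is defined above.
 */
static int rewrite_one_page(struct btrfs_mapping_tree *map_tree,
			    struct page *page, u64 start, u64 logical,
			    int failed_mirror)
{
	u64 len = PAGE_CACHE_SIZE;	/* matches the read that failed */

	/* writes only the named mirror and waits for the bio to complete */
	return repair_io_failure(map_tree, start, len, logical, page,
				 failed_mirror);
}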
1907/*
1908 * each time an IO finishes, we do a fast check in the IO failure tree
1909 * to see if we need to process or clean up an io_failure_record
1910 */
1911static int clean_io_failure(u64 start, struct page *page)
1912{
1913 u64 private;
1914 u64 private_failure;
1915 struct io_failure_record *failrec;
1916 struct btrfs_mapping_tree *map_tree;
1917 struct extent_state *state;
1918 int num_copies;
1919 int did_repair = 0;
1920 int ret;
1921 struct inode *inode = page->mapping->host;
1922
1923 private = 0;
1924 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1925 (u64)-1, 1, EXTENT_DIRTY, 0);
1926 if (!ret)
1927 return 0;
1928
1929 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1930 &private_failure);
1931 if (ret)
1932 return 0;
1933
1934 failrec = (struct io_failure_record *)(unsigned long) private_failure;
1935 BUG_ON(!failrec->this_mirror);
1936
1937 if (failrec->in_validation) {
1938 /* there was no real error, just free the record */
1939 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1940 failrec->start);
1941 did_repair = 1;
1942 goto out;
1943 }
1944
1945 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1946 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1947 failrec->start,
1948 EXTENT_LOCKED);
1949 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1950
1951 if (state && state->start == failrec->start) {
1952 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1953 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1954 failrec->len);
1955 if (num_copies > 1) {
1956 ret = repair_io_failure(map_tree, start, failrec->len,
1957 failrec->logical, page,
1958 failrec->failed_mirror);
1959 did_repair = !ret;
1960 }
1961 }
1962
1963out:
1964 if (!ret)
1965 ret = free_io_failure(inode, failrec, did_repair);
1966
1967 return ret;
1968}
1969
1970/*
1971 * this is a generic handler for readpage errors (default
1972 * readpage_io_failed_hook). if other copies exist, read those and write back
1973 * good data to the failed position. it does not attempt to remap the failed
1974 * extent elsewhere, hoping the device will be smart enough to do this as
1975 * needed
1976 */
1977
1978static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1979 u64 start, u64 end, int failed_mirror,
1980 struct extent_state *state)
1981{
1982 struct io_failure_record *failrec = NULL;
1983 u64 private;
1984 struct extent_map *em;
1985 struct inode *inode = page->mapping->host;
1986 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1987 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1988 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1989 struct bio *bio;
1990 int num_copies;
1991 int ret;
1992 int read_mode;
1993 u64 logical;
1994
1995 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
1996
1997 ret = get_state_private(failure_tree, start, &private);
1998 if (ret) {
1999 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2000 if (!failrec)
2001 return -ENOMEM;
2002 failrec->start = start;
2003 failrec->len = end - start + 1;
2004 failrec->this_mirror = 0;
2005 failrec->bio_flags = 0;
2006 failrec->in_validation = 0;
2007
2008 read_lock(&em_tree->lock);
2009 em = lookup_extent_mapping(em_tree, start, failrec->len);
2010 if (!em) {
2011 read_unlock(&em_tree->lock);
2012 kfree(failrec);
2013 return -EIO;
2014 }
2015
2016 if (em->start > start || em->start + em->len < start) {
2017 free_extent_map(em);
2018 em = NULL;
2019 }
2020 read_unlock(&em_tree->lock);
2021
2022 if (!em || IS_ERR(em)) {
2023 kfree(failrec);
2024 return -EIO;
2025 }
2026 logical = start - em->start;
2027 logical = em->block_start + logical;
2028 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2029 logical = em->block_start;
2030 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2031 extent_set_compress_type(&failrec->bio_flags,
2032 em->compress_type);
2033 }
2034 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2035 "len=%llu\n", logical, start, failrec->len);
2036 failrec->logical = logical;
2037 free_extent_map(em);
2038
2039 /* set the bits in the private failure tree */
2040 ret = set_extent_bits(failure_tree, start, end,
2041 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2042 if (ret >= 0)
2043 ret = set_state_private(failure_tree, start,
2044 (u64)(unsigned long)failrec);
2045 /* set the bits in the inode's tree */
2046 if (ret >= 0)
2047 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2048 GFP_NOFS);
2049 if (ret < 0) {
2050 kfree(failrec);
2051 return ret;
2052 }
2053 } else {
2054 failrec = (struct io_failure_record *)(unsigned long)private;
2055 pr_debug("bio_readpage_error: (found) logical=%llu, "
2056 "start=%llu, len=%llu, validation=%d\n",
2057 failrec->logical, failrec->start, failrec->len,
2058 failrec->in_validation);
2059 /*
2060		 * when data can exist on disk in more than two copies, add to failrec
2061		 * here (e.g. with a list for failed_mirror) to make
2062 * clean_io_failure() clean all those errors at once.
2063 */
2064 }
2065 num_copies = btrfs_num_copies(
2066 &BTRFS_I(inode)->root->fs_info->mapping_tree,
2067 failrec->logical, failrec->len);
2068 if (num_copies == 1) {
2069 /*
2070 * we only have a single copy of the data, so don't bother with
2071 * all the retry and error correction code that follows. no
2072 * matter what the error is, it is very likely to persist.
2073 */
2074 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2075 "state=%p, num_copies=%d, next_mirror %d, "
2076 "failed_mirror %d\n", state, num_copies,
2077 failrec->this_mirror, failed_mirror);
2078 free_io_failure(inode, failrec, 0);
2079 return -EIO;
2080 }
2081
2082 if (!state) {
2083 spin_lock(&tree->lock);
2084 state = find_first_extent_bit_state(tree, failrec->start,
2085 EXTENT_LOCKED);
2086 if (state && state->start != failrec->start)
2087 state = NULL;
2088 spin_unlock(&tree->lock);
2089 }
2090
2091 /*
2092 * there are two premises:
2093 * a) deliver good data to the caller
2094 * b) correct the bad sectors on disk
2095 */
2096 if (failed_bio->bi_vcnt > 1) {
2097 /*
2098 * to fulfill b), we need to know the exact failing sectors, as
2099 * we don't want to rewrite any more than the failed ones. thus,
2100 * we need separate read requests for the failed bio
2101 *
2102 * if the following BUG_ON triggers, our validation request got
2103 * merged. we need separate requests for our algorithm to work.
2104 */
2105 BUG_ON(failrec->in_validation);
2106 failrec->in_validation = 1;
2107 failrec->this_mirror = failed_mirror;
2108 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2109 } else {
2110 /*
2111 * we're ready to fulfill a) and b) alongside. get a good copy
2112 * of the failed sector and if we succeed, we have setup
2113 * everything for repair_io_failure to do the rest for us.
2114 */
2115 if (failrec->in_validation) {
2116 BUG_ON(failrec->this_mirror != failed_mirror);
2117 failrec->in_validation = 0;
2118 failrec->this_mirror = 0;
2119 }
2120 failrec->failed_mirror = failed_mirror;
2121 failrec->this_mirror++;
2122 if (failrec->this_mirror == failed_mirror)
2123 failrec->this_mirror++;
2124 read_mode = READ_SYNC;
2125 }
2126
2127 if (!state || failrec->this_mirror > num_copies) {
2128 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2129 "next_mirror %d, failed_mirror %d\n", state,
2130 num_copies, failrec->this_mirror, failed_mirror);
2131 free_io_failure(inode, failrec, 0);
2132 return -EIO;
2133 }
2134
2135 bio = bio_alloc(GFP_NOFS, 1);
2136 bio->bi_private = state;
2137 bio->bi_end_io = failed_bio->bi_end_io;
2138 bio->bi_sector = failrec->logical >> 9;
2139 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2140 bio->bi_size = 0;
2141
2142 bio_add_page(bio, page, failrec->len, start - page_offset(page));
2143
2144 pr_debug("bio_readpage_error: submitting new read[%#x] to "
2145 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2146 failrec->this_mirror, num_copies, failrec->in_validation);
2147
2148 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2149 failrec->bio_flags, 0);
2150 return 0;
2151}
2152
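/*
 * [Editor's worked example, not part of this patch] Mirror rotation in
 * bio_readpage_error() above: with num_copies == 2 and failed_mirror == 1,
 * this_mirror goes 0 -> 1, collides with failed_mirror and is bumped to 2,
 * so the retry is sent to the other copy; once this_mirror would exceed
 * num_copies the record is freed and -EIO is returned. A sketch of that
 * rule as a standalone helper (hypothetical name):
 */
static int next_mirror(int this_mirror, int failed_mirror, int num_copies)
{
	this_mirror++;
	if (this_mirror == failed_mirror)	/* never retry the bad copy */
		this_mirror++;
	/* 0 means every copy has been tried */
	return (this_mirror > num_copies) ? 0 : this_mirror;
}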
1602/* lots and lots of room for performance fixes in the end_bio funcs */ 2153/* lots and lots of room for performance fixes in the end_bio funcs */
1603 2154
1604/* 2155/*
@@ -1697,6 +2248,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1697 struct extent_state *cached = NULL; 2248 struct extent_state *cached = NULL;
1698 struct extent_state *state; 2249 struct extent_state *state;
1699 2250
2251 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2252 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2253 (long int)bio->bi_bdev);
1700 tree = &BTRFS_I(page->mapping->host)->io_tree; 2254 tree = &BTRFS_I(page->mapping->host)->io_tree;
1701 2255
1702 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2256 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,11 +2281,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1727 state); 2281 state);
1728 if (ret) 2282 if (ret)
1729 uptodate = 0; 2283 uptodate = 0;
2284 else
2285 clean_io_failure(start, page);
1730 } 2286 }
1731 if (!uptodate && tree->ops && 2287 if (!uptodate) {
1732 tree->ops->readpage_io_failed_hook) { 2288 u64 failed_mirror;
1733 ret = tree->ops->readpage_io_failed_hook(bio, page, 2289 failed_mirror = (u64)bio->bi_bdev;
1734 start, end, NULL); 2290 if (tree->ops && tree->ops->readpage_io_failed_hook)
2291 ret = tree->ops->readpage_io_failed_hook(
2292 bio, page, start, end,
2293 failed_mirror, state);
2294 else
2295 ret = bio_readpage_error(bio, page, start, end,
2296 failed_mirror, NULL);
1735 if (ret == 0) { 2297 if (ret == 0) {
1736 uptodate = 2298 uptodate =
1737 test_bit(BIO_UPTODATE, &bio->bi_flags); 2299 test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1811,6 +2373,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1811 mirror_num, bio_flags, start); 2373 mirror_num, bio_flags, start);
1812 else 2374 else
1813 submit_bio(rw, bio); 2375 submit_bio(rw, bio);
2376
1814 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2377 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1815 ret = -EOPNOTSUPP; 2378 ret = -EOPNOTSUPP;
1816 bio_put(bio); 2379 bio_put(bio);
@@ -2076,16 +2639,16 @@ out:
2076} 2639}
2077 2640
2078int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2641int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2079 get_extent_t *get_extent) 2642 get_extent_t *get_extent, int mirror_num)
2080{ 2643{
2081 struct bio *bio = NULL; 2644 struct bio *bio = NULL;
2082 unsigned long bio_flags = 0; 2645 unsigned long bio_flags = 0;
2083 int ret; 2646 int ret;
2084 2647
2085 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2648 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2086 &bio_flags); 2649 &bio_flags);
2087 if (bio) 2650 if (bio)
2088 ret = submit_one_bio(READ, bio, 0, bio_flags); 2651 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2089 return ret; 2652 return ret;
2090} 2653}
2091 2654
@@ -2136,6 +2699,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 int compressed; 2699 int compressed;
2137 int write_flags; 2700 int write_flags;
2138 unsigned long nr_written = 0; 2701 unsigned long nr_written = 0;
2702 bool fill_delalloc = true;
2139 2703
2140 if (wbc->sync_mode == WB_SYNC_ALL) 2704 if (wbc->sync_mode == WB_SYNC_ALL)
2141 write_flags = WRITE_SYNC; 2705 write_flags = WRITE_SYNC;
@@ -2145,6 +2709,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2145 trace___extent_writepage(page, inode, wbc); 2709 trace___extent_writepage(page, inode, wbc);
2146 2710
2147 WARN_ON(!PageLocked(page)); 2711 WARN_ON(!PageLocked(page));
2712
2713 ClearPageError(page);
2714
2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2715 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149 if (page->index > end_index || 2716 if (page->index > end_index ||
2150 (page->index == end_index && !pg_offset)) { 2717 (page->index == end_index && !pg_offset)) {
@@ -2166,10 +2733,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2166 2733
2167 set_page_extent_mapped(page); 2734 set_page_extent_mapped(page);
2168 2735
2736 if (!tree->ops || !tree->ops->fill_delalloc)
2737 fill_delalloc = false;
2738
2169 delalloc_start = start; 2739 delalloc_start = start;
2170 delalloc_end = 0; 2740 delalloc_end = 0;
2171 page_started = 0; 2741 page_started = 0;
2172 if (!epd->extent_locked) { 2742 if (!epd->extent_locked && fill_delalloc) {
2173 u64 delalloc_to_write = 0; 2743 u64 delalloc_to_write = 0;
2174 /* 2744 /*
2175 * make sure the wbc mapping index is at least updated 2745 * make sure the wbc mapping index is at least updated
@@ -2421,10 +2991,16 @@ retry:
2421 * swizzled back from swapper_space to tmpfs file 2991 * swizzled back from swapper_space to tmpfs file
2422 * mapping 2992 * mapping
2423 */ 2993 */
2424 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2994 if (tree->ops &&
2425 tree->ops->write_cache_pages_lock_hook(page); 2995 tree->ops->write_cache_pages_lock_hook) {
2426 else 2996 tree->ops->write_cache_pages_lock_hook(page,
2427 lock_page(page); 2997 data, flush_fn);
2998 } else {
2999 if (!trylock_page(page)) {
3000 flush_fn(data);
3001 lock_page(page);
3002 }
3003 }
2428 3004
2429 if (unlikely(page->mapping != mapping)) { 3005 if (unlikely(page->mapping != mapping)) {
2430 unlock_page(page); 3006 unlock_page(page);
@@ -2926,7 +3502,7 @@ out:
2926 return ret; 3502 return ret;
2927} 3503}
2928 3504
2929static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3505inline struct page *extent_buffer_page(struct extent_buffer *eb,
2930 unsigned long i) 3506 unsigned long i)
2931{ 3507{
2932 struct page *p; 3508 struct page *p;
@@ -2951,7 +3527,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2951 return p; 3527 return p;
2952} 3528}
2953 3529
2954static inline unsigned long num_extent_pages(u64 start, u64 len) 3530inline unsigned long num_extent_pages(u64 start, u64 len)
2955{ 3531{
2956 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3532 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2957 (start >> PAGE_CACHE_SHIFT); 3533 (start >> PAGE_CACHE_SHIFT);
@@ -3204,6 +3780,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3204 PAGECACHE_TAG_DIRTY); 3780 PAGECACHE_TAG_DIRTY);
3205 } 3781 }
3206 spin_unlock_irq(&page->mapping->tree_lock); 3782 spin_unlock_irq(&page->mapping->tree_lock);
3783 ClearPageError(page);
3207 unlock_page(page); 3784 unlock_page(page);
3208 } 3785 }
3209 return 0; 3786 return 0;
@@ -3349,8 +3926,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3349} 3926}
3350 3927
3351int read_extent_buffer_pages(struct extent_io_tree *tree, 3928int read_extent_buffer_pages(struct extent_io_tree *tree,
3352 struct extent_buffer *eb, 3929 struct extent_buffer *eb, u64 start, int wait,
3353 u64 start, int wait,
3354 get_extent_t *get_extent, int mirror_num) 3930 get_extent_t *get_extent, int mirror_num)
3355{ 3931{
3356 unsigned long i; 3932 unsigned long i;
@@ -3386,7 +3962,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3386 num_pages = num_extent_pages(eb->start, eb->len); 3962 num_pages = num_extent_pages(eb->start, eb->len);
3387 for (i = start_i; i < num_pages; i++) { 3963 for (i = start_i; i < num_pages; i++) {
3388 page = extent_buffer_page(eb, i); 3964 page = extent_buffer_page(eb, i);
3389 if (!wait) { 3965 if (wait == WAIT_NONE) {
3390 if (!trylock_page(page)) 3966 if (!trylock_page(page))
3391 goto unlock_exit; 3967 goto unlock_exit;
3392 } else { 3968 } else {
@@ -3430,7 +4006,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3430 if (bio) 4006 if (bio)
3431 submit_one_bio(READ, bio, mirror_num, bio_flags); 4007 submit_one_bio(READ, bio, mirror_num, bio_flags);
3432 4008
3433 if (ret || !wait) 4009 if (ret || wait != WAIT_COMPLETE)
3434 return ret; 4010 return ret;
3435 4011
3436 for (i = start_i; i < num_pages; i++) { 4012 for (i = start_i; i < num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e7929..feb9be0e23bc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,8 @@
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 24
@@ -32,6 +34,7 @@
32#define EXTENT_BUFFER_BLOCKING 1 34#define EXTENT_BUFFER_BLOCKING 1
33#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
35 38
36/* these are flags for extent_clear_unlock_delalloc */ 39/* these are flags for extent_clear_unlock_delalloc */
37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 40#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -67,7 +70,7 @@ struct extent_io_ops {
67 unsigned long bio_flags); 70 unsigned long bio_flags);
68 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
69 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
70 u64 start, u64 end, 73 u64 start, u64 end, u64 failed_mirror,
71 struct extent_state *state); 74 struct extent_state *state);
72 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
73 u64 start, u64 end, 76 u64 start, u64 end,
@@ -85,7 +88,8 @@ struct extent_io_ops {
85 struct extent_state *other); 88 struct extent_state *other);
86 void (*split_extent_hook)(struct inode *inode, 89 void (*split_extent_hook)(struct inode *inode,
87 struct extent_state *orig, u64 split); 90 struct extent_state *orig, u64 split);
88 int (*write_cache_pages_lock_hook)(struct page *page); 91 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
92 void (*flush_fn)(void *));
89}; 93};
90 94
91struct extent_io_tree { 95struct extent_io_tree {
@@ -185,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
185int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 189int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
186 gfp_t mask); 190 gfp_t mask);
187int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 191int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
188 get_extent_t *get_extent); 192 get_extent_t *get_extent, int mirror_num);
189int __init extent_io_init(void); 193int __init extent_io_init(void);
190void extent_io_exit(void); 194void extent_io_exit(void);
191 195
@@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
214 gfp_t mask); 218 gfp_t mask);
215int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 219int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
216 gfp_t mask); 220 gfp_t mask);
221int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
222 int bits, int clear_bits, gfp_t mask);
217int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 223int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
218 struct extent_state **cached_state, gfp_t mask); 224 struct extent_state **cached_state, gfp_t mask);
219int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 225int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
248struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 254struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
249 u64 start, unsigned long len); 255 u64 start, unsigned long len);
250void free_extent_buffer(struct extent_buffer *eb); 256void free_extent_buffer(struct extent_buffer *eb);
257#define WAIT_NONE 0
258#define WAIT_COMPLETE 1
259#define WAIT_PAGE_LOCK 2
251int read_extent_buffer_pages(struct extent_io_tree *tree, 260int read_extent_buffer_pages(struct extent_io_tree *tree,
252 struct extent_buffer *eb, u64 start, int wait, 261 struct extent_buffer *eb, u64 start, int wait,
253 get_extent_t *get_extent, int mirror_num); 262 get_extent_t *get_extent, int mirror_num);
263unsigned long num_extent_pages(u64 start, u64 len);
264struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
254 265
255static inline void extent_buffer_get(struct extent_buffer *eb) 266static inline void extent_buffer_get(struct extent_buffer *eb)
256{ 267{
@@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
300struct bio * 311struct bio *
301btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 312btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
302 gfp_t gfp_flags); 313 gfp_t gfp_flags);
314
315struct btrfs_mapping_tree;
316
317int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
318 u64 length, u64 logical, struct page *page,
319 int mirror_num);
303#endif 320#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821becd..c7fb3a4247d3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
91 struct btrfs_csum_item *item; 91 struct btrfs_csum_item *item;
92 struct extent_buffer *leaf; 92 struct extent_buffer *leaf;
93 u64 csum_offset = 0; 93 u64 csum_offset = 0;
94 u16 csum_size = 94 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
95 btrfs_super_csum_size(&root->fs_info->super_copy);
96 int csums_in_item; 95 int csums_in_item;
97 96
98 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 97 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
162 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
163 u64 disk_bytenr; 162 u64 disk_bytenr;
164 u32 diff; 163 u32 diff;
165 u16 csum_size = 164 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
166 btrfs_super_csum_size(&root->fs_info->super_copy);
167 int ret; 165 int ret;
168 struct btrfs_path *path; 166 struct btrfs_path *path;
169 struct btrfs_csum_item *item = NULL; 167 struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
290 int ret; 288 int ret;
291 size_t size; 289 size_t size;
292 u64 csum_end; 290 u64 csum_end;
293 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 291 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
294 292
295 path = btrfs_alloc_path(); 293 path = btrfs_alloc_path();
296 if (!path) 294 if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
492 u64 bytenr, u64 len) 490 u64 bytenr, u64 len)
493{ 491{
494 struct extent_buffer *leaf; 492 struct extent_buffer *leaf;
495 u16 csum_size = 493 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
496 btrfs_super_csum_size(&root->fs_info->super_copy);
497 u64 csum_end; 494 u64 csum_end;
498 u64 end_byte = bytenr + len; 495 u64 end_byte = bytenr + len;
499 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; 496 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
549 u64 csum_end; 546 u64 csum_end;
550 struct extent_buffer *leaf; 547 struct extent_buffer *leaf;
551 int ret; 548 int ret;
552 u16 csum_size = 549 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
553 btrfs_super_csum_size(&root->fs_info->super_copy);
554 int blocksize_bits = root->fs_info->sb->s_blocksize_bits; 550 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
555 551
556 root = root->fs_info->csum_root; 552 root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
676 struct btrfs_sector_sum *sector_sum; 672 struct btrfs_sector_sum *sector_sum;
677 u32 nritems; 673 u32 nritems;
678 u32 ins_size; 674 u32 ins_size;
679 u16 csum_size = 675 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
680 btrfs_super_csum_size(&root->fs_info->super_copy);
681 676
682 path = btrfs_alloc_path(); 677 path = btrfs_alloc_path();
683 if (!path) 678 if (!path)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1266f6e9cdb2..dafdfa059bf6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1069 int i; 1069 int i;
1070 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1070 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1071 struct inode *inode = fdentry(file)->d_inode; 1071 struct inode *inode = fdentry(file)->d_inode;
1072 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1072 int err = 0; 1073 int err = 0;
1073 int faili = 0; 1074 int faili = 0;
1074 u64 start_pos; 1075 u64 start_pos;
@@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1080again: 1081again:
1081 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1082 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1083 GFP_NOFS); 1084 mask);
1084 if (!pages[i]) { 1085 if (!pages[i]) {
1085 faili = i - 1; 1086 faili = i - 1;
1086 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1615,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1615 goto out; 1616 goto out;
1616 } 1617 }
1617 1618
1618 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1619 if (ret)
1620 goto out;
1621
1622 locked_end = alloc_end - 1; 1619 locked_end = alloc_end - 1;
1623 while (1) { 1620 while (1) {
1624 struct btrfs_ordered_extent *ordered; 1621 struct btrfs_ordered_extent *ordered;
@@ -1664,11 +1661,27 @@ static long btrfs_fallocate(struct file *file, int mode,
1664 if (em->block_start == EXTENT_MAP_HOLE || 1661 if (em->block_start == EXTENT_MAP_HOLE ||
1665 (cur_offset >= inode->i_size && 1662 (cur_offset >= inode->i_size &&
1666 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1663 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1664
1665 /*
1666 * Make sure we have enough space before we do the
1667 * allocation.
1668 */
1669 ret = btrfs_check_data_free_space(inode, last_byte -
1670 cur_offset);
1671 if (ret) {
1672 free_extent_map(em);
1673 break;
1674 }
1675
1667 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 1676 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1668 last_byte - cur_offset, 1677 last_byte - cur_offset,
1669 1 << inode->i_blkbits, 1678 1 << inode->i_blkbits,
1670 offset + len, 1679 offset + len,
1671 &alloc_hint); 1680 &alloc_hint);
1681
1682 /* Let go of our reservation. */
1683 btrfs_free_reserved_data_space(inode, last_byte -
1684 cur_offset);
1672 if (ret < 0) { 1685 if (ret < 0) {
1673 free_extent_map(em); 1686 free_extent_map(em);
1674 break; 1687 break;
@@ -1694,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1694 } 1707 }
1695 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 1708 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1696 &cached_state, GFP_NOFS); 1709 &cached_state, GFP_NOFS);
1697
1698 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1699out: 1710out:
1700 mutex_unlock(&inode->i_mutex); 1711 mutex_unlock(&inode->i_mutex);
1701 return ret; 1712 return ret;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d0..7a15fcfb3e1f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/math64.h> 22#include <linux/math64.h>
23#include <linux/ratelimit.h>
23#include "ctree.h" 24#include "ctree.h"
24#include "free-space-cache.h" 25#include "free-space-cache.h"
25#include "transaction.h" 26#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
84 *block_group, struct btrfs_path *path) 85 *block_group, struct btrfs_path *path)
85{ 86{
86 struct inode *inode = NULL; 87 struct inode *inode = NULL;
88 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
87 89
88 spin_lock(&block_group->lock); 90 spin_lock(&block_group->lock);
89 if (block_group->inode) 91 if (block_group->inode)
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 100 return inode;
99 101
100 spin_lock(&block_group->lock); 102 spin_lock(&block_group->lock);
101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { 103 if (!((BTRFS_I(inode)->flags & flags) == flags)) {
102 printk(KERN_INFO "Old style space inode found, converting.\n"); 104 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; 105 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
106 BTRFS_INODE_NODATACOW;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR; 107 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 } 108 }
106 109
107 if (!btrfs_fs_closing(root->fs_info)) { 110 if (!block_group->iref) {
108 block_group->inode = igrab(inode); 111 block_group->inode = igrab(inode);
109 block_group->iref = 1; 112 block_group->iref = 1;
110 } 113 }
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
122 struct btrfs_free_space_header *header; 125 struct btrfs_free_space_header *header;
123 struct btrfs_inode_item *inode_item; 126 struct btrfs_inode_item *inode_item;
124 struct extent_buffer *leaf; 127 struct extent_buffer *leaf;
128 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
125 int ret; 129 int ret;
126 130
127 ret = btrfs_insert_empty_inode(trans, root, path, ino); 131 ret = btrfs_insert_empty_inode(trans, root, path, ino);
128 if (ret) 132 if (ret)
129 return ret; 133 return ret;
130 134
135 /* We inline crc's for the free disk space cache */
136 if (ino != BTRFS_FREE_INO_OBJECTID)
137 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
138
131 leaf = path->nodes[0]; 139 leaf = path->nodes[0];
132 inode_item = btrfs_item_ptr(leaf, path->slots[0], 140 inode_item = btrfs_item_ptr(leaf, path->slots[0],
133 struct btrfs_inode_item); 141 struct btrfs_inode_item);
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
140 btrfs_set_inode_uid(leaf, inode_item, 0); 148 btrfs_set_inode_uid(leaf, inode_item, 0);
141 btrfs_set_inode_gid(leaf, inode_item, 0); 149 btrfs_set_inode_gid(leaf, inode_item, 0);
142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 150 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 151 btrfs_set_inode_flags(leaf, inode_item, flags);
144 BTRFS_INODE_PREALLOC);
145 btrfs_set_inode_nlink(leaf, inode_item, 1); 152 btrfs_set_inode_nlink(leaf, inode_item, 1);
146 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 153 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
147 btrfs_set_inode_block_group(leaf, inode_item, offset); 154 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
191 struct inode *inode) 198 struct inode *inode)
192{ 199{
193 struct btrfs_block_rsv *rsv; 200 struct btrfs_block_rsv *rsv;
201 u64 needed_bytes;
194 loff_t oldsize; 202 loff_t oldsize;
195 int ret = 0; 203 int ret = 0;
196 204
197 rsv = trans->block_rsv; 205 rsv = trans->block_rsv;
198 trans->block_rsv = root->orphan_block_rsv; 206 trans->block_rsv = &root->fs_info->global_block_rsv;
199 ret = btrfs_block_rsv_check(trans, root, 207
200 root->orphan_block_rsv, 208 /* 1 for slack space, 1 for updating the inode */
201 0, 5); 209 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
202 if (ret) 210 btrfs_calc_trans_metadata_size(root, 1);
203 return ret; 211
212 spin_lock(&trans->block_rsv->lock);
213 if (trans->block_rsv->reserved < needed_bytes) {
214 spin_unlock(&trans->block_rsv->lock);
215 trans->block_rsv = rsv;
216 return -ENOSPC;
217 }
218 spin_unlock(&trans->block_rsv->lock);
204 219
205 oldsize = i_size_read(inode); 220 oldsize = i_size_read(inode);
206 btrfs_i_size_write(inode, 0); 221 btrfs_i_size_write(inode, 0);
@@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
213 ret = btrfs_truncate_inode_items(trans, root, inode, 228 ret = btrfs_truncate_inode_items(trans, root, inode,
214 0, BTRFS_EXTENT_DATA_KEY); 229 0, BTRFS_EXTENT_DATA_KEY);
215 230
216 trans->block_rsv = rsv;
217 if (ret) { 231 if (ret) {
232 trans->block_rsv = rsv;
218 WARN_ON(1); 233 WARN_ON(1);
219 return ret; 234 return ret;
220 } 235 }
221 236
222 ret = btrfs_update_inode(trans, root, inode); 237 ret = btrfs_update_inode(trans, root, inode);
238 trans->block_rsv = rsv;
239
223 return ret; 240 return ret;
224} 241}
225 242
@@ -242,26 +259,342 @@ static int readahead_cache(struct inode *inode)
242 return 0; 259 return 0;
243} 260}
244 261
262struct io_ctl {
263 void *cur, *orig;
264 struct page *page;
265 struct page **pages;
266 struct btrfs_root *root;
267 unsigned long size;
268 int index;
269 int num_pages;
270 unsigned check_crcs:1;
271};
272
273static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
274 struct btrfs_root *root)
275{
276 memset(io_ctl, 0, sizeof(struct io_ctl));
277 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
278 PAGE_CACHE_SHIFT;
279 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
280 GFP_NOFS);
281 if (!io_ctl->pages)
282 return -ENOMEM;
283 io_ctl->root = root;
284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
285 io_ctl->check_crcs = 1;
286 return 0;
287}
288
289static void io_ctl_free(struct io_ctl *io_ctl)
290{
291 kfree(io_ctl->pages);
292}
293
294static void io_ctl_unmap_page(struct io_ctl *io_ctl)
295{
296 if (io_ctl->cur) {
297 kunmap(io_ctl->page);
298 io_ctl->cur = NULL;
299 io_ctl->orig = NULL;
300 }
301}
302
303static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
304{
305 WARN_ON(io_ctl->cur);
306 BUG_ON(io_ctl->index >= io_ctl->num_pages);
307 io_ctl->page = io_ctl->pages[io_ctl->index++];
308 io_ctl->cur = kmap(io_ctl->page);
309 io_ctl->orig = io_ctl->cur;
310 io_ctl->size = PAGE_CACHE_SIZE;
311 if (clear)
312 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
313}
314
315static void io_ctl_drop_pages(struct io_ctl *io_ctl)
316{
317 int i;
318
319 io_ctl_unmap_page(io_ctl);
320
321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]);
323 unlock_page(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]);
325 }
326}
327
328static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
329 int uptodate)
330{
331 struct page *page;
332 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
333 int i;
334
335 for (i = 0; i < io_ctl->num_pages; i++) {
336 page = find_or_create_page(inode->i_mapping, i, mask);
337 if (!page) {
338 io_ctl_drop_pages(io_ctl);
339 return -ENOMEM;
340 }
341 io_ctl->pages[i] = page;
342 if (uptodate && !PageUptodate(page)) {
343 btrfs_readpage(NULL, page);
344 lock_page(page);
345 if (!PageUptodate(page)) {
346 printk(KERN_ERR "btrfs: error reading free "
347 "space cache\n");
348 io_ctl_drop_pages(io_ctl);
349 return -EIO;
350 }
351 }
352 }
353
354 return 0;
355}
356
357static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
358{
359 u64 *val;
360
361 io_ctl_map_page(io_ctl, 1);
362
363 /*
364 * Skip the csum areas. If we don't check crcs then we just have a
365 * 64bit chunk at the front of the first page.
366 */
367 if (io_ctl->check_crcs) {
368 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
369 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
370 } else {
371 io_ctl->cur += sizeof(u64);
372 io_ctl->size -= sizeof(u64) * 2;
373 }
374
375 val = io_ctl->cur;
376 *val = cpu_to_le64(generation);
377 io_ctl->cur += sizeof(u64);
378}
379
380static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
381{
382 u64 *gen;
383
384 /*
385 * Skip the crc area. If we don't check crcs then we just have a 64bit
386 * chunk at the front of the first page.
387 */
388 if (io_ctl->check_crcs) {
389 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
390 io_ctl->size -= sizeof(u64) +
391 (sizeof(u32) * io_ctl->num_pages);
392 } else {
393 io_ctl->cur += sizeof(u64);
394 io_ctl->size -= sizeof(u64) * 2;
395 }
396
397 gen = io_ctl->cur;
398 if (le64_to_cpu(*gen) != generation) {
399 printk_ratelimited(KERN_ERR "btrfs: space cache generation "
400 "(%Lu) does not match inode (%Lu)\n", *gen,
401 generation);
402 io_ctl_unmap_page(io_ctl);
403 return -EIO;
404 }
405 io_ctl->cur += sizeof(u64);
406 return 0;
407}
408
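/*
 * [Editor's worked example, not part of this patch] Page 0 layout implied
 * by io_ctl_set_generation()/io_ctl_check_generation() above: with crcs
 * enabled, an N-page cache reserves N * sizeof(u32) bytes for the per-page
 * crc array, then the u64 generation -- e.g. 5 pages give 5*4 + 8 = 28
 * bytes of header before the first free space entry. Without crcs the
 * header is just two u64s (a skipped 8-byte slot followed by the
 * generation), 16 bytes in total. Helper name is hypothetical.
 */
static unsigned long io_ctl_header_size(int num_pages, int check_crcs)
{
	if (check_crcs)		/* crc array + generation */
		return sizeof(u32) * num_pages + sizeof(u64);
	return sizeof(u64) * 2;	/* skipped slot + generation */
}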
409static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
410{
411 u32 *tmp;
412 u32 crc = ~(u32)0;
413 unsigned offset = 0;
414
415 if (!io_ctl->check_crcs) {
416 io_ctl_unmap_page(io_ctl);
417 return;
418 }
419
420 if (index == 0)
421		offset = sizeof(u32) * io_ctl->num_pages;
422
423 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
424 PAGE_CACHE_SIZE - offset);
425 btrfs_csum_final(crc, (char *)&crc);
426 io_ctl_unmap_page(io_ctl);
427 tmp = kmap(io_ctl->pages[0]);
428 tmp += index;
429 *tmp = crc;
430 kunmap(io_ctl->pages[0]);
431}
432
433static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
434{
435 u32 *tmp, val;
436 u32 crc = ~(u32)0;
437 unsigned offset = 0;
438
439 if (!io_ctl->check_crcs) {
440 io_ctl_map_page(io_ctl, 0);
441 return 0;
442 }
443
444 if (index == 0)
445 offset = sizeof(u32) * io_ctl->num_pages;
446
447 tmp = kmap(io_ctl->pages[0]);
448 tmp += index;
449 val = *tmp;
450 kunmap(io_ctl->pages[0]);
451
452 io_ctl_map_page(io_ctl, 0);
453 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
454 PAGE_CACHE_SIZE - offset);
455 btrfs_csum_final(crc, (char *)&crc);
456 if (val != crc) {
457 printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
458 "space cache\n");
459 io_ctl_unmap_page(io_ctl);
460 return -EIO;
461 }
462
463 return 0;
464}
465
466static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
467 void *bitmap)
468{
469 struct btrfs_free_space_entry *entry;
470
471 if (!io_ctl->cur)
472 return -ENOSPC;
473
474 entry = io_ctl->cur;
475 entry->offset = cpu_to_le64(offset);
476 entry->bytes = cpu_to_le64(bytes);
477 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
478 BTRFS_FREE_SPACE_EXTENT;
479 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
480 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
481
482 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
483 return 0;
484
485 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
486
487 /* No more pages to map */
488 if (io_ctl->index >= io_ctl->num_pages)
489 return 0;
490
491 /* map the next page */
492 io_ctl_map_page(io_ctl, 1);
493 return 0;
494}
495
496static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
497{
498 if (!io_ctl->cur)
499 return -ENOSPC;
500
501 /*
502 * If we aren't at the start of the current page, unmap this one and
503 * map the next one if there is any left.
504 */
505 if (io_ctl->cur != io_ctl->orig) {
506 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
507 if (io_ctl->index >= io_ctl->num_pages)
508 return -ENOSPC;
509 io_ctl_map_page(io_ctl, 0);
510 }
511
512 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
513 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
514 if (io_ctl->index < io_ctl->num_pages)
515 io_ctl_map_page(io_ctl, 0);
516 return 0;
517}
518
519static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
520{
521 /*
522 * If we're not on the boundary we know we've modified the page and we
523 * need to crc the page.
524 */
525 if (io_ctl->cur != io_ctl->orig)
526 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
527 else
528 io_ctl_unmap_page(io_ctl);
529
530 while (io_ctl->index < io_ctl->num_pages) {
531 io_ctl_map_page(io_ctl, 1);
532 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
533 }
534}
535
536static int io_ctl_read_entry(struct io_ctl *io_ctl,
537 struct btrfs_free_space *entry, u8 *type)
538{
539 struct btrfs_free_space_entry *e;
540
541 e = io_ctl->cur;
542 entry->offset = le64_to_cpu(e->offset);
543 entry->bytes = le64_to_cpu(e->bytes);
544 *type = e->type;
545 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
546 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
547
548 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
549 return 0;
550
551 io_ctl_unmap_page(io_ctl);
552
553 if (io_ctl->index >= io_ctl->num_pages)
554 return 0;
555
556 return io_ctl_check_crc(io_ctl, io_ctl->index);
557}
558
559static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
560 struct btrfs_free_space *entry)
561{
562 int ret;
563
564 if (io_ctl->cur && io_ctl->cur != io_ctl->orig)
565 io_ctl_unmap_page(io_ctl);
566
567 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
568 if (ret)
569 return ret;
570
571 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
572 io_ctl_unmap_page(io_ctl);
573
574 return 0;
575}
576
245int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 577int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
246 struct btrfs_free_space_ctl *ctl, 578 struct btrfs_free_space_ctl *ctl,
247 struct btrfs_path *path, u64 offset) 579 struct btrfs_path *path, u64 offset)
248{ 580{
249 struct btrfs_free_space_header *header; 581 struct btrfs_free_space_header *header;
250 struct extent_buffer *leaf; 582 struct extent_buffer *leaf;
251 struct page *page; 583 struct io_ctl io_ctl;
252 struct btrfs_key key; 584 struct btrfs_key key;
585 struct btrfs_free_space *e, *n;
253 struct list_head bitmaps; 586 struct list_head bitmaps;
254 u64 num_entries; 587 u64 num_entries;
255 u64 num_bitmaps; 588 u64 num_bitmaps;
256 u64 generation; 589 u64 generation;
257 pgoff_t index = 0; 590 u8 type;
258 int ret = 0; 591 int ret = 0;
259 592
260 INIT_LIST_HEAD(&bitmaps); 593 INIT_LIST_HEAD(&bitmaps);
261 594
262 /* Nothing in the space cache, goodbye */ 595 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) 596 if (!i_size_read(inode))
264 goto out; 597 return 0;
265 598
266 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 599 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
267 key.offset = offset; 600 key.offset = offset;
@@ -269,11 +602,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
269 602
270 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 603 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
271 if (ret < 0) 604 if (ret < 0)
272 goto out; 605 return 0;
273 else if (ret > 0) { 606 else if (ret > 0) {
274 btrfs_release_path(path); 607 btrfs_release_path(path);
275 ret = 0; 608 return 0;
276 goto out;
277 } 609 }
278 610
279 ret = -1; 611 ret = -1;
@@ -291,169 +623,100 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
291 " not match free space cache generation (%llu)\n", 623 " not match free space cache generation (%llu)\n",
292 (unsigned long long)BTRFS_I(inode)->generation, 624 (unsigned long long)BTRFS_I(inode)->generation,
293 (unsigned long long)generation); 625 (unsigned long long)generation);
294 goto out; 626 return 0;
295 } 627 }
296 628
297 if (!num_entries) 629 if (!num_entries)
298 goto out; 630 return 0;
299 631
632 io_ctl_init(&io_ctl, inode, root);
300 ret = readahead_cache(inode); 633 ret = readahead_cache(inode);
301 if (ret) 634 if (ret)
302 goto out; 635 goto out;
303 636
304 while (1) { 637 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
305 struct btrfs_free_space_entry *entry; 638 if (ret)
306 struct btrfs_free_space *e; 639 goto out;
307 void *addr;
308 unsigned long offset = 0;
309 int need_loop = 0;
310 640
311 if (!num_entries && !num_bitmaps) 641 ret = io_ctl_check_crc(&io_ctl, 0);
312 break; 642 if (ret)
643 goto free_cache;
313 644
314 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 645 ret = io_ctl_check_generation(&io_ctl, generation);
315 if (!page) 646 if (ret)
647 goto free_cache;
648
649 while (num_entries) {
650 e = kmem_cache_zalloc(btrfs_free_space_cachep,
651 GFP_NOFS);
652 if (!e)
316 goto free_cache; 653 goto free_cache;
317 654
318 if (!PageUptodate(page)) { 655 ret = io_ctl_read_entry(&io_ctl, e, &type);
319 btrfs_readpage(NULL, page); 656 if (ret) {
320 lock_page(page); 657 kmem_cache_free(btrfs_free_space_cachep, e);
321 if (!PageUptodate(page)) { 658 goto free_cache;
322 unlock_page(page);
323 page_cache_release(page);
324 printk(KERN_ERR "btrfs: error reading free "
325 "space cache\n");
326 goto free_cache;
327 }
328 } 659 }
329 addr = kmap(page);
330 660
331 if (index == 0) { 661 if (!e->bytes) {
332 u64 *gen; 662 kmem_cache_free(btrfs_free_space_cachep, e);
663 goto free_cache;
664 }
333 665
334 /* 666 if (type == BTRFS_FREE_SPACE_EXTENT) {
335 * We put a bogus crc in the front of the first page in 667 spin_lock(&ctl->tree_lock);
336 * case old kernels try to mount a fs with the new 668 ret = link_free_space(ctl, e);
337 * format to make sure they discard the cache. 669 spin_unlock(&ctl->tree_lock);
338 */ 670 if (ret) {
339 addr += sizeof(u64); 671 printk(KERN_ERR "Duplicate entries in "
340 offset += sizeof(u64); 672 "free space cache, dumping\n");
341 673 kmem_cache_free(btrfs_free_space_cachep, e);
342 gen = addr;
343 if (*gen != BTRFS_I(inode)->generation) {
344 printk(KERN_ERR "btrfs: space cache generation"
345 " (%llu) does not match inode (%llu)\n",
346 (unsigned long long)*gen,
347 (unsigned long long)
348 BTRFS_I(inode)->generation);
349 kunmap(page);
350 unlock_page(page);
351 page_cache_release(page);
352 goto free_cache; 674 goto free_cache;
353 } 675 }
354 addr += sizeof(u64); 676 } else {
355 offset += sizeof(u64); 677 BUG_ON(!num_bitmaps);
356 } 678 num_bitmaps--;
357 entry = addr; 679 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
358 680 if (!e->bitmap) {
359 while (1) { 681 kmem_cache_free(
360 if (!num_entries) 682 btrfs_free_space_cachep, e);
361 break;
362
363 need_loop = 1;
364 e = kmem_cache_zalloc(btrfs_free_space_cachep,
365 GFP_NOFS);
366 if (!e) {
367 kunmap(page);
368 unlock_page(page);
369 page_cache_release(page);
370 goto free_cache; 683 goto free_cache;
371 } 684 }
372 685 spin_lock(&ctl->tree_lock);
373 e->offset = le64_to_cpu(entry->offset); 686 ret = link_free_space(ctl, e);
374 e->bytes = le64_to_cpu(entry->bytes); 687 ctl->total_bitmaps++;
375 if (!e->bytes) { 688 ctl->op->recalc_thresholds(ctl);
376 kunmap(page); 689 spin_unlock(&ctl->tree_lock);
690 if (ret) {
691 printk(KERN_ERR "Duplicate entries in "
692 "free space cache, dumping\n");
377 kmem_cache_free(btrfs_free_space_cachep, e); 693 kmem_cache_free(btrfs_free_space_cachep, e);
378 unlock_page(page);
379 page_cache_release(page);
380 goto free_cache; 694 goto free_cache;
381 } 695 }
382 696 list_add_tail(&e->list, &bitmaps);
383 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
384 spin_lock(&ctl->tree_lock);
385 ret = link_free_space(ctl, e);
386 spin_unlock(&ctl->tree_lock);
387 if (ret) {
388 printk(KERN_ERR "Duplicate entries in "
389 "free space cache, dumping\n");
390 kunmap(page);
391 unlock_page(page);
392 page_cache_release(page);
393 goto free_cache;
394 }
395 } else {
396 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
397 if (!e->bitmap) {
398 kunmap(page);
399 kmem_cache_free(
400 btrfs_free_space_cachep, e);
401 unlock_page(page);
402 page_cache_release(page);
403 goto free_cache;
404 }
405 spin_lock(&ctl->tree_lock);
406 ret = link_free_space(ctl, e);
407 ctl->total_bitmaps++;
408 ctl->op->recalc_thresholds(ctl);
409 spin_unlock(&ctl->tree_lock);
410 if (ret) {
411 printk(KERN_ERR "Duplicate entries in "
412 "free space cache, dumping\n");
413 kunmap(page);
414 unlock_page(page);
415 page_cache_release(page);
416 goto free_cache;
417 }
418 list_add_tail(&e->list, &bitmaps);
419 }
420
421 num_entries--;
422 offset += sizeof(struct btrfs_free_space_entry);
423 if (offset + sizeof(struct btrfs_free_space_entry) >=
424 PAGE_CACHE_SIZE)
425 break;
426 entry++;
427 } 697 }
428 698
429 /* 699 num_entries--;
430 * We read an entry out of this page, we need to move on to the 700 }
431 * next page.
432 */
433 if (need_loop) {
434 kunmap(page);
435 goto next;
436 }
437 701
438 /* 702 /*
439 * We add the bitmaps at the end of the entries in order that 703 * We add the bitmaps at the end of the entries in order that
440 * the bitmap entries are added to the cache. 704 * the bitmap entries are added to the cache.
441 */ 705 */
442 e = list_entry(bitmaps.next, struct btrfs_free_space, list); 706 list_for_each_entry_safe(e, n, &bitmaps, list) {
443 list_del_init(&e->list); 707 list_del_init(&e->list);
444 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 708 ret = io_ctl_read_bitmap(&io_ctl, e);
445 kunmap(page); 709 if (ret)
446 num_bitmaps--; 710 goto free_cache;
447next:
448 unlock_page(page);
449 page_cache_release(page);
450 index++;
451 } 711 }
452 712
713 io_ctl_drop_pages(&io_ctl);
453 ret = 1; 714 ret = 1;
454out: 715out:
716 io_ctl_free(&io_ctl);
455 return ret; 717 return ret;
456free_cache: 718free_cache:
719 io_ctl_drop_pages(&io_ctl);
457 __btrfs_remove_free_space_cache(ctl); 720 __btrfs_remove_free_space_cache(ctl);
458 goto out; 721 goto out;
459} 722}
@@ -465,7 +728,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
465 struct btrfs_root *root = fs_info->tree_root; 728 struct btrfs_root *root = fs_info->tree_root;
466 struct inode *inode; 729 struct inode *inode;
467 struct btrfs_path *path; 730 struct btrfs_path *path;
468 int ret; 731 int ret = 0;
469 bool matched; 732 bool matched;
470 u64 used = btrfs_block_group_used(&block_group->item); 733 u64 used = btrfs_block_group_used(&block_group->item);
471 734
@@ -497,6 +760,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
497 return 0; 760 return 0;
498 } 761 }
499 762
763 /* We may have converted the inode and made the cache invalid. */
764 spin_lock(&block_group->lock);
765 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
766 spin_unlock(&block_group->lock);
767 goto out;
768 }
769 spin_unlock(&block_group->lock);
770
500 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 771 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
501 path, block_group->key.objectid); 772 path, block_group->key.objectid);
502 btrfs_free_path(path); 773 btrfs_free_path(path);
@@ -530,6 +801,19 @@ out:
530 return ret; 801 return ret;
531} 802}
532 803
804/**
805 * __btrfs_write_out_cache - write out cached info to an inode
806 * @root - the root the inode belongs to
807 * @ctl - the free space cache we are going to write out
808 * @block_group - the block_group for this cache if it belongs to a block_group
809 * @trans - the trans handle
810 * @path - the path to use
811 * @offset - the offset for the key we'll insert
812 *
813 * This function writes out a free space cache struct to disk for quick recovery
814 * on mount. This will return 0 if it was successfull in writing the cache out,
815 * and -1 if it was not.
816 */
533int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 817int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
534 struct btrfs_free_space_ctl *ctl, 818 struct btrfs_free_space_ctl *ctl,
535 struct btrfs_block_group_cache *block_group, 819 struct btrfs_block_group_cache *block_group,
@@ -540,42 +824,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
540 struct extent_buffer *leaf; 824 struct extent_buffer *leaf;
541 struct rb_node *node; 825 struct rb_node *node;
542 struct list_head *pos, *n; 826 struct list_head *pos, *n;
543 struct page **pages;
544 struct page *page;
545 struct extent_state *cached_state = NULL; 827 struct extent_state *cached_state = NULL;
546 struct btrfs_free_cluster *cluster = NULL; 828 struct btrfs_free_cluster *cluster = NULL;
547 struct extent_io_tree *unpin = NULL; 829 struct extent_io_tree *unpin = NULL;
830 struct io_ctl io_ctl;
548 struct list_head bitmap_list; 831 struct list_head bitmap_list;
549 struct btrfs_key key; 832 struct btrfs_key key;
550 u64 start, end, len; 833 u64 start, end, len;
551 u64 bytes = 0;
552 u32 crc = ~(u32)0;
553 int index = 0, num_pages = 0;
554 int entries = 0; 834 int entries = 0;
555 int bitmaps = 0; 835 int bitmaps = 0;
556 int ret = -1; 836 int ret;
557 bool next_page = false; 837 int err = -1;
558 bool out_of_space = false;
559 838
560 INIT_LIST_HEAD(&bitmap_list); 839 INIT_LIST_HEAD(&bitmap_list);
561 840
562 node = rb_first(&ctl->free_space_offset);
563 if (!node)
564 return 0;
565
566 if (!i_size_read(inode)) 841 if (!i_size_read(inode))
567 return -1; 842 return -1;
568 843
569 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 844 io_ctl_init(&io_ctl, inode, root);
570 PAGE_CACHE_SHIFT;
571
572 filemap_write_and_wait(inode->i_mapping);
573 btrfs_wait_ordered_range(inode, inode->i_size &
574 ~(root->sectorsize - 1), (u64)-1);
575
576 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
577 if (!pages)
578 return -1;
579 845
580 /* Get the cluster for this block_group if it exists */ 846 /* Get the cluster for this block_group if it exists */
581 if (block_group && !list_empty(&block_group->cluster_list)) 847 if (block_group && !list_empty(&block_group->cluster_list))
@@ -589,30 +855,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
589 */ 855 */
590 unpin = root->fs_info->pinned_extents; 856 unpin = root->fs_info->pinned_extents;
591 857
592 /* 858 /* Lock all pages first so we can lock the extent safely. */
593 * Lock all pages first so we can lock the extent safely. 859 io_ctl_prepare_pages(&io_ctl, inode, 0);
594 *
595 * NOTE: Because we hold the ref the entire time we're going to write to
596 * the page find_get_page should never fail, so we don't do a check
597 * after find_get_page at this point. Just putting this here so people
598 * know and don't freak out.
599 */
600 while (index < num_pages) {
601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
602 if (!page) {
603 int i;
604
605 for (i = 0; i < num_pages; i++) {
606 unlock_page(pages[i]);
607 page_cache_release(pages[i]);
608 }
609 goto out;
610 }
611 pages[index] = page;
612 index++;
613 }
614 860
615 index = 0;
616 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 861 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
617 0, &cached_state, GFP_NOFS); 862 0, &cached_state, GFP_NOFS);
618 863
@@ -623,189 +868,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
623 if (block_group) 868 if (block_group)
624 start = block_group->key.objectid; 869 start = block_group->key.objectid;
625 870
626 /* Write out the extent entries */ 871 node = rb_first(&ctl->free_space_offset);
627 do { 872 if (!node && cluster) {
628 struct btrfs_free_space_entry *entry; 873 node = rb_first(&cluster->root);
629 void *addr, *orig; 874 cluster = NULL;
630 unsigned long offset = 0; 875 }
631 876
632 next_page = false; 877 /* Make sure we can fit our crcs into the first page */
878 if (io_ctl.check_crcs &&
879 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
880 WARN_ON(1);
881 goto out_nospc;
882 }
633 883
634 if (index >= num_pages) { 884 io_ctl_set_generation(&io_ctl, trans->transid);
635 out_of_space = true;
636 break;
637 }
638 885
639 page = pages[index]; 886 /* Write out the extent entries */
887 while (node) {
888 struct btrfs_free_space *e;
640 889
641 orig = addr = kmap(page); 890 e = rb_entry(node, struct btrfs_free_space, offset_index);
642 if (index == 0) { 891 entries++;
643 u64 *gen;
644 892
645 /* 893 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
646 * We're going to put in a bogus crc for this page to 894 e->bitmap);
647 * make sure that old kernels who aren't aware of this 895 if (ret)
648 * format will be sure to discard the cache. 896 goto out_nospc;
649 */
650 addr += sizeof(u64);
651 offset += sizeof(u64);
652 897
653 gen = addr; 898 if (e->bitmap) {
654 *gen = trans->transid; 899 list_add_tail(&e->list, &bitmap_list);
655 addr += sizeof(u64); 900 bitmaps++;
656 offset += sizeof(u64);
657 } 901 }
658 entry = addr; 902 node = rb_next(node);
659 903 if (!node && cluster) {
660 memset(addr, 0, PAGE_CACHE_SIZE - offset); 904 node = rb_first(&cluster->root);
661 while (node && !next_page) { 905 cluster = NULL;
662 struct btrfs_free_space *e;
663
664 e = rb_entry(node, struct btrfs_free_space, offset_index);
665 entries++;
666
667 entry->offset = cpu_to_le64(e->offset);
668 entry->bytes = cpu_to_le64(e->bytes);
669 if (e->bitmap) {
670 entry->type = BTRFS_FREE_SPACE_BITMAP;
671 list_add_tail(&e->list, &bitmap_list);
672 bitmaps++;
673 } else {
674 entry->type = BTRFS_FREE_SPACE_EXTENT;
675 }
676 node = rb_next(node);
677 if (!node && cluster) {
678 node = rb_first(&cluster->root);
679 cluster = NULL;
680 }
681 offset += sizeof(struct btrfs_free_space_entry);
682 if (offset + sizeof(struct btrfs_free_space_entry) >=
683 PAGE_CACHE_SIZE)
684 next_page = true;
685 entry++;
686 } 906 }
907 }
687 908
688 /* 909 /*
689 * We want to add any pinned extents to our free space cache 910 * We want to add any pinned extents to our free space cache
690 * so we don't leak the space 911 * so we don't leak the space
691 */ 912 */
692 while (block_group && !next_page && 913 while (block_group && (start < block_group->key.objectid +
693 (start < block_group->key.objectid + 914 block_group->key.offset)) {
694 block_group->key.offset)) { 915 ret = find_first_extent_bit(unpin, start, &start, &end,
695 ret = find_first_extent_bit(unpin, start, &start, &end, 916 EXTENT_DIRTY);
696 EXTENT_DIRTY); 917 if (ret) {
697 if (ret) { 918 ret = 0;
698 ret = 0; 919 break;
699 break;
700 }
701
702 /* This pinned extent is out of our range */
703 if (start >= block_group->key.objectid +
704 block_group->key.offset)
705 break;
706
707 len = block_group->key.objectid +
708 block_group->key.offset - start;
709 len = min(len, end + 1 - start);
710
711 entries++;
712 entry->offset = cpu_to_le64(start);
713 entry->bytes = cpu_to_le64(len);
714 entry->type = BTRFS_FREE_SPACE_EXTENT;
715
716 start = end + 1;
717 offset += sizeof(struct btrfs_free_space_entry);
718 if (offset + sizeof(struct btrfs_free_space_entry) >=
719 PAGE_CACHE_SIZE)
720 next_page = true;
721 entry++;
722 } 920 }
723 921
724 /* Generate bogus crc value */ 922 /* This pinned extent is out of our range */
725 if (index == 0) { 923 if (start >= block_group->key.objectid +
726 u32 *tmp; 924 block_group->key.offset)
727 crc = btrfs_csum_data(root, orig + sizeof(u64), crc, 925 break;
728 PAGE_CACHE_SIZE - sizeof(u64));
729 btrfs_csum_final(crc, (char *)&crc);
730 crc++;
731 tmp = orig;
732 *tmp = crc;
733 }
734 926
735 kunmap(page); 927 len = block_group->key.objectid +
928 block_group->key.offset - start;
929 len = min(len, end + 1 - start);
736 930
737 bytes += PAGE_CACHE_SIZE; 931 entries++;
932 ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
933 if (ret)
934 goto out_nospc;
738 935
739 index++; 936 start = end + 1;
740 } while (node || next_page); 937 }
741 938
742 /* Write out the bitmaps */ 939 /* Write out the bitmaps */
743 list_for_each_safe(pos, n, &bitmap_list) { 940 list_for_each_safe(pos, n, &bitmap_list) {
744 void *addr;
745 struct btrfs_free_space *entry = 941 struct btrfs_free_space *entry =
746 list_entry(pos, struct btrfs_free_space, list); 942 list_entry(pos, struct btrfs_free_space, list);
747 943
748 if (index >= num_pages) { 944 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
749 out_of_space = true; 945 if (ret)
750 break; 946 goto out_nospc;
751 }
752 page = pages[index];
753
754 addr = kmap(page);
755 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
756 kunmap(page);
757 bytes += PAGE_CACHE_SIZE;
758
759 list_del_init(&entry->list); 947 list_del_init(&entry->list);
760 index++;
761 }
762
763 if (out_of_space) {
764 btrfs_drop_pages(pages, num_pages);
765 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
766 i_size_read(inode) - 1, &cached_state,
767 GFP_NOFS);
768 ret = 0;
769 goto out;
770 } 948 }
771 949
772 /* Zero out the rest of the pages just to make sure */ 950 /* Zero out the rest of the pages just to make sure */
773 while (index < num_pages) { 951 io_ctl_zero_remaining_pages(&io_ctl);
774 void *addr;
775
776 page = pages[index];
777 addr = kmap(page);
778 memset(addr, 0, PAGE_CACHE_SIZE);
779 kunmap(page);
780 bytes += PAGE_CACHE_SIZE;
781 index++;
782 }
783 952
784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, 953 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
785 bytes, &cached_state); 954 0, i_size_read(inode), &cached_state);
786 btrfs_drop_pages(pages, num_pages); 955 io_ctl_drop_pages(&io_ctl);
787 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 956 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
788 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 957 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
789 958
790 if (ret) { 959 if (ret)
791 ret = 0;
792 goto out; 960 goto out;
793 }
794 961
795 BTRFS_I(inode)->generation = trans->transid;
796 962
797 filemap_write_and_wait(inode->i_mapping); 963 ret = filemap_write_and_wait(inode->i_mapping);
964 if (ret)
965 goto out;
798 966
799 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 967 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
800 key.offset = offset; 968 key.offset = offset;
801 key.type = 0; 969 key.type = 0;
802 970
803 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 971 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
804 if (ret < 0) { 972 if (ret < 0) {
805 ret = -1; 973 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 974 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
807 EXTENT_DIRTY | EXTENT_DELALLOC | 975 GFP_NOFS);
808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
809 goto out; 976 goto out;
810 } 977 }
811 leaf = path->nodes[0]; 978 leaf = path->nodes[0];
@@ -816,15 +983,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
816 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 983 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
817 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 984 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
818 found_key.offset != offset) { 985 found_key.offset != offset) {
819 ret = -1; 986 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
820 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 987 inode->i_size - 1,
821 EXTENT_DIRTY | EXTENT_DELALLOC | 988 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
822 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 989 NULL, GFP_NOFS);
823 GFP_NOFS);
824 btrfs_release_path(path); 990 btrfs_release_path(path);
825 goto out; 991 goto out;
826 } 992 }
827 } 993 }
994
995 BTRFS_I(inode)->generation = trans->transid;
828 header = btrfs_item_ptr(leaf, path->slots[0], 996 header = btrfs_item_ptr(leaf, path->slots[0],
829 struct btrfs_free_space_header); 997 struct btrfs_free_space_header);
830 btrfs_set_free_space_entries(leaf, header, entries); 998 btrfs_set_free_space_entries(leaf, header, entries);
@@ -833,16 +1001,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 btrfs_mark_buffer_dirty(leaf); 1001 btrfs_mark_buffer_dirty(leaf);
834 btrfs_release_path(path); 1002 btrfs_release_path(path);
835 1003
836 ret = 1; 1004 err = 0;
837
838out: 1005out:
839 kfree(pages); 1006 io_ctl_free(&io_ctl);
840 if (ret != 1) { 1007 if (err) {
841 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 1008 invalidate_inode_pages2(inode->i_mapping);
842 BTRFS_I(inode)->generation = 0; 1009 BTRFS_I(inode)->generation = 0;
843 } 1010 }
844 btrfs_update_inode(trans, root, inode); 1011 btrfs_update_inode(trans, root, inode);
845 return ret; 1012 return err;
1013
1014out_nospc:
1015 list_for_each_safe(pos, n, &bitmap_list) {
1016 struct btrfs_free_space *entry =
1017 list_entry(pos, struct btrfs_free_space, list);
1018 list_del_init(&entry->list);
1019 }
1020 io_ctl_drop_pages(&io_ctl);
1021 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1022 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1023 goto out;
846} 1024}
847 1025
848int btrfs_write_out_cache(struct btrfs_root *root, 1026int btrfs_write_out_cache(struct btrfs_root *root,
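The rewritten __btrfs_write_out_cache() above hands all page handling to the new io_ctl helpers (io_ctl_init, io_ctl_prepare_pages, io_ctl_add_entry, io_ctl_add_bitmap, io_ctl_zero_remaining_pages) and, when io_ctl.check_crcs is set, keeps one u32 checksum per cache page in the first page of the cache file; that is what the added "make sure we can fit our crcs into the first page" check enforces. Below is a minimal userspace sketch of that size constraint only -- the 4 KiB page size and the function name are assumptions for illustration, not btrfs code.

#include <stdint.h>
#include <stdio.h>

/*
 * With one u32 checksum per cache page stored in the first page, the check
 * above rejects num_pages * sizeof(u32) >= PAGE_CACHE_SIZE, so the page
 * count must stay strictly below page_size / sizeof(u32).
 */
static uint64_t max_cache_pages(uint64_t page_size)
{
        return page_size / sizeof(uint32_t) - 1;
}

int main(void)
{
        uint64_t page_size = 4096;                      /* assumed page size */
        uint64_t pages = max_cache_pages(page_size);

        printf("at most %llu pages (~%llu bytes) per checksummed cache file\n",
               (unsigned long long)pages,
               (unsigned long long)(pages * page_size));
        return 0;
}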
@@ -869,14 +1047,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
869 1047
870 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1048 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
871 path, block_group->key.objectid); 1049 path, block_group->key.objectid);
872 if (ret < 0) { 1050 if (ret) {
873 spin_lock(&block_group->lock); 1051 spin_lock(&block_group->lock);
874 block_group->disk_cache_state = BTRFS_DC_ERROR; 1052 block_group->disk_cache_state = BTRFS_DC_ERROR;
875 spin_unlock(&block_group->lock); 1053 spin_unlock(&block_group->lock);
876 ret = 0; 1054 ret = 0;
877 1055#ifdef DEBUG
878 printk(KERN_ERR "btrfs: failed to write free space cache " 1056 printk(KERN_ERR "btrfs: failed to write free space cache "
879 "for block group %llu\n", block_group->key.objectid); 1057 "for block group %llu\n", block_group->key.objectid);
1058#endif
880 } 1059 }
881 1060
882 iput(inode); 1061 iput(inode);
@@ -1701,6 +1880,7 @@ again:
1701 ctl->total_bitmaps--; 1880 ctl->total_bitmaps--;
1702 } 1881 }
1703 kmem_cache_free(btrfs_free_space_cachep, info); 1882 kmem_cache_free(btrfs_free_space_cachep, info);
1883 ret = 0;
1704 goto out_lock; 1884 goto out_lock;
1705 } 1885 }
1706 1886
@@ -1708,7 +1888,8 @@ again:
1708 unlink_free_space(ctl, info); 1888 unlink_free_space(ctl, info);
1709 info->offset += bytes; 1889 info->offset += bytes;
1710 info->bytes -= bytes; 1890 info->bytes -= bytes;
1711 link_free_space(ctl, info); 1891 ret = link_free_space(ctl, info);
1892 WARN_ON(ret);
1712 goto out_lock; 1893 goto out_lock;
1713 } 1894 }
1714 1895
@@ -2472,9 +2653,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2472 spin_unlock(&ctl->tree_lock); 2653 spin_unlock(&ctl->tree_lock);
2473 2654
2474 if (bytes >= minlen) { 2655 if (bytes >= minlen) {
2475 int update_ret; 2656 struct btrfs_space_info *space_info;
2476 update_ret = btrfs_update_reserved_bytes(block_group, 2657 int update = 0;
2477 bytes, 1, 1); 2658
2659 space_info = block_group->space_info;
2660 spin_lock(&space_info->lock);
2661 spin_lock(&block_group->lock);
2662 if (!block_group->ro) {
2663 block_group->reserved += bytes;
2664 space_info->bytes_reserved += bytes;
2665 update = 1;
2666 }
2667 spin_unlock(&block_group->lock);
2668 spin_unlock(&space_info->lock);
2478 2669
2479 ret = btrfs_error_discard_extent(fs_info->extent_root, 2670 ret = btrfs_error_discard_extent(fs_info->extent_root,
2480 start, 2671 start,
@@ -2482,9 +2673,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2482 &actually_trimmed); 2673 &actually_trimmed);
2483 2674
2484 btrfs_add_free_space(block_group, start, bytes); 2675 btrfs_add_free_space(block_group, start, bytes);
2485 if (!update_ret) 2676 if (update) {
2486 btrfs_update_reserved_bytes(block_group, 2677 spin_lock(&space_info->lock);
2487 bytes, 0, 1); 2678 spin_lock(&block_group->lock);
2679 if (block_group->ro)
2680 space_info->bytes_readonly += bytes;
2681 block_group->reserved -= bytes;
2682 space_info->bytes_reserved -= bytes;
2683 spin_unlock(&space_info->lock);
2684 spin_unlock(&block_group->lock);
2685 }
2488 2686
2489 if (ret) 2687 if (ret)
2490 break; 2688 break;
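btrfs_trim_block_group() now pins the range itself before discarding it: it bumps block_group->reserved and space_info->bytes_reserved under both locks (skipping read-only groups), and after the discard and btrfs_add_free_space() it drops that reservation again, crediting bytes_readonly if the group went read-only in the meantime. The following is a rough userspace sketch of that reserve/release pattern under the same lock order; the struct layout and helper names are made up, not btrfs's.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative structures; field names only loosely follow btrfs. */
struct space_info {
        pthread_mutex_t lock;
        long bytes_reserved;
        long bytes_readonly;
};

struct block_group {
        pthread_mutex_t lock;
        bool ro;
        long reserved;
        struct space_info *space_info;
};

/* Pin the range before discarding so the allocator cannot hand it out. */
static bool trim_reserve(struct block_group *bg, long bytes)
{
        bool reserved = false;

        pthread_mutex_lock(&bg->space_info->lock);
        pthread_mutex_lock(&bg->lock);
        if (!bg->ro) {                  /* read-only groups are left alone */
                bg->reserved += bytes;
                bg->space_info->bytes_reserved += bytes;
                reserved = true;
        }
        pthread_mutex_unlock(&bg->lock);
        pthread_mutex_unlock(&bg->space_info->lock);
        return reserved;
}

/* Drop the temporary pin once the discard and re-add are done. */
static void trim_release(struct block_group *bg, long bytes)
{
        pthread_mutex_lock(&bg->space_info->lock);
        pthread_mutex_lock(&bg->lock);
        if (bg->ro)                     /* group went read-only meanwhile */
                bg->space_info->bytes_readonly += bytes;
        bg->reserved -= bytes;
        bg->space_info->bytes_reserved -= bytes;
        pthread_mutex_unlock(&bg->lock);
        pthread_mutex_unlock(&bg->space_info->lock);
}

int main(void)
{
        static struct space_info si = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
        static struct block_group bg = { PTHREAD_MUTEX_INITIALIZER, false, 0, &si };

        if (trim_reserve(&bg, 4096)) {
                /* ...discard and re-adding the free space would happen here... */
                trim_release(&bg, 4096);
        }
        printf("bytes_reserved after trim: %ld\n", si.bytes_reserved);
        return 0;
}

Compile with -pthread; every successful trim_reserve() is paired with a trim_release(), which is what the hunks above do around btrfs_error_discard_extent().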
@@ -2643,9 +2841,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
2643 return 0; 2841 return 0;
2644 2842
2645 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 2843 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2646 if (ret < 0) 2844 if (ret) {
2845 btrfs_delalloc_release_metadata(inode, inode->i_size);
2846#ifdef DEBUG
2647 printk(KERN_ERR "btrfs: failed to write free ino cache " 2847 printk(KERN_ERR "btrfs: failed to write free ino cache "
2648 "for root %llu\n", root->root_key.objectid); 2848 "for root %llu\n", root->root_key.objectid);
2849#endif
2850 }
2649 2851
2650 iput(inode); 2852 iput(inode);
2651 return ret; 2853 return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa871..53dcbdf446cd 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -465,14 +465,16 @@ again:
465 /* Just to make sure we have enough space */ 465 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE; 466 prealloc += 8 * PAGE_CACHE_SIZE;
467 467
468 ret = btrfs_check_data_free_space(inode, prealloc); 468 ret = btrfs_delalloc_reserve_space(inode, prealloc);
469 if (ret) 469 if (ret)
470 goto out_put; 470 goto out_put;
471 471
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint); 473 prealloc, prealloc, &alloc_hint);
474 if (ret) 474 if (ret) {
475 btrfs_delalloc_release_space(inode, prealloc);
475 goto out_put; 476 goto out_put;
477 }
476 btrfs_free_reserved_data_space(inode, prealloc); 478 btrfs_free_reserved_data_space(inode, prealloc);
477 479
478out_put: 480out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 75686a61bd45..966ddcc4c63d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,10 +45,10 @@
45#include "btrfs_inode.h" 45#include "btrfs_inode.h"
46#include "ioctl.h" 46#include "ioctl.h"
47#include "print-tree.h" 47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h" 48#include "ordered-data.h"
50#include "xattr.h" 49#include "xattr.h"
51#include "tree-log.h" 50#include "tree-log.h"
51#include "volumes.h"
52#include "compression.h" 52#include "compression.h"
53#include "locking.h" 53#include "locking.h"
54#include "free-space-cache.h" 54#include "free-space-cache.h"
@@ -393,7 +393,10 @@ again:
393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
394 WARN_ON(pages); 394 WARN_ON(pages);
395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
396 BUG_ON(!pages); 396 if (!pages) {
397 /* just bail out to the uncompressed code */
398 goto cont;
399 }
397 400
398 if (BTRFS_I(inode)->force_compress) 401 if (BTRFS_I(inode)->force_compress)
399 compress_type = BTRFS_I(inode)->force_compress; 402 compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +427,7 @@ again:
424 will_compress = 1; 427 will_compress = 1;
425 } 428 }
426 } 429 }
430cont:
427 if (start == 0) { 431 if (start == 0) {
428 trans = btrfs_join_transaction(root); 432 trans = btrfs_join_transaction(root);
429 BUG_ON(IS_ERR(trans)); 433 BUG_ON(IS_ERR(trans));
@@ -820,7 +824,7 @@ static noinline int cow_file_range(struct inode *inode,
820 } 824 }
821 825
822 BUG_ON(disk_num_bytes > 826 BUG_ON(disk_num_bytes >
823 btrfs_super_total_bytes(&root->fs_info->super_copy)); 827 btrfs_super_total_bytes(root->fs_info->super_copy));
824 828
825 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 829 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
826 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 830 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1792,12 +1796,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1792 } 1796 }
1793 ret = 0; 1797 ret = 0;
1794out: 1798out:
1795 if (nolock) { 1799 if (root != root->fs_info->tree_root)
1796 if (trans)
1797 btrfs_end_transaction_nolock(trans, root);
1798 } else {
1799 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1800 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1800 if (trans) 1801 if (trans) {
1802 if (nolock)
1803 btrfs_end_transaction_nolock(trans, root);
1804 else
1801 btrfs_end_transaction(trans, root); 1805 btrfs_end_transaction(trans, root);
1802 } 1806 }
1803 1807
@@ -1819,153 +1823,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1819} 1823}
1820 1824
1821/* 1825/*
1822 * When IO fails, either with EIO or csum verification fails, we
1823 * try other mirrors that might have a good copy of the data. This
1824 * io_failure_record is used to record state as we go through all the
1825 * mirrors. If another mirror has good data, the page is set up to date
1826 * and things continue. If a good mirror can't be found, the original
1827 * bio end_io callback is called to indicate things have failed.
1828 */
1829struct io_failure_record {
1830 struct page *page;
1831 u64 start;
1832 u64 len;
1833 u64 logical;
1834 unsigned long bio_flags;
1835 int last_mirror;
1836};
1837
1838static int btrfs_io_failed_hook(struct bio *failed_bio,
1839 struct page *page, u64 start, u64 end,
1840 struct extent_state *state)
1841{
1842 struct io_failure_record *failrec = NULL;
1843 u64 private;
1844 struct extent_map *em;
1845 struct inode *inode = page->mapping->host;
1846 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1847 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1848 struct bio *bio;
1849 int num_copies;
1850 int ret;
1851 int rw;
1852 u64 logical;
1853
1854 ret = get_state_private(failure_tree, start, &private);
1855 if (ret) {
1856 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1857 if (!failrec)
1858 return -ENOMEM;
1859 failrec->start = start;
1860 failrec->len = end - start + 1;
1861 failrec->last_mirror = 0;
1862 failrec->bio_flags = 0;
1863
1864 read_lock(&em_tree->lock);
1865 em = lookup_extent_mapping(em_tree, start, failrec->len);
1866 if (em->start > start || em->start + em->len < start) {
1867 free_extent_map(em);
1868 em = NULL;
1869 }
1870 read_unlock(&em_tree->lock);
1871
1872 if (IS_ERR_OR_NULL(em)) {
1873 kfree(failrec);
1874 return -EIO;
1875 }
1876 logical = start - em->start;
1877 logical = em->block_start + logical;
1878 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1879 logical = em->block_start;
1880 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1881 extent_set_compress_type(&failrec->bio_flags,
1882 em->compress_type);
1883 }
1884 failrec->logical = logical;
1885 free_extent_map(em);
1886 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1887 EXTENT_DIRTY, GFP_NOFS);
1888 set_state_private(failure_tree, start,
1889 (u64)(unsigned long)failrec);
1890 } else {
1891 failrec = (struct io_failure_record *)(unsigned long)private;
1892 }
1893 num_copies = btrfs_num_copies(
1894 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1895 failrec->logical, failrec->len);
1896 failrec->last_mirror++;
1897 if (!state) {
1898 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1899 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1900 failrec->start,
1901 EXTENT_LOCKED);
1902 if (state && state->start != failrec->start)
1903 state = NULL;
1904 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1905 }
1906 if (!state || failrec->last_mirror > num_copies) {
1907 set_state_private(failure_tree, failrec->start, 0);
1908 clear_extent_bits(failure_tree, failrec->start,
1909 failrec->start + failrec->len - 1,
1910 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1911 kfree(failrec);
1912 return -EIO;
1913 }
1914 bio = bio_alloc(GFP_NOFS, 1);
1915 bio->bi_private = state;
1916 bio->bi_end_io = failed_bio->bi_end_io;
1917 bio->bi_sector = failrec->logical >> 9;
1918 bio->bi_bdev = failed_bio->bi_bdev;
1919 bio->bi_size = 0;
1920
1921 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1922 if (failed_bio->bi_rw & REQ_WRITE)
1923 rw = WRITE;
1924 else
1925 rw = READ;
1926
1927 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1928 failrec->last_mirror,
1929 failrec->bio_flags, 0);
1930 return ret;
1931}
1932
1933/*
1934 * each time an IO finishes, we do a fast check in the IO failure tree
1935 * to see if we need to process or clean up an io_failure_record
1936 */
1937static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1938{
1939 u64 private;
1940 u64 private_failure;
1941 struct io_failure_record *failure;
1942 int ret;
1943
1944 private = 0;
1945 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1946 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1947 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1948 start, &private_failure);
1949 if (ret == 0) {
1950 failure = (struct io_failure_record *)(unsigned long)
1951 private_failure;
1952 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1953 failure->start, 0);
1954 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1955 failure->start,
1956 failure->start + failure->len - 1,
1957 EXTENT_DIRTY | EXTENT_LOCKED,
1958 GFP_NOFS);
1959 kfree(failure);
1960 }
1961 }
1962 return 0;
1963}
1964
1965/*
1966 * when reads are done, we need to check csums to verify the data is correct 1826 * when reads are done, we need to check csums to verify the data is correct
1967 * if there's a match, we allow the bio to finish. If not, we go through 1827 * if there's a match, we allow the bio to finish. If not, the code in
1968 * the io_failure_record routines to find good copies 1828 * extent_io.c will try to find good copies for us.
1969 */ 1829 */
1970static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 1830static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1971 struct extent_state *state) 1831 struct extent_state *state)
@@ -2011,10 +1871,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2011 1871
2012 kunmap_atomic(kaddr, KM_USER0); 1872 kunmap_atomic(kaddr, KM_USER0);
2013good: 1873good:
2014 /* if the io failure tree for this inode is non-empty,
2015 * check to see if we've recovered from a failed IO
2016 */
2017 btrfs_clean_io_failures(inode, start);
2018 return 0; 1874 return 0;
2019 1875
2020zeroit: 1876zeroit:
@@ -2079,89 +1935,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2079 up_read(&root->fs_info->cleanup_work_sem); 1935 up_read(&root->fs_info->cleanup_work_sem);
2080} 1936}
2081 1937
2082/*
2083 * calculate extra metadata reservation when snapshotting a subvolume
2084 * contains orphan files.
2085 */
2086void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2087 struct btrfs_pending_snapshot *pending,
2088 u64 *bytes_to_reserve)
2089{
2090 struct btrfs_root *root;
2091 struct btrfs_block_rsv *block_rsv;
2092 u64 num_bytes;
2093 int index;
2094
2095 root = pending->root;
2096 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2097 return;
2098
2099 block_rsv = root->orphan_block_rsv;
2100
2101 /* orphan block reservation for the snapshot */
2102 num_bytes = block_rsv->size;
2103
2104 /*
2105 * after the snapshot is created, COWing tree blocks may use more
2106 * space than it frees. So we should make sure there is enough
2107 * reserved space.
2108 */
2109 index = trans->transid & 0x1;
2110 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2111 num_bytes += block_rsv->size -
2112 (block_rsv->reserved + block_rsv->freed[index]);
2113 }
2114
2115 *bytes_to_reserve += num_bytes;
2116}
2117
2118void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2119 struct btrfs_pending_snapshot *pending)
2120{
2121 struct btrfs_root *root = pending->root;
2122 struct btrfs_root *snap = pending->snap;
2123 struct btrfs_block_rsv *block_rsv;
2124 u64 num_bytes;
2125 int index;
2126 int ret;
2127
2128 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2129 return;
2130
2131 /* refill source subvolume's orphan block reservation */
2132 block_rsv = root->orphan_block_rsv;
2133 index = trans->transid & 0x1;
2134 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2135 num_bytes = block_rsv->size -
2136 (block_rsv->reserved + block_rsv->freed[index]);
2137 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2138 root->orphan_block_rsv,
2139 num_bytes);
2140 BUG_ON(ret);
2141 }
2142
2143 /* setup orphan block reservation for the snapshot */
2144 block_rsv = btrfs_alloc_block_rsv(snap);
2145 BUG_ON(!block_rsv);
2146
2147 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2148 snap->orphan_block_rsv = block_rsv;
2149
2150 num_bytes = root->orphan_block_rsv->size;
2151 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2152 block_rsv, num_bytes);
2153 BUG_ON(ret);
2154
2155#if 0
2156 /* insert orphan item for the snapshot */
2157 WARN_ON(!root->orphan_item_inserted);
2158 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2159 snap->root_key.objectid);
2160 BUG_ON(ret);
2161 snap->orphan_item_inserted = 1;
2162#endif
2163}
2164
2165enum btrfs_orphan_cleanup_state { 1938enum btrfs_orphan_cleanup_state {
2166 ORPHAN_CLEANUP_STARTED = 1, 1939 ORPHAN_CLEANUP_STARTED = 1,
2167 ORPHAN_CLEANUP_DONE = 2, 1940 ORPHAN_CLEANUP_DONE = 2,
@@ -2247,9 +2020,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2247 } 2020 }
2248 spin_unlock(&root->orphan_lock); 2021 spin_unlock(&root->orphan_lock);
2249 2022
2250 if (block_rsv)
2251 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2252
2253 /* grab metadata reservation from transaction handle */ 2023 /* grab metadata reservation from transaction handle */
2254 if (reserve) { 2024 if (reserve) {
2255 ret = btrfs_orphan_reserve_metadata(trans, inode); 2025 ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2316,6 +2086,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2316 struct btrfs_key key, found_key; 2086 struct btrfs_key key, found_key;
2317 struct btrfs_trans_handle *trans; 2087 struct btrfs_trans_handle *trans;
2318 struct inode *inode; 2088 struct inode *inode;
2089 u64 last_objectid = 0;
2319 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2090 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2320 2091
2321 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2092 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2367,41 +2138,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2367 * crossing root thing. we store the inode number in the 2138 * crossing root thing. we store the inode number in the
2368 * offset of the orphan item. 2139 * offset of the orphan item.
2369 */ 2140 */
2141
2142 if (found_key.offset == last_objectid) {
2143 printk(KERN_ERR "btrfs: Error removing orphan entry, "
2144 "stopping orphan cleanup\n");
2145 ret = -EINVAL;
2146 goto out;
2147 }
2148
2149 last_objectid = found_key.offset;
2150
2370 found_key.objectid = found_key.offset; 2151 found_key.objectid = found_key.offset;
2371 found_key.type = BTRFS_INODE_ITEM_KEY; 2152 found_key.type = BTRFS_INODE_ITEM_KEY;
2372 found_key.offset = 0; 2153 found_key.offset = 0;
2373 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2154 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2374 if (IS_ERR(inode)) { 2155 ret = PTR_RET(inode);
2375 ret = PTR_ERR(inode); 2156 if (ret && ret != -ESTALE)
2376 goto out; 2157 goto out;
2377 }
2378 2158
2379 /* 2159 /*
2380 * add this inode to the orphan list so btrfs_orphan_del does 2160 * Inode is already gone but the orphan item is still there,
2381 * the proper thing when we hit it 2161 * kill the orphan item.
2382 */ 2162 */
2383 spin_lock(&root->orphan_lock); 2163 if (ret == -ESTALE) {
2384 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2164 trans = btrfs_start_transaction(root, 1);
2385 spin_unlock(&root->orphan_lock);
2386
2387 /*
2388 * if this is a bad inode, means we actually succeeded in
2389 * removing the inode, but not the orphan record, which means
2390 * we need to manually delete the orphan since iput will just
2391 * do a destroy_inode
2392 */
2393 if (is_bad_inode(inode)) {
2394 trans = btrfs_start_transaction(root, 0);
2395 if (IS_ERR(trans)) { 2165 if (IS_ERR(trans)) {
2396 ret = PTR_ERR(trans); 2166 ret = PTR_ERR(trans);
2397 goto out; 2167 goto out;
2398 } 2168 }
2399 btrfs_orphan_del(trans, inode); 2169 ret = btrfs_del_orphan_item(trans, root,
2170 found_key.objectid);
2171 BUG_ON(ret);
2400 btrfs_end_transaction(trans, root); 2172 btrfs_end_transaction(trans, root);
2401 iput(inode);
2402 continue; 2173 continue;
2403 } 2174 }
2404 2175
2176 /*
2177 * add this inode to the orphan list so btrfs_orphan_del does
2178 * the proper thing when we hit it
2179 */
2180 spin_lock(&root->orphan_lock);
2181 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2182 spin_unlock(&root->orphan_lock);
2183
2405 /* if we have links, this was a truncate, lets do that */ 2184 /* if we have links, this was a truncate, lets do that */
2406 if (inode->i_nlink) { 2185 if (inode->i_nlink) {
2407 if (!S_ISREG(inode->i_mode)) { 2186 if (!S_ISREG(inode->i_mode)) {
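The orphan-cleanup hunk above adds two safeguards: a last_objectid check so the loop fails with -EINVAL instead of spinning forever when an orphan item refuses to go away, and direct deletion of the orphan item when the inode itself is already gone (-ESTALE). The progress guard is the interesting part; here is a small self-contained sketch of the same idea, with made-up key values.

#include <stdio.h>

/*
 * Made-up orphan keys; 262 repeats, standing in for an orphan item that
 * could not actually be deleted.
 */
int main(void)
{
        unsigned long long keys[] = { 261, 262, 262, 263 };
        unsigned long long last = 0;
        size_t i;

        for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
                if (keys[i] == last) {
                        fprintf(stderr,
                                "error removing orphan entry %llu, stopping cleanup\n",
                                keys[i]);
                        return 1;
                }
                last = keys[i];
                /* ...look up the inode and remove keys[i] here... */
        }
        return 0;
}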
@@ -2835,7 +2614,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2835 u64 ino = btrfs_ino(inode); 2614 u64 ino = btrfs_ino(inode);
2836 u64 dir_ino = btrfs_ino(dir); 2615 u64 dir_ino = btrfs_ino(dir);
2837 2616
2838 trans = btrfs_start_transaction(root, 10); 2617 /*
2618 * 1 for the possible orphan item
2619 * 1 for the dir item
2620 * 1 for the dir index
2621 * 1 for the inode ref
2622 * 1 for the inode ref in the tree log
2623 * 2 for the dir entries in the log
2624 * 1 for the inode
2625 */
2626 trans = btrfs_start_transaction(root, 8);
2839 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2627 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2840 return trans; 2628 return trans;
2841 2629
@@ -2858,7 +2646,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2858 return ERR_PTR(-ENOMEM); 2646 return ERR_PTR(-ENOMEM);
2859 } 2647 }
2860 2648
2861 trans = btrfs_start_transaction(root, 0); 2649 /* 1 for the orphan item */
2650 trans = btrfs_start_transaction(root, 1);
2862 if (IS_ERR(trans)) { 2651 if (IS_ERR(trans)) {
2863 btrfs_free_path(path); 2652 btrfs_free_path(path);
2864 root->fs_info->enospc_unlink = 0; 2653 root->fs_info->enospc_unlink = 0;
@@ -2963,6 +2752,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2963 err = 0; 2752 err = 0;
2964out: 2753out:
2965 btrfs_free_path(path); 2754 btrfs_free_path(path);
2755 /* Migrate the orphan reservation over */
2756 if (!err)
2757 err = btrfs_block_rsv_migrate(trans->block_rsv,
2758 &root->fs_info->global_block_rsv,
2759 trans->bytes_reserved);
2760
2966 if (err) { 2761 if (err) {
2967 btrfs_end_transaction(trans, root); 2762 btrfs_end_transaction(trans, root);
2968 root->fs_info->enospc_unlink = 0; 2763 root->fs_info->enospc_unlink = 0;
@@ -2977,6 +2772,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2977 struct btrfs_root *root) 2772 struct btrfs_root *root)
2978{ 2773{
2979 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2774 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2775 btrfs_block_rsv_release(root, trans->block_rsv,
2776 trans->bytes_reserved);
2777 trans->block_rsv = &root->fs_info->trans_block_rsv;
2980 BUG_ON(!root->fs_info->enospc_unlink); 2778 BUG_ON(!root->fs_info->enospc_unlink);
2981 root->fs_info->enospc_unlink = 0; 2779 root->fs_info->enospc_unlink = 0;
2982 } 2780 }
@@ -3368,6 +3166,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3368 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3166 pgoff_t index = from >> PAGE_CACHE_SHIFT;
3369 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3167 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3370 struct page *page; 3168 struct page *page;
3169 gfp_t mask = btrfs_alloc_write_mask(mapping);
3371 int ret = 0; 3170 int ret = 0;
3372 u64 page_start; 3171 u64 page_start;
3373 u64 page_end; 3172 u64 page_end;
@@ -3380,7 +3179,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3380 3179
3381 ret = -ENOMEM; 3180 ret = -ENOMEM;
3382again: 3181again:
3383 page = find_or_create_page(mapping, index, GFP_NOFS); 3182 page = find_or_create_page(mapping, index, mask);
3384 if (!page) { 3183 if (!page) {
3385 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3184 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3386 goto out; 3185 goto out;
@@ -3613,6 +3412,8 @@ void btrfs_evict_inode(struct inode *inode)
3613{ 3412{
3614 struct btrfs_trans_handle *trans; 3413 struct btrfs_trans_handle *trans;
3615 struct btrfs_root *root = BTRFS_I(inode)->root; 3414 struct btrfs_root *root = BTRFS_I(inode)->root;
3415 struct btrfs_block_rsv *rsv, *global_rsv;
3416 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3616 unsigned long nr; 3417 unsigned long nr;
3617 int ret; 3418 int ret;
3618 3419
@@ -3640,22 +3441,55 @@ void btrfs_evict_inode(struct inode *inode)
3640 goto no_delete; 3441 goto no_delete;
3641 } 3442 }
3642 3443
3444 rsv = btrfs_alloc_block_rsv(root);
3445 if (!rsv) {
3446 btrfs_orphan_del(NULL, inode);
3447 goto no_delete;
3448 }
3449 rsv->size = min_size;
3450 global_rsv = &root->fs_info->global_block_rsv;
3451
3643 btrfs_i_size_write(inode, 0); 3452 btrfs_i_size_write(inode, 0);
3644 3453
3454 /*
3455 * This is a bit simpler than btrfs_truncate since
3456 *
3457 * 1) We've already reserved our space for our orphan item in the
3458 * unlink.
3459 * 2) We're going to delete the inode item, so we don't need to update
3460 * it at all.
3461 *
3462 * So we just need to reserve some slack space in case we add bytes when
3463 * doing the truncate.
3464 */
3645 while (1) { 3465 while (1) {
3646 trans = btrfs_join_transaction(root); 3466 ret = btrfs_block_rsv_refill(root, rsv, min_size);
3647 BUG_ON(IS_ERR(trans)); 3467
3648 trans->block_rsv = root->orphan_block_rsv; 3468 /*
3469 * Try and steal from the global reserve since we will
3470 * likely not use this space anyway, we want to try as
3471 * hard as possible to get this to work.
3472 */
3473 if (ret)
3474 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3649 3475
3650 ret = btrfs_block_rsv_check(trans, root,
3651 root->orphan_block_rsv, 0, 5);
3652 if (ret) { 3476 if (ret) {
3653 BUG_ON(ret != -EAGAIN); 3477 printk(KERN_WARNING "Could not get space for a "
3654 ret = btrfs_commit_transaction(trans, root); 3478 "delete, will truncate on mount %d\n", ret);
3655 BUG_ON(ret); 3479 btrfs_orphan_del(NULL, inode);
3656 continue; 3480 btrfs_free_block_rsv(root, rsv);
3481 goto no_delete;
3657 } 3482 }
3658 3483
3484 trans = btrfs_start_transaction(root, 0);
3485 if (IS_ERR(trans)) {
3486 btrfs_orphan_del(NULL, inode);
3487 btrfs_free_block_rsv(root, rsv);
3488 goto no_delete;
3489 }
3490
3491 trans->block_rsv = rsv;
3492
3659 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3493 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3660 if (ret != -EAGAIN) 3494 if (ret != -EAGAIN)
3661 break; 3495 break;
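btrfs_evict_inode() now reserves its truncate slack on every loop iteration: refill the local reserve to min_size, fall back to migrating the bytes out of the global reserve, and only if both fail give up and leave the orphan to be truncated on the next mount. A rough userspace sketch of that fallback order follows; the pool type and helpers are illustrative, not btrfs block reserves.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for a block reserve; not a btrfs structure. */
struct pool {
        long avail;
};

/* All-or-nothing grab, roughly what a refill/migrate does. */
static bool take(struct pool *src, long bytes)
{
        if (src->avail < bytes)
                return false;
        src->avail -= bytes;
        return true;
}

static int reserve_truncate_slack(struct pool *fs_free, struct pool *global,
                                  long min_size)
{
        if (take(fs_free, min_size))
                return 0;
        /* Fall back to stealing from the global reserve. */
        if (take(global, min_size))
                return 0;
        fprintf(stderr, "could not get space for a delete, will truncate on mount\n");
        return -1;
}

int main(void)
{
        struct pool fs_free = { .avail = 0 };           /* pretend the fs is full */
        struct pool global  = { .avail = 1L << 20 };

        return reserve_truncate_slack(&fs_free, &global, 64 * 1024) ? 1 : 0;
}

The same refill-or-bail loop shows up again in the btrfs_truncate() hunks further down in this patch, minus the steal from the global reserve.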
@@ -3664,14 +3498,17 @@ void btrfs_evict_inode(struct inode *inode)
3664 btrfs_end_transaction(trans, root); 3498 btrfs_end_transaction(trans, root);
3665 trans = NULL; 3499 trans = NULL;
3666 btrfs_btree_balance_dirty(root, nr); 3500 btrfs_btree_balance_dirty(root, nr);
3667
3668 } 3501 }
3669 3502
3503 btrfs_free_block_rsv(root, rsv);
3504
3670 if (ret == 0) { 3505 if (ret == 0) {
3506 trans->block_rsv = root->orphan_block_rsv;
3671 ret = btrfs_orphan_del(trans, inode); 3507 ret = btrfs_orphan_del(trans, inode);
3672 BUG_ON(ret); 3508 BUG_ON(ret);
3673 } 3509 }
3674 3510
3511 trans->block_rsv = &root->fs_info->trans_block_rsv;
3675 if (!(root == root->fs_info->tree_root || 3512 if (!(root == root->fs_info->tree_root ||
3676 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3513 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3677 btrfs_return_ino(root, btrfs_ino(inode)); 3514 btrfs_return_ino(root, btrfs_ino(inode));
@@ -5795,8 +5632,7 @@ again:
5795 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5632 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5796 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5633 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5797 if (!ret) 5634 if (!ret)
5798 ret = btrfs_update_inode(trans, root, inode); 5635 err = btrfs_update_inode(trans, root, inode);
5799 err = ret;
5800 goto out; 5636 goto out;
5801 } 5637 }
5802 5638
@@ -6289,7 +6125,7 @@ int btrfs_readpage(struct file *file, struct page *page)
6289{ 6125{
6290 struct extent_io_tree *tree; 6126 struct extent_io_tree *tree;
6291 tree = &BTRFS_I(page->mapping->host)->io_tree; 6127 tree = &BTRFS_I(page->mapping->host)->io_tree;
6292 return extent_read_full_page(tree, page, btrfs_get_extent); 6128 return extent_read_full_page(tree, page, btrfs_get_extent, 0);
6293} 6129}
6294 6130
6295static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6131static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -6541,6 +6377,7 @@ static int btrfs_truncate(struct inode *inode)
6541 struct btrfs_trans_handle *trans; 6377 struct btrfs_trans_handle *trans;
6542 unsigned long nr; 6378 unsigned long nr;
6543 u64 mask = root->sectorsize - 1; 6379 u64 mask = root->sectorsize - 1;
6380 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6544 6381
6545 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6382 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6546 if (ret) 6383 if (ret)
@@ -6588,19 +6425,23 @@ static int btrfs_truncate(struct inode *inode)
6588 rsv = btrfs_alloc_block_rsv(root); 6425 rsv = btrfs_alloc_block_rsv(root);
6589 if (!rsv) 6426 if (!rsv)
6590 return -ENOMEM; 6427 return -ENOMEM;
6591 btrfs_add_durable_block_rsv(root->fs_info, rsv); 6428 rsv->size = min_size;
6592 6429
6430 /*
6431 * 1 for the truncate slack space
6432 * 1 for the orphan item we're going to add
6433 * 1 for the orphan item deletion
6434 * 1 for updating the inode.
6435 */
6593 trans = btrfs_start_transaction(root, 4); 6436 trans = btrfs_start_transaction(root, 4);
6594 if (IS_ERR(trans)) { 6437 if (IS_ERR(trans)) {
6595 err = PTR_ERR(trans); 6438 err = PTR_ERR(trans);
6596 goto out; 6439 goto out;
6597 } 6440 }
6598 6441
6599 /* 6442 /* Migrate the slack space for the truncate to our reserve */
6600 * Reserve space for the truncate process. Truncate should be adding 6443 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6601 * space, but if there are snapshots it may end up using space. 6444 min_size);
6602 */
6603 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6604 BUG_ON(ret); 6445 BUG_ON(ret);
6605 6446
6606 ret = btrfs_orphan_add(trans, inode); 6447 ret = btrfs_orphan_add(trans, inode);
@@ -6609,21 +6450,6 @@ static int btrfs_truncate(struct inode *inode)
6609 goto out; 6450 goto out;
6610 } 6451 }
6611 6452
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /*
6617 * Ok so we've already migrated our bytes over for the truncate, so here
6618 * just reserve the one slot we need for updating the inode.
6619 */
6620 trans = btrfs_start_transaction(root, 1);
6621 if (IS_ERR(trans)) {
6622 err = PTR_ERR(trans);
6623 goto out;
6624 }
6625 trans->block_rsv = rsv;
6626
6627 /* 6453 /*
6628 * setattr is responsible for setting the ordered_data_close flag, 6454 * setattr is responsible for setting the ordered_data_close flag,
6629 * but that is only tested during the last file release. That 6455 * but that is only tested during the last file release. That
@@ -6645,20 +6471,30 @@ static int btrfs_truncate(struct inode *inode)
6645 btrfs_add_ordered_operation(trans, root, inode); 6471 btrfs_add_ordered_operation(trans, root, inode);
6646 6472
6647 while (1) { 6473 while (1) {
6474 ret = btrfs_block_rsv_refill(root, rsv, min_size);
6475 if (ret) {
6476 /*
6477 * This can only happen with the original transaction we
6478 * started above, every other time we shouldn't have a
6479 * transaction started yet.
6480 */
6481 if (ret == -EAGAIN)
6482 goto end_trans;
6483 err = ret;
6484 break;
6485 }
6486
6648 if (!trans) { 6487 if (!trans) {
6649 trans = btrfs_start_transaction(root, 3); 6488 /* Just need the 1 for updating the inode */
6489 trans = btrfs_start_transaction(root, 1);
6650 if (IS_ERR(trans)) { 6490 if (IS_ERR(trans)) {
6651 err = PTR_ERR(trans); 6491 err = PTR_ERR(trans);
6652 goto out; 6492 goto out;
6653 } 6493 }
6654
6655 ret = btrfs_truncate_reserve_metadata(trans, root,
6656 rsv);
6657 BUG_ON(ret);
6658
6659 trans->block_rsv = rsv;
6660 } 6494 }
6661 6495
6496 trans->block_rsv = rsv;
6497
6662 ret = btrfs_truncate_inode_items(trans, root, inode, 6498 ret = btrfs_truncate_inode_items(trans, root, inode,
6663 inode->i_size, 6499 inode->i_size,
6664 BTRFS_EXTENT_DATA_KEY); 6500 BTRFS_EXTENT_DATA_KEY);
@@ -6673,7 +6509,7 @@ static int btrfs_truncate(struct inode *inode)
6673 err = ret; 6509 err = ret;
6674 break; 6510 break;
6675 } 6511 }
6676 6512end_trans:
6677 nr = trans->blocks_used; 6513 nr = trans->blocks_used;
6678 btrfs_end_transaction(trans, root); 6514 btrfs_end_transaction(trans, root);
6679 trans = NULL; 6515 trans = NULL;
@@ -6755,9 +6591,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6755 ei->last_sub_trans = 0; 6591 ei->last_sub_trans = 0;
6756 ei->logged_trans = 0; 6592 ei->logged_trans = 0;
6757 ei->delalloc_bytes = 0; 6593 ei->delalloc_bytes = 0;
6758 ei->reserved_bytes = 0;
6759 ei->disk_i_size = 0; 6594 ei->disk_i_size = 0;
6760 ei->flags = 0; 6595 ei->flags = 0;
6596 ei->csum_bytes = 0;
6761 ei->index_cnt = (u64)-1; 6597 ei->index_cnt = (u64)-1;
6762 ei->last_unlink_trans = 0; 6598 ei->last_unlink_trans = 0;
6763 6599
@@ -6803,6 +6639,8 @@ void btrfs_destroy_inode(struct inode *inode)
6803 WARN_ON(inode->i_data.nrpages); 6639 WARN_ON(inode->i_data.nrpages);
6804 WARN_ON(BTRFS_I(inode)->outstanding_extents); 6640 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6805 WARN_ON(BTRFS_I(inode)->reserved_extents); 6641 WARN_ON(BTRFS_I(inode)->reserved_extents);
6642 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
6643 WARN_ON(BTRFS_I(inode)->csum_bytes);
6806 6644
6807 /* 6645 /*
6808 * This can happen where we create an inode, but somebody else also 6646 * This can happen where we create an inode, but somebody else also
@@ -7420,7 +7258,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
7420 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 7258 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
7421 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 7259 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
7422 .writepage_start_hook = btrfs_writepage_start_hook, 7260 .writepage_start_hook = btrfs_writepage_start_hook,
7423 .readpage_io_failed_hook = btrfs_io_failed_hook,
7424 .set_bit_hook = btrfs_set_bit_hook, 7261 .set_bit_hook = btrfs_set_bit_hook,
7425 .clear_bit_hook = btrfs_clear_bit_hook, 7262 .clear_bit_hook = btrfs_clear_bit_hook,
7426 .merge_extent_hook = btrfs_merge_extent_hook, 7263 .merge_extent_hook = btrfs_merge_extent_hook,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dae5dfe41ba5..4a34c472f126 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -51,6 +51,7 @@
51#include "volumes.h" 51#include "volumes.h"
52#include "locking.h" 52#include "locking.h"
53#include "inode-map.h" 53#include "inode-map.h"
54#include "backref.h"
54 55
55/* Mask out flags that are inappropriate for the given type of inode. */ 56/* Mask out flags that are inappropriate for the given type of inode. */
56static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 57static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
117/* 118/*
118 * Inherit flags from the parent inode. 119 * Inherit flags from the parent inode.
119 * 120 *
120 * Unlike extN we don't have any flags we don't want to inherit currently. 121 * Currently only the compression flags and the cow flags are inherited.
121 */ 122 */
122void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 123void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
123{ 124{
@@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
128 129
129 flags = BTRFS_I(dir)->flags; 130 flags = BTRFS_I(dir)->flags;
130 131
131 if (S_ISREG(inode->i_mode)) 132 if (flags & BTRFS_INODE_NOCOMPRESS) {
132 flags &= ~BTRFS_INODE_DIRSYNC; 133 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
133 else if (!S_ISDIR(inode->i_mode)) 134 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
134 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); 135 } else if (flags & BTRFS_INODE_COMPRESS) {
136 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
137 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
138 }
139
140 if (flags & BTRFS_INODE_NODATACOW)
141 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
135 142
136 BTRFS_I(inode)->flags = flags;
137 btrfs_update_iflags(inode); 143 btrfs_update_iflags(inode);
138} 144}
139 145
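btrfs_inherit_iflags() no longer copies the parent directory's whole flag word; only the compression/no-compression pair and NODATACOW are propagated to the new inode. A compact illustration of that selective inheritance, using made-up flag values rather than the BTRFS_INODE_* definitions:

#include <stdio.h>

/* Illustrative bit values; the real BTRFS_INODE_* flags live in ctree.h. */
#define F_NOCOMPRESS    0x1
#define F_COMPRESS      0x2
#define F_NODATACOW     0x4
#define F_OTHER         0x8     /* deliberately not inherited */

static unsigned int inherit_flags(unsigned int dir_flags, unsigned int inode_flags)
{
        if (dir_flags & F_NOCOMPRESS) {
                inode_flags &= ~F_COMPRESS;
                inode_flags |= F_NOCOMPRESS;
        } else if (dir_flags & F_COMPRESS) {
                inode_flags &= ~F_NOCOMPRESS;
                inode_flags |= F_COMPRESS;
        }
        if (dir_flags & F_NODATACOW)
                inode_flags |= F_NODATACOW;

        return inode_flags;     /* F_OTHER never crosses over */
}

int main(void)
{
        unsigned int dir = F_COMPRESS | F_NODATACOW | F_OTHER;

        printf("child flags: %#x\n", inherit_flags(dir, 0));    /* 0x6, not 0xe */
        return 0;
}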
@@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
277 struct fstrim_range range; 283 struct fstrim_range range;
278 u64 minlen = ULLONG_MAX; 284 u64 minlen = ULLONG_MAX;
279 u64 num_devices = 0; 285 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
280 int ret; 287 int ret;
281 288
282 if (!capable(CAP_SYS_ADMIN)) 289 if (!capable(CAP_SYS_ADMIN))
@@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
295 } 302 }
296 } 303 }
297 rcu_read_unlock(); 304 rcu_read_unlock();
305
298 if (!num_devices) 306 if (!num_devices)
299 return -EOPNOTSUPP; 307 return -EOPNOTSUPP;
300
301 if (copy_from_user(&range, arg, sizeof(range))) 308 if (copy_from_user(&range, arg, sizeof(range)))
302 return -EFAULT; 309 return -EFAULT;
310 if (range.start > total_bytes)
311 return -EINVAL;
303 312
313 range.len = min(range.len, total_bytes - range.start);
304 range.minlen = max(range.minlen, minlen); 314 range.minlen = max(range.minlen, minlen);
305 ret = btrfs_trim_fs(root, &range); 315 ret = btrfs_trim_fs(root, &range);
306 if (ret < 0) 316 if (ret < 0)
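The FITRIM hunk above validates the user-supplied range against the filesystem size: a start beyond total_bytes is rejected with -EINVAL, the length is clamped so start + len stays inside the filesystem, and minlen is still raised to the largest discard granularity any device reports. A standalone sketch of just that clamping, with hypothetical sizes:

#include <stdint.h>
#include <stdio.h>

struct trim_range {
        uint64_t start;
        uint64_t len;
        uint64_t minlen;
};

/* Returns 0 on success, -1 for an out-of-range start (-EINVAL in the hunk). */
static int clamp_trim_range(struct trim_range *r, uint64_t total_bytes,
                            uint64_t dev_minlen)
{
        if (r->start > total_bytes)
                return -1;
        if (r->len > total_bytes - r->start)
                r->len = total_bytes - r->start;
        if (r->minlen < dev_minlen)
                r->minlen = dev_minlen;
        return 0;
}

int main(void)
{
        /* Hypothetical 4 GiB filesystem, trim "everything" from the 1 GiB mark. */
        struct trim_range r = { .start = 1ULL << 30, .len = UINT64_MAX, .minlen = 0 };

        if (clamp_trim_range(&r, 4ULL << 30, 4096) == 0)
                printf("trim %llu bytes from %llu, minlen %llu\n",
                       (unsigned long long)r.len,
                       (unsigned long long)r.start,
                       (unsigned long long)r.minlen);
        return 0;
}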
@@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
760 int ret = 1; 770 int ret = 1;
761 771
762 /* 772 /*
763 * make sure that once we start defragging and extent, we keep on 773 * make sure that once we start defragging an extent, we keep on
764 * defragging it 774 * defragging it
765 */ 775 */
766 if (start < *defrag_end) 776 if (start < *defrag_end)
@@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
805 * extent will force at least part of that big extent to be defragged. 815 * extent will force at least part of that big extent to be defragged.
806 */ 816 */
807 if (ret) { 817 if (ret) {
808 *last_len += len;
809 *defrag_end = extent_map_end(em); 818 *defrag_end = extent_map_end(em);
810 } else { 819 } else {
811 *last_len = 0; 820 *last_len = 0;
@@ -843,6 +852,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
843 int i_done; 852 int i_done;
844 struct btrfs_ordered_extent *ordered; 853 struct btrfs_ordered_extent *ordered;
845 struct extent_state *cached_state = NULL; 854 struct extent_state *cached_state = NULL;
855 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
846 856
847 if (isize == 0) 857 if (isize == 0)
848 return 0; 858 return 0;
@@ -860,7 +870,7 @@ again:
860 for (i = 0; i < num_pages; i++) { 870 for (i = 0; i < num_pages; i++) {
861 struct page *page; 871 struct page *page;
862 page = find_or_create_page(inode->i_mapping, 872 page = find_or_create_page(inode->i_mapping,
863 start_index + i, GFP_NOFS); 873 start_index + i, mask);
864 if (!page) 874 if (!page)
865 break; 875 break;
866 876
@@ -972,18 +982,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
972 struct btrfs_super_block *disk_super; 982 struct btrfs_super_block *disk_super;
973 struct file_ra_state *ra = NULL; 983 struct file_ra_state *ra = NULL;
974 unsigned long last_index; 984 unsigned long last_index;
985 u64 isize = i_size_read(inode);
975 u64 features; 986 u64 features;
976 u64 last_len = 0; 987 u64 last_len = 0;
977 u64 skip = 0; 988 u64 skip = 0;
978 u64 defrag_end = 0; 989 u64 defrag_end = 0;
979 u64 newer_off = range->start; 990 u64 newer_off = range->start;
980 int newer_left = 0;
981 unsigned long i; 991 unsigned long i;
992 unsigned long ra_index = 0;
982 int ret; 993 int ret;
983 int defrag_count = 0; 994 int defrag_count = 0;
984 int compress_type = BTRFS_COMPRESS_ZLIB; 995 int compress_type = BTRFS_COMPRESS_ZLIB;
985 int extent_thresh = range->extent_thresh; 996 int extent_thresh = range->extent_thresh;
986 int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 997 int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
998 int cluster = max_cluster;
987 u64 new_align = ~((u64)128 * 1024 - 1); 999 u64 new_align = ~((u64)128 * 1024 - 1);
988 struct page **pages = NULL; 1000 struct page **pages = NULL;
989 1001
@@ -997,7 +1009,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
997 compress_type = range->compress_type; 1009 compress_type = range->compress_type;
998 } 1010 }
999 1011
1000 if (inode->i_size == 0) 1012 if (isize == 0)
1001 return 0; 1013 return 0;
1002 1014
1003 /* 1015 /*
@@ -1013,7 +1025,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1013 ra = &file->f_ra; 1025 ra = &file->f_ra;
1014 } 1026 }
1015 1027
1016 pages = kmalloc(sizeof(struct page *) * newer_cluster, 1028 pages = kmalloc(sizeof(struct page *) * max_cluster,
1017 GFP_NOFS); 1029 GFP_NOFS);
1018 if (!pages) { 1030 if (!pages) {
1019 ret = -ENOMEM; 1031 ret = -ENOMEM;
@@ -1022,10 +1034,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1022 1034
1023 /* find the last page to defrag */ 1035 /* find the last page to defrag */
1024 if (range->start + range->len > range->start) { 1036 if (range->start + range->len > range->start) {
1025 last_index = min_t(u64, inode->i_size - 1, 1037 last_index = min_t(u64, isize - 1,
1026 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; 1038 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
1027 } else { 1039 } else {
1028 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1040 last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1029 } 1041 }
1030 1042
1031 if (newer_than) { 1043 if (newer_than) {
@@ -1038,14 +1050,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1038 * the extents in the file evenly spaced 1050 * the extents in the file evenly spaced
1039 */ 1051 */
1040 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1052 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1041 newer_left = newer_cluster;
1042 } else 1053 } else
1043 goto out_ra; 1054 goto out_ra;
1044 } else { 1055 } else {
1045 i = range->start >> PAGE_CACHE_SHIFT; 1056 i = range->start >> PAGE_CACHE_SHIFT;
1046 } 1057 }
1047 if (!max_to_defrag) 1058 if (!max_to_defrag)
1048 max_to_defrag = last_index - 1; 1059 max_to_defrag = last_index;
1049 1060
1050 /* 1061 /*
1051 * make writeback starts from i, so the defrag range can be 1062 * make writeback starts from i, so the defrag range can be
@@ -1079,18 +1090,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1079 i = max(i + 1, next); 1090 i = max(i + 1, next);
1080 continue; 1091 continue;
1081 } 1092 }
1093
1094 if (!newer_than) {
1095 cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
1096 PAGE_CACHE_SHIFT) - i;
1097 cluster = min(cluster, max_cluster);
1098 } else {
1099 cluster = max_cluster;
1100 }
1101
1082 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1102 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
1083 BTRFS_I(inode)->force_compress = compress_type; 1103 BTRFS_I(inode)->force_compress = compress_type;
1084 1104
1085 btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); 1105 if (i + cluster > ra_index) {
1106 ra_index = max(i, ra_index);
1107 btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
1108 cluster);
1109 ra_index += max_cluster;
1110 }
1086 1111
1087 ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); 1112 ret = cluster_pages_for_defrag(inode, pages, i, cluster);
1088 if (ret < 0) 1113 if (ret < 0)
1089 goto out_ra; 1114 goto out_ra;
1090 1115
1091 defrag_count += ret; 1116 defrag_count += ret;
1092 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1117 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
1093 i += ret;
1094 1118
1095 if (newer_than) { 1119 if (newer_than) {
1096 if (newer_off == (u64)-1) 1120 if (newer_off == (u64)-1)
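The defrag loop now sizes each cluster from how far the current defrag range actually extends -- (PAGE_CACHE_ALIGN(defrag_end) >> PAGE_CACHE_SHIFT) - i, capped at max_cluster -- and issues readahead in max_cluster strides instead of re-reading the same window. A small sketch of the cluster computation alone; the page-size constants and names are assumed for illustration.

#include <stdio.h>

#define PAGE_SHIFT      12                              /* assumed 4 KiB pages */
#define PAGE_SIZE       (1ULL << PAGE_SHIFT)
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/*
 * Defrag no more pages than it takes to reach defrag_end, capped at
 * max_cluster (256 KiB worth of pages in the hunk above).
 */
static unsigned long defrag_cluster(unsigned long long defrag_end,
                                    unsigned long i, unsigned long max_cluster)
{
        unsigned long cluster = (PAGE_ALIGN(defrag_end) >> PAGE_SHIFT) - i;

        return cluster < max_cluster ? cluster : max_cluster;
}

int main(void)
{
        unsigned long max_cluster = (256 * 1024) >> PAGE_SHIFT; /* 64 pages */

        /* An extent ending 40 KiB past page index 100 only needs 10 pages. */
        printf("cluster = %lu\n",
               defrag_cluster((100ULL << PAGE_SHIFT) + 40 * 1024, 100, max_cluster));
        return 0;
}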
@@ -1105,12 +1129,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1105 if (!ret) { 1129 if (!ret) {
1106 range->start = newer_off; 1130 range->start = newer_off;
1107 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1131 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1108 newer_left = newer_cluster;
1109 } else { 1132 } else {
1110 break; 1133 break;
1111 } 1134 }
1112 } else { 1135 } else {
1113 i++; 1136 if (ret > 0) {
1137 i += ret;
1138 last_len += ret << PAGE_CACHE_SHIFT;
1139 } else {
1140 i++;
1141 last_len = 0;
1142 }
1114 } 1143 }
1115 } 1144 }
1116 1145
@@ -1136,16 +1165,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1136 mutex_unlock(&inode->i_mutex); 1165 mutex_unlock(&inode->i_mutex);
1137 } 1166 }
1138 1167
1139 disk_super = &root->fs_info->super_copy; 1168 disk_super = root->fs_info->super_copy;
1140 features = btrfs_super_incompat_flags(disk_super); 1169 features = btrfs_super_incompat_flags(disk_super);
1141 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1170 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1142 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1171 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1143 btrfs_set_super_incompat_flags(disk_super, features); 1172 btrfs_set_super_incompat_flags(disk_super, features);
1144 } 1173 }
1145 1174
1146 if (!file) 1175 ret = defrag_count;
1147 kfree(ra);
1148 return defrag_count;
1149 1176
1150out_ra: 1177out_ra:
1151 if (!file) 1178 if (!file)
@@ -2587,7 +2614,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2587 return PTR_ERR(trans); 2614 return PTR_ERR(trans);
2588 } 2615 }
2589 2616
2590 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2617 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
2591 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 2618 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
2592 dir_id, "default", 7, 1); 2619 dir_id, "default", 7, 1);
2593 if (IS_ERR_OR_NULL(di)) { 2620 if (IS_ERR_OR_NULL(di)) {
@@ -2603,7 +2630,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2603 btrfs_mark_buffer_dirty(path->nodes[0]); 2630 btrfs_mark_buffer_dirty(path->nodes[0]);
2604 btrfs_free_path(path); 2631 btrfs_free_path(path);
2605 2632
2606 disk_super = &root->fs_info->super_copy; 2633 disk_super = root->fs_info->super_copy;
2607 features = btrfs_super_incompat_flags(disk_super); 2634 features = btrfs_super_incompat_flags(disk_super);
2608 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { 2635 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2609 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; 2636 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@ -2864,6 +2891,144 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
2864 return ret; 2891 return ret;
2865} 2892}
2866 2893
2894static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2895{
2896 int ret = 0;
2897 int i;
2898 u64 rel_ptr;
2899 int size;
2900 struct btrfs_ioctl_ino_path_args *ipa = NULL;
2901 struct inode_fs_paths *ipath = NULL;
2902 struct btrfs_path *path;
2903
2904 if (!capable(CAP_SYS_ADMIN))
2905 return -EPERM;
2906
2907 path = btrfs_alloc_path();
2908 if (!path) {
2909 ret = -ENOMEM;
2910 goto out;
2911 }
2912
2913 ipa = memdup_user(arg, sizeof(*ipa));
2914 if (IS_ERR(ipa)) {
2915 ret = PTR_ERR(ipa);
2916 ipa = NULL;
2917 goto out;
2918 }
2919
2920 size = min_t(u32, ipa->size, 4096);
2921 ipath = init_ipath(size, root, path);
2922 if (IS_ERR(ipath)) {
2923 ret = PTR_ERR(ipath);
2924 ipath = NULL;
2925 goto out;
2926 }
2927
2928 ret = paths_from_inode(ipa->inum, ipath);
2929 if (ret < 0)
2930 goto out;
2931
2932 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2933 rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val;
2934 ipath->fspath->val[i] = rel_ptr;
2935 }
2936
2937 ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size);
2938 if (ret) {
2939 ret = -EFAULT;
2940 goto out;
2941 }
2942
2943out:
2944 btrfs_free_path(path);
2945 free_ipath(ipath);
2946 kfree(ipa);
2947
2948 return ret;
2949}
2950
2951static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
2952{
2953 struct btrfs_data_container *inodes = ctx;
2954 const size_t c = 3 * sizeof(u64);
2955
2956 if (inodes->bytes_left >= c) {
2957 inodes->bytes_left -= c;
2958 inodes->val[inodes->elem_cnt] = inum;
2959 inodes->val[inodes->elem_cnt + 1] = offset;
2960 inodes->val[inodes->elem_cnt + 2] = root;
2961 inodes->elem_cnt += 3;
2962 } else {
2963 inodes->bytes_missing += c - inodes->bytes_left;
2964 inodes->bytes_left = 0;
2965 inodes->elem_missed += 3;
2966 }
2967
2968 return 0;
2969}
2970
2971static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2972 void __user *arg)
2973{
2974 int ret = 0;
2975 int size;
2976 u64 extent_offset;
2977 struct btrfs_ioctl_logical_ino_args *loi;
2978 struct btrfs_data_container *inodes = NULL;
2979 struct btrfs_path *path = NULL;
2980 struct btrfs_key key;
2981
2982 if (!capable(CAP_SYS_ADMIN))
2983 return -EPERM;
2984
2985 loi = memdup_user(arg, sizeof(*loi));
2986 if (IS_ERR(loi)) {
2987 ret = PTR_ERR(loi);
2988 loi = NULL;
2989 goto out;
2990 }
2991
2992 path = btrfs_alloc_path();
2993 if (!path) {
2994 ret = -ENOMEM;
2995 goto out;
2996 }
2997
2998 size = min_t(u32, loi->size, 4096);
2999 inodes = init_data_container(size);
3000 if (IS_ERR(inodes)) {
3001 ret = PTR_ERR(inodes);
3002 inodes = NULL;
3003 goto out;
3004 }
3005
3006 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3007
3008 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3009 ret = -ENOENT;
3010 if (ret < 0)
3011 goto out;
3012
3013 extent_offset = loi->logical - key.objectid;
3014 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3015 extent_offset, build_ino_list, inodes);
3016
3017 if (ret < 0)
3018 goto out;
3019
3020 ret = copy_to_user((void *)loi->inodes, (void *)inodes, size);
3021 if (ret)
3022 ret = -EFAULT;
3023
3024out:
3025 btrfs_free_path(path);
3026 kfree(inodes);
3027 kfree(loi);
3028
3029 return ret;
3030}
3031
2867long btrfs_ioctl(struct file *file, unsigned int 3032long btrfs_ioctl(struct file *file, unsigned int
2868 cmd, unsigned long arg) 3033 cmd, unsigned long arg)
2869{ 3034{
@@ -2921,6 +3086,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2921 return btrfs_ioctl_tree_search(file, argp); 3086 return btrfs_ioctl_tree_search(file, argp);
2922 case BTRFS_IOC_INO_LOOKUP: 3087 case BTRFS_IOC_INO_LOOKUP:
2923 return btrfs_ioctl_ino_lookup(file, argp); 3088 return btrfs_ioctl_ino_lookup(file, argp);
3089 case BTRFS_IOC_INO_PATHS:
3090 return btrfs_ioctl_ino_to_path(root, argp);
3091 case BTRFS_IOC_LOGICAL_INO:
3092 return btrfs_ioctl_logical_to_ino(root, argp);
2924 case BTRFS_IOC_SPACE_INFO: 3093 case BTRFS_IOC_SPACE_INFO:
2925 return btrfs_ioctl_space_info(root, argp); 3094 return btrfs_ioctl_space_info(root, argp);
2926 case BTRFS_IOC_SYNC: 3095 case BTRFS_IOC_SYNC:
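/*
 * Editor's illustration (not part of the commit): a minimal user-space
 * sketch of how the result of the new BTRFS_IOC_LOGICAL_INO call, filled in
 * by build_ino_list() above, could be consumed. It assumes the structures
 * and ioctl numbers from fs/btrfs/ioctl.h (see the next hunk) are available
 * to user space; the helper name, buffer size and error handling are
 * illustrative only, and fd is any open descriptor on the filesystem.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int print_inodes_for_logical(int fd, uint64_t logical)
{
	struct btrfs_ioctl_logical_ino_args loi;
	struct btrfs_data_container *inodes;
	uint32_t i;

	inodes = calloc(1, 4096);
	if (!inodes)
		return -1;

	memset(&loi, 0, sizeof(loi));
	loi.logical = logical;
	loi.size = 4096;
	loi.inodes = (uintptr_t)inodes;	/* kernel copies the container here */

	if (ioctl(fd, BTRFS_IOC_LOGICAL_INO, &loi) < 0) {
		free(inodes);
		return -1;
	}

	/* build_ino_list() packs (inode, offset, root) triples into val[] */
	for (i = 0; i < inodes->elem_cnt; i += 3)
		printf("inode %llu offset %llu root %llu\n",
		       (unsigned long long)inodes->val[i],
		       (unsigned long long)inodes->val[i + 1],
		       (unsigned long long)inodes->val[i + 2]);

	free(inodes);
	return 0;
}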
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ad1ea789fcb4..252ae9915de8 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args {
193 struct btrfs_ioctl_space_info spaces[0]; 193 struct btrfs_ioctl_space_info spaces[0];
194}; 194};
195 195
196struct btrfs_data_container {
197 __u32 bytes_left; /* out -- bytes not needed to deliver output */
198 __u32 bytes_missing; /* out -- additional bytes needed for result */
199 __u32 elem_cnt; /* out */
200 __u32 elem_missed; /* out */
201 __u64 val[0]; /* out */
202};
203
204struct btrfs_ioctl_ino_path_args {
205 __u64 inum; /* in */
206 __u32 size; /* in */
207 __u64 reserved[4];
208 /* struct btrfs_data_container *fspath; out */
209 __u64 fspath; /* out */
210};
211
212struct btrfs_ioctl_logical_ino_args {
213 __u64 logical; /* in */
214 __u32 size; /* in */
215 __u64 reserved[4];
216 /* struct btrfs_data_container *inodes; out */
217 __u64 inodes;
218};
219
196#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 220#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
197 struct btrfs_ioctl_vol_args) 221 struct btrfs_ioctl_vol_args)
198#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 222#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args {
248 struct btrfs_ioctl_dev_info_args) 272 struct btrfs_ioctl_dev_info_args)
249#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
250 struct btrfs_ioctl_fs_info_args) 274 struct btrfs_ioctl_fs_info_args)
275#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
276 struct btrfs_ioctl_ino_path_args)
277#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
278 struct btrfs_ioctl_ino_path_args)
279
251#endif 280#endif
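/*
 * Editor's illustration (not part of the commit): a hedged user-space sketch
 * of BTRFS_IOC_INO_PATHS as defined above. btrfs_ioctl_ino_to_path() rewrites
 * each val[i] of the returned btrfs_data_container into an offset, relative
 * to the start of val[], of a NUL-terminated path string. The helper name,
 * buffer size and error handling are illustrative only; fd is any open
 * descriptor on the filesystem in question.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int print_paths_for_inode(int fd, uint64_t inum)
{
	struct btrfs_ioctl_ino_path_args ipa;
	struct btrfs_data_container *fspath;
	uint32_t i;

	fspath = calloc(1, 4096);
	if (!fspath)
		return -1;

	memset(&ipa, 0, sizeof(ipa));
	ipa.inum = inum;
	ipa.size = 4096;
	ipa.fspath = (uintptr_t)fspath;	/* kernel copies the container here */

	if (ioctl(fd, BTRFS_IOC_INO_PATHS, &ipa) < 0) {
		free(fspath);
		return -1;
	}

	/* each val[i] is an offset from the start of val[] to a path string */
	for (i = 0; i < fspath->elem_cnt; i++)
		printf("%s\n", (char *)fspath->val + fspath->val[i]);

	free(fspath);
	return 0;
}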
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e9..f38e452486b8 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
159{ 159{
160 int i; 160 int i;
161 u32 type; 161 u32 type, nr;
162 u32 nr = btrfs_header_nritems(l);
163 struct btrfs_item *item; 162 struct btrfs_item *item;
164 struct btrfs_root_item *ri; 163 struct btrfs_root_item *ri;
165 struct btrfs_dir_item *di; 164 struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
172 struct btrfs_key key; 171 struct btrfs_key key;
173 struct btrfs_key found_key; 172 struct btrfs_key found_key;
174 173
174 if (!l)
175 return;
176
177 nr = btrfs_header_nritems(l);
178
175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 179 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
176 (unsigned long long)btrfs_header_bytenr(l), nr, 180 (unsigned long long)btrfs_header_bytenr(l), nr,
177 btrfs_leaf_free_space(root, l)); 181 btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 000000000000..2373b39a132b
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,951 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include "ctree.h"
27#include "volumes.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31#undef DEBUG
32
33/*
34 * This is the implementation for the generic read ahead framework.
35 *
36 * To trigger a readahead, btrfs_reada_add must be called. It will start
37 * a read ahead for the given range [start, end) on tree root. The returned
38 * handle can either be used to wait on the readahead to finish
39 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
40 *
41 * The read ahead works as follows:
42 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
43 * reada_start_machine will then search for extents to prefetch and trigger
44 * some reads. When a read finishes for a node, all contained node/leaf
45 * pointers that lie in the given range will also be enqueued. The reads will
46 * be triggered in sequential order, thus giving a big win over a naive
47 * enumeration. It will also make use of multi-device layouts. Each disk
48 * will have its own read pointer and all disks will be utilized in parallel.
49 * Also, no two disks will read both sides of a mirror simultaneously, as this
50 * would waste seeking capacity. Instead, both disks will read different parts
51 * of the filesystem.
52 * Any number of readaheads can be started in parallel. The read order will be
53 * determined globally, i.e. 2 parallel readaheads will normally finish faster
54 * than the 2 started one after another.
55 */
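/*
 * Editor's illustration (not part of the commit): a minimal caller of the
 * interface described above, using btrfs_reada_add(), btrfs_reada_wait() and
 * btrfs_reada_detach() as defined at the bottom of this file. The helper
 * name is made up and error handling is kept to the bare minimum.
 */
static void reada_example_whole_tree(struct btrfs_root *root)
{
	struct btrfs_key key_start = { .objectid = 0, .type = 0, .offset = 0 };
	struct btrfs_key key_end = {
		.objectid = (u64)-1, .type = (u8)-1, .offset = (u64)-1
	};
	struct reada_control *rc;

	rc = btrfs_reada_add(root, &key_start, &key_end);
	if (IS_ERR(rc))
		return;

	/* either block until the readahead of the whole range has finished... */
	btrfs_reada_wait(rc);

	/*
	 * ...or, instead of waiting, let it run in the background with
	 * btrfs_reada_detach(rc). Call exactly one of the two, as both drop
	 * the reference returned by btrfs_reada_add().
	 */
}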
56
57#define MAX_MIRRORS 2
58#define MAX_IN_FLIGHT 6
59
60struct reada_extctl {
61 struct list_head list;
62 struct reada_control *rc;
63 u64 generation;
64};
65
66struct reada_extent {
67 u64 logical;
68 struct btrfs_key top;
69 u32 blocksize;
70 int err;
71 struct list_head extctl;
72 struct kref refcnt;
73 spinlock_t lock;
74 struct reada_zone *zones[MAX_MIRRORS];
75 int nzones;
76 struct btrfs_device *scheduled_for;
77};
78
79struct reada_zone {
80 u64 start;
81 u64 end;
82 u64 elems;
83 struct list_head list;
84 spinlock_t lock;
85 int locked;
86 struct btrfs_device *device;
87 struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
88 int ndevs;
89 struct kref refcnt;
90};
91
92struct reada_machine_work {
93 struct btrfs_work work;
94 struct btrfs_fs_info *fs_info;
95};
96
97static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
98static void reada_control_release(struct kref *kref);
99static void reada_zone_release(struct kref *kref);
100static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102
103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation);
105
106/* recurses */
107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
109 u64 start, int err)
110{
111 int level = 0;
112 int nritems;
113 int i;
114 u64 bytenr;
115 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121
122 if (eb)
123 level = btrfs_header_level(eb);
124
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 kref_get(&re->refcnt);
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock);
136 /*
137 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore
139 */
140 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock);
144
145 if (err == 0) {
146 nritems = level ? btrfs_header_nritems(eb) : 0;
147 generation = btrfs_header_generation(eb);
148 /*
149 * FIXME: currently we just set nritems to 0 if this is a leaf,
150 * effectively ignoring the content. In a next step we could
151 * trigger more readahead depending on the content, e.g.
152 * fetch the checksums for the extents in the leaf.
153 */
154 } else {
155 /*
156 * this is the error case, the extent buffer has not been
157 * read correctly. We won't access anything from it and
158 * just cleanup our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164
165 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec;
167 u64 n_gen;
168 struct btrfs_key key;
169 struct btrfs_key next_key;
170
171 btrfs_node_key_to_cpu(eb, &key, i);
172 if (i + 1 < nritems)
173 btrfs_node_key_to_cpu(eb, &next_key, i + 1);
174 else
175 next_key = re->top;
176 bytenr = btrfs_node_blockptr(eb, i);
177 n_gen = btrfs_node_ptr_generation(eb, i);
178
179 list_for_each_entry(rec, &list, list) {
180 struct reada_control *rc = rec->rc;
181
182 /*
183 * if the generation doesn't match, just ignore this
184 * extctl. This will probably cut off a branch from
185 * prefetch. Alternatively one could start a new (sub-)
186 * prefetch for this branch, starting again from root.
187 * FIXME: move the generation check out of this loop
188 */
189#ifdef DEBUG
190 if (rec->generation != generation) {
191 printk(KERN_DEBUG "generation mismatch for "
192 "(%llu,%d,%llu) %llu != %llu\n",
193 key.objectid, key.type, key.offset,
194 rec->generation, generation);
195 }
196#endif
197 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key,
201 level - 1, n_gen);
202 }
203 }
204 /*
205 * free extctl records
206 */
207 while (!list_empty(&list)) {
208 struct reada_control *rc;
209 struct reada_extctl *rec;
210
211 rec = list_first_entry(&list, struct reada_extctl, list);
212 list_del(&rec->list);
213 rc = rec->rc;
214 kfree(rec);
215
216 kref_get(&rc->refcnt);
217 if (atomic_dec_and_test(&rc->elems)) {
218 kref_put(&rc->refcnt, reada_control_release);
219 wake_up(&rc->wait);
220 }
221 kref_put(&rc->refcnt, reada_control_release);
222
223 reada_extent_put(fs_info, re); /* one ref for each entry */
224 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228
229 return 0;
230}
231
232/*
233 * start is passed separately in case eb is NULL, which may be the case with
234 * failed I/O
235 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
237 u64 start, int err)
238{
239 int ret;
240
241 ret = __readahead_hook(root, eb, start, err);
242
243 reada_start_machine(root->fs_info);
244
245 return ret;
246}
247
248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
249 struct btrfs_device *dev, u64 logical,
250 struct btrfs_bio *bbio)
251{
252 int ret;
253 int looped = 0;
254 struct reada_zone *zone;
255 struct btrfs_block_group_cache *cache = NULL;
256 u64 start;
257 u64 end;
258 int i;
259
260again:
261 zone = NULL;
262 spin_lock(&fs_info->reada_lock);
263 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
264 logical >> PAGE_CACHE_SHIFT, 1);
265 if (ret == 1)
266 kref_get(&zone->refcnt);
267 spin_unlock(&fs_info->reada_lock);
268
269 if (ret == 1) {
270 if (logical >= zone->start && logical < zone->end)
271 return zone;
272 spin_lock(&fs_info->reada_lock);
273 kref_put(&zone->refcnt, reada_zone_release);
274 spin_unlock(&fs_info->reada_lock);
275 }
276
277 if (looped)
278 return NULL;
279
280 cache = btrfs_lookup_block_group(fs_info, logical);
281 if (!cache)
282 return NULL;
283
284 start = cache->key.objectid;
285 end = start + cache->key.offset - 1;
286 btrfs_put_block_group(cache);
287
288 zone = kzalloc(sizeof(*zone), GFP_NOFS);
289 if (!zone)
290 return NULL;
291
292 zone->start = start;
293 zone->end = end;
294 INIT_LIST_HEAD(&zone->list);
295 spin_lock_init(&zone->lock);
296 zone->locked = 0;
297 kref_init(&zone->refcnt);
298 zone->elems = 0;
299 zone->device = dev; /* our device always sits at index 0 */
300 for (i = 0; i < bbio->num_stripes; ++i) {
301 /* bounds have already been checked */
302 zone->devs[i] = bbio->stripes[i].dev;
303 }
304 zone->ndevs = bbio->num_stripes;
305
306 spin_lock(&fs_info->reada_lock);
307 ret = radix_tree_insert(&dev->reada_zones,
308 (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
309 zone);
310 spin_unlock(&fs_info->reada_lock);
311
312 if (ret) {
313 kfree(zone);
314 looped = 1;
315 goto again;
316 }
317
318 return zone;
319}
320
321static struct reada_extent *reada_find_extent(struct btrfs_root *root,
322 u64 logical,
323 struct btrfs_key *top, int level)
324{
325 int ret;
326 int looped = 0;
327 struct reada_extent *re = NULL;
328 struct btrfs_fs_info *fs_info = root->fs_info;
329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
330 struct btrfs_bio *bbio = NULL;
331 struct btrfs_device *dev;
332 u32 blocksize;
333 u64 length;
334 int nzones = 0;
335 int i;
336 unsigned long index = logical >> PAGE_CACHE_SHIFT;
337
338again:
339 spin_lock(&fs_info->reada_lock);
340 re = radix_tree_lookup(&fs_info->reada_tree, index);
341 if (re)
342 kref_get(&re->refcnt);
343 spin_unlock(&fs_info->reada_lock);
344
345 if (re || looped)
346 return re;
347
348 re = kzalloc(sizeof(*re), GFP_NOFS);
349 if (!re)
350 return NULL;
351
352 blocksize = btrfs_level_size(root, level);
353 re->logical = logical;
354 re->blocksize = blocksize;
355 re->top = *top;
356 INIT_LIST_HEAD(&re->extctl);
357 spin_lock_init(&re->lock);
358 kref_init(&re->refcnt);
359
360 /*
361 * map block
362 */
363 length = blocksize;
364 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
365 if (ret || !bbio || length < blocksize)
366 goto error;
367
368 if (bbio->num_stripes > MAX_MIRRORS) {
369 printk(KERN_ERR "btrfs readahead: more than %d copies not "
370 "supported", MAX_MIRRORS);
371 goto error;
372 }
373
374 for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
375 struct reada_zone *zone;
376
377 dev = bbio->stripes[nzones].dev;
378 zone = reada_find_zone(fs_info, dev, logical, bbio);
379 if (!zone)
380 break;
381
382 re->zones[nzones] = zone;
383 spin_lock(&zone->lock);
384 if (!zone->elems)
385 kref_get(&zone->refcnt);
386 ++zone->elems;
387 spin_unlock(&zone->lock);
388 spin_lock(&fs_info->reada_lock);
389 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock);
391 }
392 re->nzones = nzones;
393 if (nzones == 0) {
394 /* not a single zone found, error and out */
395 goto error;
396 }
397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 spin_lock(&fs_info->reada_lock);
400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
401 if (ret) {
402 spin_unlock(&fs_info->reada_lock);
403 if (ret != -ENOMEM) {
404 /* someone inserted the extent in the meantime */
405 looped = 1;
406 }
407 goto error;
408 }
409 for (i = 0; i < nzones; ++i) {
410 dev = bbio->stripes[i].dev;
411 ret = radix_tree_insert(&dev->reada_extents, index, re);
412 if (ret) {
413 while (--i >= 0) {
414 dev = bbio->stripes[i].dev;
415 BUG_ON(dev == NULL);
416 radix_tree_delete(&dev->reada_extents, index);
417 }
418 BUG_ON(fs_info == NULL);
419 radix_tree_delete(&fs_info->reada_tree, index);
420 spin_unlock(&fs_info->reada_lock);
421 goto error;
422 }
423 }
424 spin_unlock(&fs_info->reada_lock);
425
426 kfree(bbio);
427 return re;
428
429error:
430 while (nzones) {
431 struct reada_zone *zone;
432
433 --nzones;
434 zone = re->zones[nzones];
435 kref_get(&zone->refcnt);
436 spin_lock(&zone->lock);
437 --zone->elems;
438 if (zone->elems == 0) {
439 /*
440 * no fs_info->reada_lock needed, as this can't be
441 * the last ref
442 */
443 kref_put(&zone->refcnt, reada_zone_release);
444 }
445 spin_unlock(&zone->lock);
446
447 spin_lock(&fs_info->reada_lock);
448 kref_put(&zone->refcnt, reada_zone_release);
449 spin_unlock(&fs_info->reada_lock);
450 }
451 kfree(bbio);
452 kfree(re);
453 if (looped)
454 goto again;
455 return NULL;
456}
457
458static void reada_kref_dummy(struct kref *kr)
459{
460}
461
462static void reada_extent_put(struct btrfs_fs_info *fs_info,
463 struct reada_extent *re)
464{
465 int i;
466 unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
467
468 spin_lock(&fs_info->reada_lock);
469 if (!kref_put(&re->refcnt, reada_kref_dummy)) {
470 spin_unlock(&fs_info->reada_lock);
471 return;
472 }
473
474 radix_tree_delete(&fs_info->reada_tree, index);
475 for (i = 0; i < re->nzones; ++i) {
476 struct reada_zone *zone = re->zones[i];
477
478 radix_tree_delete(&zone->device->reada_extents, index);
479 }
480
481 spin_unlock(&fs_info->reada_lock);
482
483 for (i = 0; i < re->nzones; ++i) {
484 struct reada_zone *zone = re->zones[i];
485
486 kref_get(&zone->refcnt);
487 spin_lock(&zone->lock);
488 --zone->elems;
489 if (zone->elems == 0) {
490 /* no fs_info->reada_lock needed, as this can't be
491 * the last ref */
492 kref_put(&zone->refcnt, reada_zone_release);
493 }
494 spin_unlock(&zone->lock);
495
496 spin_lock(&fs_info->reada_lock);
497 kref_put(&zone->refcnt, reada_zone_release);
498 spin_unlock(&fs_info->reada_lock);
499 }
500 if (re->scheduled_for)
501 atomic_dec(&re->scheduled_for->reada_in_flight);
502
503 kfree(re);
504}
505
506static void reada_zone_release(struct kref *kref)
507{
508 struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
509
510 radix_tree_delete(&zone->device->reada_zones,
511 zone->end >> PAGE_CACHE_SHIFT);
512
513 kfree(zone);
514}
515
516static void reada_control_release(struct kref *kref)
517{
518 struct reada_control *rc = container_of(kref, struct reada_control,
519 refcnt);
520
521 kfree(rc);
522}
523
524static int reada_add_block(struct reada_control *rc, u64 logical,
525 struct btrfs_key *top, int level, u64 generation)
526{
527 struct btrfs_root *root = rc->root;
528 struct reada_extent *re;
529 struct reada_extctl *rec;
530
531 re = reada_find_extent(root, logical, top, level); /* takes one ref */
532 if (!re)
533 return -1;
534
535 rec = kzalloc(sizeof(*rec), GFP_NOFS);
536 if (!rec) {
537 reada_extent_put(root->fs_info, re);
538 return -1;
539 }
540
541 rec->rc = rc;
542 rec->generation = generation;
543 atomic_inc(&rc->elems);
544
545 spin_lock(&re->lock);
546 list_add_tail(&rec->list, &re->extctl);
547 spin_unlock(&re->lock);
548
549 /* leave the ref on the extent */
550
551 return 0;
552}
553
554/*
555 * called with fs_info->reada_lock held
556 */
557static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
558{
559 int i;
560 unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
561
562 for (i = 0; i < zone->ndevs; ++i) {
563 struct reada_zone *peer;
564 peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
565 if (peer && peer->device != zone->device)
566 peer->locked = lock;
567 }
568}
569
570/*
571 * called with fs_info->reada_lock held
572 */
573static int reada_pick_zone(struct btrfs_device *dev)
574{
575 struct reada_zone *top_zone = NULL;
576 struct reada_zone *top_locked_zone = NULL;
577 u64 top_elems = 0;
578 u64 top_locked_elems = 0;
579 unsigned long index = 0;
580 int ret;
581
582 if (dev->reada_curr_zone) {
583 reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
584 kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
585 dev->reada_curr_zone = NULL;
586 }
587 /* pick the zone with the most elements */
588 while (1) {
589 struct reada_zone *zone;
590
591 ret = radix_tree_gang_lookup(&dev->reada_zones,
592 (void **)&zone, index, 1);
593 if (ret == 0)
594 break;
595 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
596 if (zone->locked) {
597 if (zone->elems > top_locked_elems) {
598 top_locked_elems = zone->elems;
599 top_locked_zone = zone;
600 }
601 } else {
602 if (zone->elems > top_elems) {
603 top_elems = zone->elems;
604 top_zone = zone;
605 }
606 }
607 }
608 if (top_zone)
609 dev->reada_curr_zone = top_zone;
610 else if (top_locked_zone)
611 dev->reada_curr_zone = top_locked_zone;
612 else
613 return 0;
614
615 dev->reada_next = dev->reada_curr_zone->start;
616 kref_get(&dev->reada_curr_zone->refcnt);
617 reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
618
619 return 1;
620}
621
622static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
623 struct btrfs_device *dev)
624{
625 struct reada_extent *re = NULL;
626 int mirror_num = 0;
627 struct extent_buffer *eb = NULL;
628 u64 logical;
629 u32 blocksize;
630 int ret;
631 int i;
632 int need_kick = 0;
633
634 spin_lock(&fs_info->reada_lock);
635 if (dev->reada_curr_zone == NULL) {
636 ret = reada_pick_zone(dev);
637 if (!ret) {
638 spin_unlock(&fs_info->reada_lock);
639 return 0;
640 }
641 }
642 /*
643 * FIXME currently we issue the reads one extent at a time. If we have
644 * a contiguous block of extents, we could also coalesce them or use
645 * plugging to speed things up
646 */
647 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
648 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
649 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
650 ret = reada_pick_zone(dev);
651 if (!ret) {
652 spin_unlock(&fs_info->reada_lock);
653 return 0;
654 }
655 re = NULL;
656 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
657 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
658 }
659 if (ret == 0) {
660 spin_unlock(&fs_info->reada_lock);
661 return 0;
662 }
663 dev->reada_next = re->logical + re->blocksize;
664 kref_get(&re->refcnt);
665
666 spin_unlock(&fs_info->reada_lock);
667
668 /*
669 * find mirror num
670 */
671 for (i = 0; i < re->nzones; ++i) {
672 if (re->zones[i]->device == dev) {
673 mirror_num = i + 1;
674 break;
675 }
676 }
677 logical = re->logical;
678 blocksize = re->blocksize;
679
680 spin_lock(&re->lock);
681 if (re->scheduled_for == NULL) {
682 re->scheduled_for = dev;
683 need_kick = 1;
684 }
685 spin_unlock(&re->lock);
686
687 reada_extent_put(fs_info, re);
688
689 if (!need_kick)
690 return 0;
691
692 atomic_inc(&dev->reada_in_flight);
693 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
694 mirror_num, &eb);
695 if (ret)
696 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
697 else if (eb)
698 __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
699
700 if (eb)
701 free_extent_buffer(eb);
702
703 return 1;
704
705}
706
707static void reada_start_machine_worker(struct btrfs_work *work)
708{
709 struct reada_machine_work *rmw;
710 struct btrfs_fs_info *fs_info;
711
712 rmw = container_of(work, struct reada_machine_work, work);
713 fs_info = rmw->fs_info;
714
715 kfree(rmw);
716
717 __reada_start_machine(fs_info);
718}
719
720static void __reada_start_machine(struct btrfs_fs_info *fs_info)
721{
722 struct btrfs_device *device;
723 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
724 u64 enqueued;
725 u64 total = 0;
726 int i;
727
728 do {
729 enqueued = 0;
730 list_for_each_entry(device, &fs_devices->devices, dev_list) {
731 if (atomic_read(&device->reada_in_flight) <
732 MAX_IN_FLIGHT)
733 enqueued += reada_start_machine_dev(fs_info,
734 device);
735 }
736 total += enqueued;
737 } while (enqueued && total < 10000);
738
739 if (enqueued == 0)
740 return;
741
742 /*
743 * If everything is already in the cache, this is effectively single
744 * threaded. To a) not hold the caller for too long and b) to utilize
745 * more cores, we broke the loop above after 10000 iterations and now
746 * enqueue to workers to finish it. This will distribute the load to
747 * the cores.
748 */
749 for (i = 0; i < 2; ++i)
750 reada_start_machine(fs_info);
751}
752
753static void reada_start_machine(struct btrfs_fs_info *fs_info)
754{
755 struct reada_machine_work *rmw;
756
757 rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
758 if (!rmw) {
759 /* FIXME we cannot handle this properly right now */
760 BUG();
761 }
762 rmw->work.func = reada_start_machine_worker;
763 rmw->fs_info = fs_info;
764
765 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
766}
767
768#ifdef DEBUG
769static void dump_devs(struct btrfs_fs_info *fs_info, int all)
770{
771 struct btrfs_device *device;
772 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
773 unsigned long index;
774 int ret;
775 int i;
776 int j;
777 int cnt;
778
779 spin_lock(&fs_info->reada_lock);
780 list_for_each_entry(device, &fs_devices->devices, dev_list) {
781 printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
782 atomic_read(&device->reada_in_flight));
783 index = 0;
784 while (1) {
785 struct reada_zone *zone;
786 ret = radix_tree_gang_lookup(&device->reada_zones,
787 (void **)&zone, index, 1);
788 if (ret == 0)
789 break;
790 printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
791 "%d devs", zone->start, zone->end, zone->elems,
792 zone->locked);
793 for (j = 0; j < zone->ndevs; ++j) {
794 printk(KERN_CONT " %lld",
795 zone->devs[j]->devid);
796 }
797 if (device->reada_curr_zone == zone)
798 printk(KERN_CONT " curr off %llu",
799 device->reada_next - zone->start);
800 printk(KERN_CONT "\n");
801 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
802 }
803 cnt = 0;
804 index = 0;
805 while (all) {
806 struct reada_extent *re = NULL;
807
808 ret = radix_tree_gang_lookup(&device->reada_extents,
809 (void **)&re, index, 1);
810 if (ret == 0)
811 break;
812 printk(KERN_DEBUG
813 " re: logical %llu size %u empty %d for %lld",
814 re->logical, re->blocksize,
815 list_empty(&re->extctl), re->scheduled_for ?
816 re->scheduled_for->devid : -1);
817
818 for (i = 0; i < re->nzones; ++i) {
819 printk(KERN_CONT " zone %llu-%llu devs",
820 re->zones[i]->start,
821 re->zones[i]->end);
822 for (j = 0; j < re->zones[i]->ndevs; ++j) {
823 printk(KERN_CONT " %lld",
824 re->zones[i]->devs[j]->devid);
825 }
826 }
827 printk(KERN_CONT "\n");
828 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
829 if (++cnt > 15)
830 break;
831 }
832 }
833
834 index = 0;
835 cnt = 0;
836 while (all) {
837 struct reada_extent *re = NULL;
838
839 ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
840 index, 1);
841 if (ret == 0)
842 break;
843 if (!re->scheduled_for) {
844 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
845 continue;
846 }
847 printk(KERN_DEBUG
848 "re: logical %llu size %u list empty %d for %lld",
849 re->logical, re->blocksize, list_empty(&re->extctl),
850 re->scheduled_for ? re->scheduled_for->devid : -1);
851 for (i = 0; i < re->nzones; ++i) {
852 printk(KERN_CONT " zone %llu-%llu devs",
853 re->zones[i]->start,
854 re->zones[i]->end);
859 for (j = 0; j < re->zones[i]->ndevs; ++j) {
860 printk(KERN_CONT " %lld",
861 re->zones[i]->devs[j]->devid);
862 }
863 }
865 printk(KERN_CONT "\n");
866 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
867 }
868 spin_unlock(&fs_info->reada_lock);
869}
870#endif
871
872/*
873 * interface
874 */
875struct reada_control *btrfs_reada_add(struct btrfs_root *root,
876 struct btrfs_key *key_start, struct btrfs_key *key_end)
877{
878 struct reada_control *rc;
879 u64 start;
880 u64 generation;
881 int level;
882 struct extent_buffer *node;
883 static struct btrfs_key max_key = {
884 .objectid = (u64)-1,
885 .type = (u8)-1,
886 .offset = (u64)-1
887 };
888
889 rc = kzalloc(sizeof(*rc), GFP_NOFS);
890 if (!rc)
891 return ERR_PTR(-ENOMEM);
892
893 rc->root = root;
894 rc->key_start = *key_start;
895 rc->key_end = *key_end;
896 atomic_set(&rc->elems, 0);
897 init_waitqueue_head(&rc->wait);
898 kref_init(&rc->refcnt);
899 kref_get(&rc->refcnt); /* one ref for having elements */
900
901 node = btrfs_root_node(root);
902 start = node->start;
903 level = btrfs_header_level(node);
904 generation = btrfs_header_generation(node);
905 free_extent_buffer(node);
906
907 reada_add_block(rc, start, &max_key, level, generation);
908
909 reada_start_machine(root->fs_info);
910
911 return rc;
912}
913
914#ifdef DEBUG
915int btrfs_reada_wait(void *handle)
916{
917 struct reada_control *rc = handle;
918
919 while (atomic_read(&rc->elems)) {
920 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
921 5 * HZ);
922 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
923 }
924
925 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
926
927 kref_put(&rc->refcnt, reada_control_release);
928
929 return 0;
930}
931#else
932int btrfs_reada_wait(void *handle)
933{
934 struct reada_control *rc = handle;
935
936 while (atomic_read(&rc->elems)) {
937 wait_event(rc->wait, atomic_read(&rc->elems) == 0);
938 }
939
940 kref_put(&rc->refcnt, reada_control_release);
941
942 return 0;
943}
944#endif
945
946void btrfs_reada_detach(void *handle)
947{
948 struct reada_control *rc = handle;
949
950 kref_put(&rc->refcnt, reada_control_release);
951}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273d..24d654ce7a06 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2041,8 +2041,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2041 BUG_ON(IS_ERR(trans)); 2041 BUG_ON(IS_ERR(trans));
2042 trans->block_rsv = rc->block_rsv; 2042 trans->block_rsv = rc->block_rsv;
2043 2043
2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2044 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
2045 min_reserved, 0);
2046 if (ret) { 2045 if (ret) {
2047 BUG_ON(ret != -EAGAIN); 2046 BUG_ON(ret != -EAGAIN);
2048 ret = btrfs_commit_transaction(trans, root); 2047 ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2151,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2152again: 2151again:
2153 if (!err) { 2152 if (!err) {
2154 num_bytes = rc->merging_rsv_size; 2153 num_bytes = rc->merging_rsv_size;
2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2154 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2156 num_bytes);
2157 if (ret) 2155 if (ret)
2158 err = ret; 2156 err = ret;
2159 } 2157 }
@@ -2427,7 +2425,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2425 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2428 2426
2429 trans->block_rsv = rc->block_rsv; 2427 trans->block_rsv = rc->block_rsv;
2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); 2428 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2431 if (ret) { 2429 if (ret) {
2432 if (ret == -EAGAIN) 2430 if (ret == -EAGAIN)
2433 rc->commit_transaction = 1; 2431 rc->commit_transaction = 1;
@@ -2922,6 +2920,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2922 unsigned long last_index; 2920 unsigned long last_index;
2923 struct page *page; 2921 struct page *page;
2924 struct file_ra_state *ra; 2922 struct file_ra_state *ra;
2923 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2925 int nr = 0; 2924 int nr = 0;
2926 int ret = 0; 2925 int ret = 0;
2927 2926
@@ -2956,7 +2955,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2956 ra, NULL, index, 2955 ra, NULL, index,
2957 last_index + 1 - index); 2956 last_index + 1 - index);
2958 page = find_or_create_page(inode->i_mapping, index, 2957 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS); 2958 mask);
2960 if (!page) { 2959 if (!page) {
2961 btrfs_delalloc_release_metadata(inode, 2960 btrfs_delalloc_release_metadata(inode,
2962 PAGE_CACHE_SIZE); 2961 PAGE_CACHE_SIZE);
@@ -3323,8 +3322,11 @@ static int find_data_references(struct reloc_control *rc,
3323 } 3322 }
3324 3323
3325 key.objectid = ref_objectid; 3324 key.objectid = ref_objectid;
3326 key.offset = ref_offset;
3327 key.type = BTRFS_EXTENT_DATA_KEY; 3325 key.type = BTRFS_EXTENT_DATA_KEY;
3326 if (ref_offset > ((u64)-1 << 32))
3327 key.offset = 0;
3328 else
3329 key.offset = ref_offset;
3328 3330
3329 path->search_commit_root = 1; 3331 path->search_commit_root = 1;
3330 path->skip_locking = 1; 3332 path->skip_locking = 1;
@@ -3645,14 +3647,11 @@ int prepare_to_relocate(struct reloc_control *rc)
3645 * btrfs_init_reloc_root will use them when there 3647 * btrfs_init_reloc_root will use them when there
3646 * is no reservation in transaction handle. 3648 * is no reservation in transaction handle.
3647 */ 3649 */
3648 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3650 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3649 rc->extent_root->nodesize * 256); 3651 rc->extent_root->nodesize * 256);
3650 if (ret) 3652 if (ret)
3651 return ret; 3653 return ret;
3652 3654
3653 rc->block_rsv->refill_used = 1;
3654 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3655
3656 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3655 memset(&rc->cluster, 0, sizeof(rc->cluster));
3657 rc->search_start = rc->block_group->key.objectid; 3656 rc->search_start = rc->block_group->key.objectid;
3658 rc->extents_found = 0; 3657 rc->extents_found = 0;
@@ -3777,8 +3776,7 @@ restart:
3777 } 3776 }
3778 } 3777 }
3779 3778
3780 ret = btrfs_block_rsv_check(trans, rc->extent_root, 3779 ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3781 rc->block_rsv, 0, 5);
3782 if (ret < 0) { 3780 if (ret < 0) {
3783 if (ret != -EAGAIN) { 3781 if (ret != -EAGAIN) {
3784 err = ret; 3782 err = ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5d..ed11d3866afd 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
17 */ 17 */
18 18
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "volumes.h" 22#include "volumes.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "ordered-data.h" 24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
24 28
25/* 29/*
26 * This is only the first step towards a full-features scrub. It reads all 30 * This is only the first step towards a full-features scrub. It reads all
@@ -29,15 +33,12 @@
29 * any can be found. 33 * any can be found.
30 * 34 *
31 * Future enhancements: 35 * Future enhancements:
32 * - To enhance the performance, better read-ahead strategies for the
33 * extent-tree can be employed.
34 * - In case an unrepairable extent is encountered, track which files are 36 * - In case an unrepairable extent is encountered, track which files are
35 * affected and report them 37 * affected and report them
36 * - In case of a read error on files with nodatasum, map the file and read 38 * - In case of a read error on files with nodatasum, map the file and read
37 * the extent to trigger a writeback of the good copy 39 * the extent to trigger a writeback of the good copy
38 * - track and record media errors, throw out bad devices 40 * - track and record media errors, throw out bad devices
39 * - add a mode to also read unallocated space 41 * - add a mode to also read unallocated space
40 * - make the prefetch cancellable
41 */ 42 */
42 43
43struct scrub_bio; 44struct scrub_bio;
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
63struct scrub_page { 64struct scrub_page {
64 u64 flags; /* extent flags */ 65 u64 flags; /* extent flags */
65 u64 generation; 66 u64 generation;
66 u64 mirror_num; 67 int mirror_num;
67 int have_csum; 68 int have_csum;
68 u8 csum[BTRFS_CSUM_SIZE]; 69 u8 csum[BTRFS_CSUM_SIZE];
69}; 70};
@@ -87,6 +88,7 @@ struct scrub_dev {
87 int first_free; 88 int first_free;
88 int curr; 89 int curr;
89 atomic_t in_flight; 90 atomic_t in_flight;
91 atomic_t fixup_cnt;
90 spinlock_t list_lock; 92 spinlock_t list_lock;
91 wait_queue_head_t list_wait; 93 wait_queue_head_t list_wait;
92 u16 csum_size; 94 u16 csum_size;
@@ -100,6 +102,27 @@ struct scrub_dev {
100 spinlock_t stat_lock; 102 spinlock_t stat_lock;
101}; 103};
102 104
105struct scrub_fixup_nodatasum {
106 struct scrub_dev *sdev;
107 u64 logical;
108 struct btrfs_root *root;
109 struct btrfs_work work;
110 int mirror_num;
111};
112
113struct scrub_warning {
114 struct btrfs_path *path;
115 u64 extent_item_size;
116 char *scratch_buf;
117 char *msg_buf;
118 const char *errstr;
119 sector_t sector;
120 u64 logical;
121 struct btrfs_device *dev;
122 int msg_bufsize;
123 int scratch_bufsize;
124};
125
103static void scrub_free_csums(struct scrub_dev *sdev) 126static void scrub_free_csums(struct scrub_dev *sdev)
104{ 127{
105 while (!list_empty(&sdev->csum_list)) { 128 while (!list_empty(&sdev->csum_list)) {
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
175 198
176 if (i != SCRUB_BIOS_PER_DEV-1) 199 if (i != SCRUB_BIOS_PER_DEV-1)
177 sdev->bios[i]->next_free = i + 1; 200 sdev->bios[i]->next_free = i + 1;
178 else 201 else
179 sdev->bios[i]->next_free = -1; 202 sdev->bios[i]->next_free = -1;
180 } 203 }
181 sdev->first_free = 0; 204 sdev->first_free = 0;
182 sdev->curr = -1; 205 sdev->curr = -1;
183 atomic_set(&sdev->in_flight, 0); 206 atomic_set(&sdev->in_flight, 0);
207 atomic_set(&sdev->fixup_cnt, 0);
184 atomic_set(&sdev->cancel_req, 0); 208 atomic_set(&sdev->cancel_req, 0);
185 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); 209 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
186 INIT_LIST_HEAD(&sdev->csum_list); 210 INIT_LIST_HEAD(&sdev->csum_list);
187 211
188 spin_lock_init(&sdev->list_lock); 212 spin_lock_init(&sdev->list_lock);
@@ -195,24 +219,361 @@ nomem:
195 return ERR_PTR(-ENOMEM); 219 return ERR_PTR(-ENOMEM);
196} 220}
197 221
222static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
223{
224 u64 isize;
225 u32 nlink;
226 int ret;
227 int i;
228 struct extent_buffer *eb;
229 struct btrfs_inode_item *inode_item;
230 struct scrub_warning *swarn = ctx;
231 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
232 struct inode_fs_paths *ipath = NULL;
233 struct btrfs_root *local_root;
234 struct btrfs_key root_key;
235
236 root_key.objectid = root;
237 root_key.type = BTRFS_ROOT_ITEM_KEY;
238 root_key.offset = (u64)-1;
239 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
240 if (IS_ERR(local_root)) {
241 ret = PTR_ERR(local_root);
242 goto err;
243 }
244
245 ret = inode_item_info(inum, 0, local_root, swarn->path);
246 if (ret) {
247 btrfs_release_path(swarn->path);
248 goto err;
249 }
250
251 eb = swarn->path->nodes[0];
252 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
253 struct btrfs_inode_item);
254 isize = btrfs_inode_size(eb, inode_item);
255 nlink = btrfs_inode_nlink(eb, inode_item);
256 btrfs_release_path(swarn->path);
257
258 ipath = init_ipath(4096, local_root, swarn->path);
259 ret = paths_from_inode(inum, ipath);
260
261 if (ret < 0)
262 goto err;
263
264 /*
265 * we deliberately ignore the fact that ipath might have been too small to
266 * hold all of the paths here
267 */
268 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
269 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
270 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
271 "length %llu, links %u (path: %s)\n", swarn->errstr,
272 swarn->logical, swarn->dev->name,
273 (unsigned long long)swarn->sector, root, inum, offset,
274 min(isize - offset, (u64)PAGE_SIZE), nlink,
275 (char *)ipath->fspath->val[i]);
276
277 free_ipath(ipath);
278 return 0;
279
280err:
281 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
282 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
283 "resolving failed with ret=%d\n", swarn->errstr,
284 swarn->logical, swarn->dev->name,
285 (unsigned long long)swarn->sector, root, inum, offset, ret);
286
287 free_ipath(ipath);
288 return 0;
289}
290
291static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
292 int ix)
293{
294 struct btrfs_device *dev = sbio->sdev->dev;
295 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
296 struct btrfs_path *path;
297 struct btrfs_key found_key;
298 struct extent_buffer *eb;
299 struct btrfs_extent_item *ei;
300 struct scrub_warning swarn;
301 u32 item_size;
302 int ret;
303 u64 ref_root;
304 u8 ref_level;
305 unsigned long ptr = 0;
306 const int bufsize = 4096;
307 u64 extent_offset;
308
309 path = btrfs_alloc_path();
310
311 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
312 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
313 swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
314 swarn.logical = sbio->logical + ix * PAGE_SIZE;
315 swarn.errstr = errstr;
316 swarn.dev = dev;
317 swarn.msg_bufsize = bufsize;
318 swarn.scratch_bufsize = bufsize;
319
320 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
321 goto out;
322
323 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
324 if (ret < 0)
325 goto out;
326
327 extent_offset = swarn.logical - found_key.objectid;
328 swarn.extent_item_size = found_key.offset;
329
330 eb = path->nodes[0];
331 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
332 item_size = btrfs_item_size_nr(eb, path->slots[0]);
333
334 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
335 do {
336 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
337 &ref_root, &ref_level);
338 printk(KERN_WARNING "%s at logical %llu on dev %s, "
339 "sector %llu: metadata %s (level %d) in tree "
340 "%llu\n", errstr, swarn.logical, dev->name,
341 (unsigned long long)swarn.sector,
342 ref_level ? "node" : "leaf",
343 ret < 0 ? -1 : ref_level,
344 ret < 0 ? -1 : ref_root);
345 } while (ret != 1);
346 } else {
347 swarn.path = path;
348 iterate_extent_inodes(fs_info, path, found_key.objectid,
349 extent_offset,
350 scrub_print_warning_inode, &swarn);
351 }
352
353out:
354 btrfs_free_path(path);
355 kfree(swarn.scratch_buf);
356 kfree(swarn.msg_buf);
357}
358
359static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
360{
361 struct page *page = NULL;
362 unsigned long index;
363 struct scrub_fixup_nodatasum *fixup = ctx;
364 int ret;
365 int corrected = 0;
366 struct btrfs_key key;
367 struct inode *inode = NULL;
368 u64 end = offset + PAGE_SIZE - 1;
369 struct btrfs_root *local_root;
370
371 key.objectid = root;
372 key.type = BTRFS_ROOT_ITEM_KEY;
373 key.offset = (u64)-1;
374 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
375 if (IS_ERR(local_root))
376 return PTR_ERR(local_root);
377
378 key.type = BTRFS_INODE_ITEM_KEY;
379 key.objectid = inum;
380 key.offset = 0;
381 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
382 if (IS_ERR(inode))
383 return PTR_ERR(inode);
384
385 index = offset >> PAGE_CACHE_SHIFT;
386
387 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
388 if (!page) {
389 ret = -ENOMEM;
390 goto out;
391 }
392
393 if (PageUptodate(page)) {
394 struct btrfs_mapping_tree *map_tree;
395 if (PageDirty(page)) {
396 /*
397 * we need to write the data to the defect sector. the
398 * data that was in that sector is not in memory,
399 * because the page was modified. we must not write the
400 * modified page to that sector.
401 *
402 * TODO: what could be done here: wait for the delalloc
403 * runner to write out that page (might involve
404 * COW) and see whether the sector is still
405 * referenced afterwards.
406 *
407 * For the time being, we'll treat this error as
408 * uncorrectable, although there is a chance that a
409 * later scrub will find the bad sector again and that
410 * there's no dirty page in memory then.
411 */
412 ret = -EIO;
413 goto out;
414 }
415 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
416 ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
417 fixup->logical, page,
418 fixup->mirror_num);
419 unlock_page(page);
420 corrected = !ret;
421 } else {
422 /*
423 * we need to get good data first. the general readpage path
424 * will call repair_io_failure for us, we just have to make
425 * sure we read the bad mirror.
426 */
427 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
428 EXTENT_DAMAGED, GFP_NOFS);
429 if (ret) {
430 /* set_extent_bits should give proper error */
431 WARN_ON(ret > 0);
432 if (ret > 0)
433 ret = -EFAULT;
434 goto out;
435 }
436
437 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
438 btrfs_get_extent,
439 fixup->mirror_num);
440 wait_on_page_locked(page);
441
442 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
443 end, EXTENT_DAMAGED, 0, NULL);
444 if (!corrected)
445 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
446 EXTENT_DAMAGED, GFP_NOFS);
447 }
448
449out:
450 if (page)
451 put_page(page);
452 if (inode)
453 iput(inode);
454
455 if (ret < 0)
456 return ret;
457
458 if (ret == 0 && corrected) {
459 /*
460 * we only need to call readpage for one of the inodes belonging
461 * to this extent. so make iterate_extent_inodes stop
462 */
463 return 1;
464 }
465
466 return -EIO;
467}
468
469static void scrub_fixup_nodatasum(struct btrfs_work *work)
470{
471 int ret;
472 struct scrub_fixup_nodatasum *fixup;
473 struct scrub_dev *sdev;
474 struct btrfs_trans_handle *trans = NULL;
475 struct btrfs_fs_info *fs_info;
476 struct btrfs_path *path;
477 int uncorrectable = 0;
478
479 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
480 sdev = fixup->sdev;
481 fs_info = fixup->root->fs_info;
482
483 path = btrfs_alloc_path();
484 if (!path) {
485 spin_lock(&sdev->stat_lock);
486 ++sdev->stat.malloc_errors;
487 spin_unlock(&sdev->stat_lock);
488 uncorrectable = 1;
489 goto out;
490 }
491
492 trans = btrfs_join_transaction(fixup->root);
493 if (IS_ERR(trans)) {
494 uncorrectable = 1;
495 goto out;
496 }
497
498 /*
499 * the idea is to trigger a regular read through the standard path. we
500 * read a page from the (failed) logical address by specifying the
501 * corresponding copynum of the failed sector. thus, that readpage is
502 * expected to fail.
503 * that is the point where on-the-fly error correction will kick in
504 * (once it's finished) and rewrite the failed sector if a good copy
505 * can be found.
506 */
507 ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
508 path, scrub_fixup_readpage,
509 fixup);
510 if (ret < 0) {
511 uncorrectable = 1;
512 goto out;
513 }
514 WARN_ON(ret != 1);
515
516 spin_lock(&sdev->stat_lock);
517 ++sdev->stat.corrected_errors;
518 spin_unlock(&sdev->stat_lock);
519
520out:
521 if (trans && !IS_ERR(trans))
522 btrfs_end_transaction(trans, fixup->root);
523 if (uncorrectable) {
524 spin_lock(&sdev->stat_lock);
525 ++sdev->stat.uncorrectable_errors;
526 spin_unlock(&sdev->stat_lock);
527 printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
528 "(nodatasum) error at logical %llu\n",
529 fixup->logical);
530 }
531
532 btrfs_free_path(path);
533 kfree(fixup);
534
535 /* see the caller for why we're pretending to be paused in the scrub counters */
536 mutex_lock(&fs_info->scrub_lock);
537 atomic_dec(&fs_info->scrubs_running);
538 atomic_dec(&fs_info->scrubs_paused);
539 mutex_unlock(&fs_info->scrub_lock);
540 atomic_dec(&sdev->fixup_cnt);
541 wake_up(&fs_info->scrub_pause_wait);
542 wake_up(&sdev->list_wait);
543}
544
198/* 545/*
199 * scrub_recheck_error gets called when either verification of the page 546 * scrub_recheck_error gets called when either verification of the page
200 * failed or the bio failed to read, e.g. with EIO. In the latter case, 547 * failed or the bio failed to read, e.g. with EIO. In the latter case,
201 * recheck_error gets called for every page in the bio, even though only 548 * recheck_error gets called for every page in the bio, even though only
202 * one may be bad 549 * one may be bad
203 */ 550 */
204static void scrub_recheck_error(struct scrub_bio *sbio, int ix) 551static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
205{ 552{
553 struct scrub_dev *sdev = sbio->sdev;
554 u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
555 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
556 DEFAULT_RATELIMIT_BURST);
557
206 if (sbio->err) { 558 if (sbio->err) {
207 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, 559 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
208 (sbio->physical + ix * PAGE_SIZE) >> 9,
209 sbio->bio->bi_io_vec[ix].bv_page) == 0) { 560 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
210 if (scrub_fixup_check(sbio, ix) == 0) 561 if (scrub_fixup_check(sbio, ix) == 0)
211 return; 562 return 0;
212 } 563 }
564 if (__ratelimit(&_rs))
565 scrub_print_warning("i/o error", sbio, ix);
566 } else {
567 if (__ratelimit(&_rs))
568 scrub_print_warning("checksum error", sbio, ix);
213 } 569 }
214 570
571 spin_lock(&sdev->stat_lock);
572 ++sdev->stat.read_errors;
573 spin_unlock(&sdev->stat_lock);
574
215 scrub_fixup(sbio, ix); 575 scrub_fixup(sbio, ix);
576 return 1;
216} 577}
217 578
218static int scrub_fixup_check(struct scrub_bio *sbio, int ix) 579static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@ -250,7 +611,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
250 struct scrub_dev *sdev = sbio->sdev; 611 struct scrub_dev *sdev = sbio->sdev;
251 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 612 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
252 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 613 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
253 struct btrfs_multi_bio *multi = NULL; 614 struct btrfs_bio *bbio = NULL;
615 struct scrub_fixup_nodatasum *fixup;
254 u64 logical = sbio->logical + ix * PAGE_SIZE; 616 u64 logical = sbio->logical + ix * PAGE_SIZE;
255 u64 length; 617 u64 length;
256 int i; 618 int i;
@@ -259,38 +621,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
259 621
260 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && 622 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
261 (sbio->spag[ix].have_csum == 0)) { 623 (sbio->spag[ix].have_csum == 0)) {
624 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
625 if (!fixup)
626 goto uncorrectable;
627 fixup->sdev = sdev;
628 fixup->logical = logical;
629 fixup->root = fs_info->extent_root;
630 fixup->mirror_num = sbio->spag[ix].mirror_num;
262 /* 631 /*
263 * nodatasum, don't try to fix anything 632 * increment scrubs_running to prevent cancel requests from
264 * FIXME: we can do better, open the inode and trigger a 633 * completing as long as a fixup worker is running. we must also
265 * writeback 634 * increment scrubs_paused to prevent deadlocking on pause
635 * requests used for transaction commits (as the worker uses a
636 * transaction context). it is safe to regard the fixup worker
637 * as paused for all practical matters. effectively, we only
638 * avoid cancellation requests from completing.
266 */ 639 */
267 goto uncorrectable; 640 mutex_lock(&fs_info->scrub_lock);
641 atomic_inc(&fs_info->scrubs_running);
642 atomic_inc(&fs_info->scrubs_paused);
643 mutex_unlock(&fs_info->scrub_lock);
644 atomic_inc(&sdev->fixup_cnt);
645 fixup->work.func = scrub_fixup_nodatasum;
646 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
647 return;
268 } 648 }
269 649
270 length = PAGE_SIZE; 650 length = PAGE_SIZE;
271 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, 651 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
272 &multi, 0); 652 &bbio, 0);
273 if (ret || !multi || length < PAGE_SIZE) { 653 if (ret || !bbio || length < PAGE_SIZE) {
274 printk(KERN_ERR 654 printk(KERN_ERR
275 "scrub_fixup: btrfs_map_block failed us for %llu\n", 655 "scrub_fixup: btrfs_map_block failed us for %llu\n",
276 (unsigned long long)logical); 656 (unsigned long long)logical);
277 WARN_ON(1); 657 WARN_ON(1);
658 kfree(bbio);
278 return; 659 return;
279 } 660 }
280 661
281 if (multi->num_stripes == 1) 662 if (bbio->num_stripes == 1)
282 /* there aren't any replicas */ 663 /* there aren't any replicas */
283 goto uncorrectable; 664 goto uncorrectable;
284 665
285 /* 666 /*
286 * first find a good copy 667 * first find a good copy
287 */ 668 */
288 for (i = 0; i < multi->num_stripes; ++i) { 669 for (i = 0; i < bbio->num_stripes; ++i) {
289 if (i == sbio->spag[ix].mirror_num) 670 if (i + 1 == sbio->spag[ix].mirror_num)
290 continue; 671 continue;
291 672
292 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, 673 if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
293 multi->stripes[i].physical >> 9, 674 bbio->stripes[i].physical >> 9,
294 sbio->bio->bi_io_vec[ix].bv_page)) { 675 sbio->bio->bi_io_vec[ix].bv_page)) {
295 /* I/O-error, this is not a good copy */ 676 /* I/O-error, this is not a good copy */
296 continue; 677 continue;
@@ -299,7 +680,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
299 if (scrub_fixup_check(sbio, ix) == 0) 680 if (scrub_fixup_check(sbio, ix) == 0)
300 break; 681 break;
301 } 682 }
302 if (i == multi->num_stripes) 683 if (i == bbio->num_stripes)
303 goto uncorrectable; 684 goto uncorrectable;
304 685
305 if (!sdev->readonly) { 686 if (!sdev->readonly) {
@@ -314,25 +695,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
314 } 695 }
315 } 696 }
316 697
317 kfree(multi); 698 kfree(bbio);
318 spin_lock(&sdev->stat_lock); 699 spin_lock(&sdev->stat_lock);
319 ++sdev->stat.corrected_errors; 700 ++sdev->stat.corrected_errors;
320 spin_unlock(&sdev->stat_lock); 701 spin_unlock(&sdev->stat_lock);
321 702
322 if (printk_ratelimit()) 703 printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
323 printk(KERN_ERR "btrfs: fixed up at %llu\n", 704 (unsigned long long)logical);
324 (unsigned long long)logical);
325 return; 705 return;
326 706
327uncorrectable: 707uncorrectable:
328 kfree(multi); 708 kfree(bbio);
329 spin_lock(&sdev->stat_lock); 709 spin_lock(&sdev->stat_lock);
330 ++sdev->stat.uncorrectable_errors; 710 ++sdev->stat.uncorrectable_errors;
331 spin_unlock(&sdev->stat_lock); 711 spin_unlock(&sdev->stat_lock);
332 712
333 if (printk_ratelimit()) 713 printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
334 printk(KERN_ERR "btrfs: unable to fixup at %llu\n", 714 "logical %llu\n", (unsigned long long)logical);
335 (unsigned long long)logical);
336} 715}
337 716
338static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, 717static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
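The nodatasum branch above no longer gives up: scrub_fixup() packages the error into a scrub_fixup_nodatasum item, bumps scrubs_running, scrubs_paused and the per-device fixup_cnt, and hands the item to the scrub worker pool, so cancel and pause requests wait for the worker to finish. A minimal userspace model of that hand-off, using pthreads instead of btrfs_workers (all names in the sketch are illustrative, not kernel API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int fixup_cnt;            /* models sdev->fixup_cnt */

struct fixup_work {                     /* models scrub_fixup_nodatasum */
	unsigned long long logical;
};

static void *fixup_worker(void *arg)
{
	struct fixup_work *w = arg;

	/* the real worker opens the inode and rewrites the bad block */
	printf("fixing up nodatasum error at logical %llu\n", w->logical);
	atomic_fetch_sub(&fixup_cnt, 1);    /* lets the scrub finish or cancel */
	return NULL;
}

int main(void)
{
	struct fixup_work w = { .logical = 4096 };
	pthread_t t;

	atomic_fetch_add(&fixup_cnt, 1);    /* taken before queueing the work */
	pthread_create(&t, NULL, fixup_worker, &w);

	/* models wait_event(sdev->list_wait, fixup_cnt == 0) at scrub exit */
	while (atomic_load(&fixup_cnt))
		usleep(1000);
	pthread_join(t, NULL);
	return 0;
}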
@@ -382,8 +761,14 @@ static void scrub_checksum(struct btrfs_work *work)
382 int ret; 761 int ret;
383 762
384 if (sbio->err) { 763 if (sbio->err) {
764 ret = 0;
385 for (i = 0; i < sbio->count; ++i) 765 for (i = 0; i < sbio->count; ++i)
386 scrub_recheck_error(sbio, i); 766 ret |= scrub_recheck_error(sbio, i);
767 if (!ret) {
768 spin_lock(&sdev->stat_lock);
769 ++sdev->stat.unverified_errors;
770 spin_unlock(&sdev->stat_lock);
771 }
387 772
388 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); 773 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
389 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 774 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -396,10 +781,6 @@ static void scrub_checksum(struct btrfs_work *work)
396 bi->bv_offset = 0; 781 bi->bv_offset = 0;
397 bi->bv_len = PAGE_SIZE; 782 bi->bv_len = PAGE_SIZE;
398 } 783 }
399
400 spin_lock(&sdev->stat_lock);
401 ++sdev->stat.read_errors;
402 spin_unlock(&sdev->stat_lock);
403 goto out; 784 goto out;
404 } 785 }
405 for (i = 0; i < sbio->count; ++i) { 786 for (i = 0; i < sbio->count; ++i) {
@@ -420,8 +801,14 @@ static void scrub_checksum(struct btrfs_work *work)
420 WARN_ON(1); 801 WARN_ON(1);
421 } 802 }
422 kunmap_atomic(buffer, KM_USER0); 803 kunmap_atomic(buffer, KM_USER0);
423 if (ret) 804 if (ret) {
424 scrub_recheck_error(sbio, i); 805 ret = scrub_recheck_error(sbio, i);
806 if (!ret) {
807 spin_lock(&sdev->stat_lock);
808 ++sdev->stat.unverified_errors;
809 spin_unlock(&sdev->stat_lock);
810 }
811 }
425 } 812 }
426 813
427out: 814out:
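With the change above, scrub_checksum() only counts an error as unverified when none of the rechecks reproduces it; the per-page results are OR-ed together and unverified_errors is bumped only if the whole bio came back clean. A small standalone sketch of that accounting (the return convention, non-zero when the recheck confirms and handles an error, is inferred from the hunk above rather than quoted from the kernel):

#include <stdio.h>

/* stand-in for scrub_recheck_error(): returns non-zero if the error
 * was confirmed (and corrected or counted), 0 if the re-read was fine */
static int recheck_page(int page)
{
	return page == 2;              /* pretend only page 2 really is bad */
}

int main(void)
{
	int unverified_errors = 0;
	int confirmed = 0;
	int i;

	for (i = 0; i < 4; i++)        /* models the loop over sbio->count */
		confirmed |= recheck_page(i);

	if (!confirmed)                /* no page misbehaved on the recheck */
		unverified_errors++;

	printf("confirmed=%d unverified_errors=%d\n", confirmed, unverified_errors);
	return 0;
}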
@@ -604,7 +991,7 @@ nomem:
604} 991}
605 992
606static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 993static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
607 u64 physical, u64 flags, u64 gen, u64 mirror_num, 994 u64 physical, u64 flags, u64 gen, int mirror_num,
608 u8 *csum, int force) 995 u8 *csum, int force)
609{ 996{
610 struct scrub_bio *sbio; 997 struct scrub_bio *sbio;
@@ -701,7 +1088,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
701 1088
702/* scrub extent tries to collect up to 64 kB for each bio */ 1089/* scrub extent tries to collect up to 64 kB for each bio */
703static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 1090static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
704 u64 physical, u64 flags, u64 gen, u64 mirror_num) 1091 u64 physical, u64 flags, u64 gen, int mirror_num)
705{ 1092{
706 int ret; 1093 int ret;
707 u8 csum[BTRFS_CSUM_SIZE]; 1094 u8 csum[BTRFS_CSUM_SIZE];
@@ -741,13 +1128,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
741 int slot; 1128 int slot;
742 int i; 1129 int i;
743 u64 nstripes; 1130 u64 nstripes;
744 int start_stripe;
745 struct extent_buffer *l; 1131 struct extent_buffer *l;
746 struct btrfs_key key; 1132 struct btrfs_key key;
747 u64 physical; 1133 u64 physical;
748 u64 logical; 1134 u64 logical;
749 u64 generation; 1135 u64 generation;
750 u64 mirror_num; 1136 int mirror_num;
1137 struct reada_control *reada1;
1138 struct reada_control *reada2;
1139 struct btrfs_key key_start;
1140 struct btrfs_key key_end;
751 1141
752 u64 increment = map->stripe_len; 1142 u64 increment = map->stripe_len;
753 u64 offset; 1143 u64 offset;
@@ -758,102 +1148,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
758 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 1148 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
759 offset = map->stripe_len * num; 1149 offset = map->stripe_len * num;
760 increment = map->stripe_len * map->num_stripes; 1150 increment = map->stripe_len * map->num_stripes;
761 mirror_num = 0; 1151 mirror_num = 1;
762 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 1152 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
763 int factor = map->num_stripes / map->sub_stripes; 1153 int factor = map->num_stripes / map->sub_stripes;
764 offset = map->stripe_len * (num / map->sub_stripes); 1154 offset = map->stripe_len * (num / map->sub_stripes);
765 increment = map->stripe_len * factor; 1155 increment = map->stripe_len * factor;
766 mirror_num = num % map->sub_stripes; 1156 mirror_num = num % map->sub_stripes + 1;
767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 1157 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
768 increment = map->stripe_len; 1158 increment = map->stripe_len;
769 mirror_num = num % map->num_stripes; 1159 mirror_num = num % map->num_stripes + 1;
770 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 1160 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
771 increment = map->stripe_len; 1161 increment = map->stripe_len;
772 mirror_num = num % map->num_stripes; 1162 mirror_num = num % map->num_stripes + 1;
773 } else { 1163 } else {
774 increment = map->stripe_len; 1164 increment = map->stripe_len;
775 mirror_num = 0; 1165 mirror_num = 1;
776 } 1166 }
777 1167
778 path = btrfs_alloc_path(); 1168 path = btrfs_alloc_path();
779 if (!path) 1169 if (!path)
780 return -ENOMEM; 1170 return -ENOMEM;
781 1171
782 path->reada = 2;
783 path->search_commit_root = 1; 1172 path->search_commit_root = 1;
784 path->skip_locking = 1; 1173 path->skip_locking = 1;
785 1174
786 /* 1175 /*
787 * find all extents for each stripe and just read them to get 1176 * trigger the readahead for extent tree csum tree and wait for
788 * them into the page cache 1177 * completion. During readahead, the scrub is officially paused
789 * FIXME: we can do better. build a more intelligent prefetching 1178 * to not hold off transaction commits
790 */ 1179 */
791 logical = base + offset; 1180 logical = base + offset;
792 physical = map->stripes[num].physical;
793 ret = 0;
794 for (i = 0; i < nstripes; ++i) {
795 key.objectid = logical;
796 key.type = BTRFS_EXTENT_ITEM_KEY;
797 key.offset = (u64)0;
798
799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
800 if (ret < 0)
801 goto out_noplug;
802
803 /*
804 * we might miss half an extent here, but that doesn't matter,
805 * as it's only the prefetch
806 */
807 while (1) {
808 l = path->nodes[0];
809 slot = path->slots[0];
810 if (slot >= btrfs_header_nritems(l)) {
811 ret = btrfs_next_leaf(root, path);
812 if (ret == 0)
813 continue;
814 if (ret < 0)
815 goto out_noplug;
816 1181
817 break; 1182 wait_event(sdev->list_wait,
818 } 1183 atomic_read(&sdev->in_flight) == 0);
819 btrfs_item_key_to_cpu(l, &key, slot); 1184 atomic_inc(&fs_info->scrubs_paused);
1185 wake_up(&fs_info->scrub_pause_wait);
820 1186
821 if (key.objectid >= logical + map->stripe_len) 1187 /* FIXME it might be better to start readahead at commit root */
822 break; 1188 key_start.objectid = logical;
1189 key_start.type = BTRFS_EXTENT_ITEM_KEY;
1190 key_start.offset = (u64)0;
1191 key_end.objectid = base + offset + nstripes * increment;
1192 key_end.type = BTRFS_EXTENT_ITEM_KEY;
1193 key_end.offset = (u64)0;
1194 reada1 = btrfs_reada_add(root, &key_start, &key_end);
1195
1196 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1197 key_start.type = BTRFS_EXTENT_CSUM_KEY;
1198 key_start.offset = logical;
1199 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1200 key_end.type = BTRFS_EXTENT_CSUM_KEY;
1201 key_end.offset = base + offset + nstripes * increment;
1202 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1203
1204 if (!IS_ERR(reada1))
1205 btrfs_reada_wait(reada1);
1206 if (!IS_ERR(reada2))
1207 btrfs_reada_wait(reada2);
823 1208
824 path->slots[0]++; 1209 mutex_lock(&fs_info->scrub_lock);
825 } 1210 while (atomic_read(&fs_info->scrub_pause_req)) {
826 btrfs_release_path(path); 1211 mutex_unlock(&fs_info->scrub_lock);
827 logical += increment; 1212 wait_event(fs_info->scrub_pause_wait,
828 physical += map->stripe_len; 1213 atomic_read(&fs_info->scrub_pause_req) == 0);
829 cond_resched(); 1214 mutex_lock(&fs_info->scrub_lock);
830 } 1215 }
1216 atomic_dec(&fs_info->scrubs_paused);
1217 mutex_unlock(&fs_info->scrub_lock);
1218 wake_up(&fs_info->scrub_pause_wait);
831 1219
832 /* 1220 /*
833 * collect all data csums for the stripe to avoid seeking during 1221 * collect all data csums for the stripe to avoid seeking during
834 * the scrub. This might currently (crc32) end up to be about 1MB 1222 * the scrub. This might currently (crc32) end up to be about 1MB
835 */ 1223 */
836 start_stripe = 0;
837 blk_start_plug(&plug); 1224 blk_start_plug(&plug);
838again:
839 logical = base + offset + start_stripe * increment;
840 for (i = start_stripe; i < nstripes; ++i) {
841 ret = btrfs_lookup_csums_range(csum_root, logical,
842 logical + map->stripe_len - 1,
843 &sdev->csum_list, 1);
844 if (ret)
845 goto out;
846 1225
847 logical += increment;
848 cond_resched();
849 }
850 /* 1226 /*
851 * now find all extents for each stripe and scrub them 1227 * now find all extents for each stripe and scrub them
852 */ 1228 */
853 logical = base + offset + start_stripe * increment; 1229 logical = base + offset;
854 physical = map->stripes[num].physical + start_stripe * map->stripe_len; 1230 physical = map->stripes[num].physical;
855 ret = 0; 1231 ret = 0;
856 for (i = start_stripe; i < nstripes; ++i) { 1232 for (i = 0; i < nstripes; ++i) {
857 /* 1233 /*
858 * canceled? 1234 * canceled?
859 */ 1235 */
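The hand-rolled prefetch loop above is replaced by two btrfs_reada_add() requests that cover the whole logical range this device stripe will touch, one for the extent tree and one for the csum tree. The range follows from the offset/increment arithmetic at the top of scrub_stripe(); a small sketch of that calculation for the RAID0 case (all numbers invented for illustration):

#include <stdio.h>

int main(void)
{
	unsigned long long base = 1ULL << 30;       /* chunk logical start */
	unsigned long long stripe_len = 64 << 10;   /* map->stripe_len */
	int num_stripes = 4;                        /* map->num_stripes */
	int num = 1;                                /* device stripe being scrubbed */
	unsigned long long nstripes = 16;           /* stripes on this device */

	/* RAID0 case from scrub_stripe() */
	unsigned long long offset = stripe_len * num;
	unsigned long long increment = stripe_len * num_stripes;

	unsigned long long logical = base + offset;
	unsigned long long end = base + offset + nstripes * increment;

	/* this is the [key_start, key_end) range handed to btrfs_reada_add()
	 * for both the extent tree and the csum tree */
	printf("readahead range: [%llu, %llu)\n", logical, end);
	return 0;
}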
@@ -882,11 +1258,14 @@ again:
882 atomic_dec(&fs_info->scrubs_paused); 1258 atomic_dec(&fs_info->scrubs_paused);
883 mutex_unlock(&fs_info->scrub_lock); 1259 mutex_unlock(&fs_info->scrub_lock);
884 wake_up(&fs_info->scrub_pause_wait); 1260 wake_up(&fs_info->scrub_pause_wait);
885 scrub_free_csums(sdev);
886 start_stripe = i;
887 goto again;
888 } 1261 }
889 1262
1263 ret = btrfs_lookup_csums_range(csum_root, logical,
1264 logical + map->stripe_len - 1,
1265 &sdev->csum_list, 1);
1266 if (ret)
1267 goto out;
1268
890 key.objectid = logical; 1269 key.objectid = logical;
891 key.type = BTRFS_EXTENT_ITEM_KEY; 1270 key.type = BTRFS_EXTENT_ITEM_KEY;
892 key.offset = (u64)0; 1271 key.offset = (u64)0;
@@ -982,7 +1361,6 @@ next:
982 1361
983out: 1362out:
984 blk_finish_plug(&plug); 1363 blk_finish_plug(&plug);
985out_noplug:
986 btrfs_free_path(path); 1364 btrfs_free_path(path);
987 return ret < 0 ? ret : 0; 1365 return ret < 0 ? ret : 0;
988} 1366}
@@ -1253,10 +1631,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1253 ret = scrub_enumerate_chunks(sdev, start, end); 1631 ret = scrub_enumerate_chunks(sdev, start, end);
1254 1632
1255 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 1633 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1256
1257 atomic_dec(&fs_info->scrubs_running); 1634 atomic_dec(&fs_info->scrubs_running);
1258 wake_up(&fs_info->scrub_pause_wait); 1635 wake_up(&fs_info->scrub_pause_wait);
1259 1636
1637 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
1638
1260 if (progress) 1639 if (progress)
1261 memcpy(progress, &sdev->stat, sizeof(*progress)); 1640 memcpy(progress, &sdev->stat, sizeof(*progress));
1262 1641
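One detail that runs through the whole scrub.c part of this patch: mirror_num changed from a 0-based u64 index to a 1-based int, with 0 now meaning "any mirror". That is why the repair loop skips i + 1 == sbio->spag[ix].mirror_num and every RAID branch in scrub_stripe() adds + 1. A quick sketch of the new convention (plain C, values invented):

#include <stdio.h>

int main(void)
{
	int num_stripes = 2;      /* RAID1 mirrors */
	int num = 1;              /* device stripe being scrubbed */
	int mirror_num = num % num_stripes + 1;   /* 1-based, 0 means "any" */
	int i;

	for (i = 0; i < num_stripes; i++) {
		if (i + 1 == mirror_num)          /* skip the copy we already read */
			continue;
		printf("try repair from mirror %d\n", i + 1);
	}
	return 0;
}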
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d7..57080dffdfc6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,7 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h>
43#include "compat.h" 44#include "compat.h"
44#include "delayed-inode.h" 45#include "delayed-inode.h"
45#include "ctree.h" 46#include "ctree.h"
@@ -58,6 +59,7 @@
58#include <trace/events/btrfs.h> 59#include <trace/events/btrfs.h>
59 60
60static const struct super_operations btrfs_super_ops; 61static const struct super_operations btrfs_super_ops;
62static struct file_system_type btrfs_fs_type;
61 63
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 64static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16]) 65 char nbuf[16])
@@ -162,7 +164,7 @@ enum {
162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err, 167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
166}; 168};
167 169
168static match_table_t tokens = { 170static match_table_t tokens = {
@@ -195,6 +197,8 @@ static match_table_t tokens = {
195 {Opt_subvolrootid, "subvolrootid=%d"}, 197 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"}, 198 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"}, 199 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "no_space_cache"},
201 {Opt_recovery, "recovery"},
198 {Opt_err, NULL}, 202 {Opt_err, NULL},
199}; 203};
200 204
@@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206{ 210{
207 struct btrfs_fs_info *info = root->fs_info; 211 struct btrfs_fs_info *info = root->fs_info;
208 substring_t args[MAX_OPT_ARGS]; 212 substring_t args[MAX_OPT_ARGS];
209 char *p, *num, *orig; 213 char *p, *num, *orig = NULL;
214 u64 cache_gen;
210 int intarg; 215 int intarg;
211 int ret = 0; 216 int ret = 0;
212 char *compress_type; 217 char *compress_type;
213 bool compress_force = false; 218 bool compress_force = false;
214 219
220 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
221 if (cache_gen)
222 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
223
215 if (!options) 224 if (!options)
216 return 0; 225 goto out;
217 226
218 /* 227 /*
219 * strsep changes the string, duplicate it because parse_options 228 * strsep changes the string, duplicate it because parse_options
@@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
360 btrfs_set_opt(info->mount_opt, DISCARD); 369 btrfs_set_opt(info->mount_opt, DISCARD);
361 break; 370 break;
362 case Opt_space_cache: 371 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 372 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break; 373 break;
374 case Opt_no_space_cache:
375 printk(KERN_INFO "btrfs: disabling disk space caching\n");
376 btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
377 break;
366 case Opt_inode_cache: 378 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 379 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 380 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
381 printk(KERN_INFO "btrfs: enabling auto defrag"); 393 printk(KERN_INFO "btrfs: enabling auto defrag");
382 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 394 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
383 break; 395 break;
396 case Opt_recovery:
397 printk(KERN_INFO "btrfs: enabling auto recovery");
398 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break;
384 case Opt_err: 400 case Opt_err:
385 printk(KERN_INFO "btrfs: unrecognized mount option " 401 printk(KERN_INFO "btrfs: unrecognized mount option "
386 "'%s'\n", p); 402 "'%s'\n", p);
@@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
391 } 407 }
392 } 408 }
393out: 409out:
410 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
411 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
394 kfree(orig); 412 kfree(orig);
395 return ret; 413 return ret;
396} 414}
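The option handling above also changes the default: if the superblock carries a cache generation, SPACE_CACHE is enabled before the mount string is even parsed, the new no_space_cache option turns it back off, and a single "disk space caching is enabled" message is printed at the end if the option is still set. A crude model of that precedence (plain C; a real parser tokenizes the options instead of using strstr):

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned long long cache_gen = 123;   /* btrfs_super_cache_generation() */
	const char *options = "compress,no_space_cache";
	int space_cache = cache_gen != 0;     /* default follows the superblock */

	if (strstr(options, "no_space_cache"))
		space_cache = 0;
	else if (strstr(options, "space_cache"))
		space_cache = 1;

	if (space_cache)
		printf("btrfs: disk space caching is enabled\n");
	return 0;
}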
@@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
406 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) 424 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
407{ 425{
408 substring_t args[MAX_OPT_ARGS]; 426 substring_t args[MAX_OPT_ARGS];
409 char *opts, *orig, *p; 427 char *device_name, *opts, *orig, *p;
410 int error = 0; 428 int error = 0;
411 int intarg; 429 int intarg;
412 430
413 if (!options) 431 if (!options)
414 goto out; 432 return 0;
415 433
416 /* 434 /*
417 * strsep changes the string, duplicate it because parse_options 435 * strsep changes the string, duplicate it because parse_options
@@ -457,29 +475,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
457 } 475 }
458 break; 476 break;
459 case Opt_device: 477 case Opt_device:
460 error = btrfs_scan_one_device(match_strdup(&args[0]), 478 device_name = match_strdup(&args[0]);
479 if (!device_name) {
480 error = -ENOMEM;
481 goto out;
482 }
483 error = btrfs_scan_one_device(device_name,
461 flags, holder, fs_devices); 484 flags, holder, fs_devices);
485 kfree(device_name);
462 if (error) 486 if (error)
463 goto out_free_opts; 487 goto out;
464 break; 488 break;
465 default: 489 default:
466 break; 490 break;
467 } 491 }
468 } 492 }
469 493
470 out_free_opts: 494out:
471 kfree(orig); 495 kfree(orig);
472 out:
473 /*
474 * If no subvolume name is specified we use the default one. Allocate
475 * a copy of the string "." here so that code later in the
476 * mount path doesn't care if it's the default volume or another one.
477 */
478 if (!*subvol_name) {
479 *subvol_name = kstrdup(".", GFP_KERNEL);
480 if (!*subvol_name)
481 return -ENOMEM;
482 }
483 return error; 496 return error;
484} 497}
485 498
@@ -492,7 +505,6 @@ static struct dentry *get_default_root(struct super_block *sb,
492 struct btrfs_path *path; 505 struct btrfs_path *path;
493 struct btrfs_key location; 506 struct btrfs_key location;
494 struct inode *inode; 507 struct inode *inode;
495 struct dentry *dentry;
496 u64 dir_id; 508 u64 dir_id;
497 int new = 0; 509 int new = 0;
498 510
@@ -517,7 +529,7 @@ static struct dentry *get_default_root(struct super_block *sb,
517 * will mount by default if we haven't been given a specific subvolume 529 * will mount by default if we haven't been given a specific subvolume
518 * to mount. 530 * to mount.
519 */ 531 */
520 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 532 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
521 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 533 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
522 if (IS_ERR(di)) { 534 if (IS_ERR(di)) {
523 btrfs_free_path(path); 535 btrfs_free_path(path);
@@ -566,29 +578,7 @@ setup_root:
566 return dget(sb->s_root); 578 return dget(sb->s_root);
567 } 579 }
568 580
569 if (new) { 581 return d_obtain_alias(inode);
570 const struct qstr name = { .name = "/", .len = 1 };
571
572 /*
573 * New inode, we need to make the dentry a sibling of s_root so
574 * everything gets cleaned up properly on unmount.
575 */
576 dentry = d_alloc(sb->s_root, &name);
577 if (!dentry) {
578 iput(inode);
579 return ERR_PTR(-ENOMEM);
580 }
581 d_splice_alias(inode, dentry);
582 } else {
583 /*
584 * We found the inode in cache, just find a dentry for it and
585 * put the reference to the inode we just got.
586 */
587 dentry = d_find_alias(inode);
588 iput(inode);
589 }
590
591 return dentry;
592} 582}
593 583
594static int btrfs_fill_super(struct super_block *sb, 584static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +709,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
719 seq_puts(seq, ",noacl"); 709 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE)) 710 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache"); 711 seq_puts(seq, ",space_cache");
712 else
713 seq_puts(seq, ",no_space_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE)) 714 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache"); 715 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 716 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +745,137 @@ static int btrfs_set_super(struct super_block *s, void *data)
753 return set_anon_super(s, data); 745 return set_anon_super(s, data);
754} 746}
755 747
748/*
749 * subvolumes are identified by ino 256
750 */
751static inline int is_subvolume_inode(struct inode *inode)
752{
753 if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
754 return 1;
755 return 0;
756}
757
758/*
759 * This will strip out the subvol=%s argument for an argument string and add
760 * subvolid=0 to make sure we get the actual tree root for path walking to the
761 * subvol we want.
762 */
763static char *setup_root_args(char *args)
764{
765 unsigned copied = 0;
766 unsigned len = strlen(args) + 2;
767 char *pos;
768 char *ret;
769
770 /*
771 * We need the same args as before, but minus
772 *
773 * subvol=a
774 *
775 * and add
776 *
777 * subvolid=0
778 *
779 * which is a difference of 2 characters, so we allocate strlen(args) +
780 * 2 characters.
781 */
782 ret = kzalloc(len * sizeof(char), GFP_NOFS);
783 if (!ret)
784 return NULL;
785 pos = strstr(args, "subvol=");
786
787 /* This shouldn't happen, but just in case.. */
788 if (!pos) {
789 kfree(ret);
790 return NULL;
791 }
792
793 /*
 794 * The subvol=<> arg is not at the front of the string, copy everything
795 * up to that into ret.
796 */
797 if (pos != args) {
798 *pos = '\0';
799 strcpy(ret, args);
800 copied += strlen(args);
801 pos++;
802 }
803
804 strncpy(ret + copied, "subvolid=0", len - copied);
805
806 /* Length of subvolid=0 */
807 copied += 10;
808
809 /*
 810 * If there is no ',' after the subvol= option then we know there are no
811 * other options and we can just return.
812 */
813 pos = strchr(pos, ',');
814 if (!pos)
815 return ret;
816
817 /* Copy the rest of the arguments into our buffer */
818 strncpy(ret + copied, pos, len - copied);
819 copied += strlen(pos);
820
821 return ret;
822}
823
824static struct dentry *mount_subvol(const char *subvol_name, int flags,
825 const char *device_name, char *data)
826{
827 struct super_block *s;
828 struct dentry *root;
829 struct vfsmount *mnt;
830 struct mnt_namespace *ns_private;
831 char *newargs;
832 struct path path;
833 int error;
834
835 newargs = setup_root_args(data);
836 if (!newargs)
837 return ERR_PTR(-ENOMEM);
838 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
839 newargs);
840 kfree(newargs);
841 if (IS_ERR(mnt))
842 return ERR_CAST(mnt);
843
844 ns_private = create_mnt_ns(mnt);
845 if (IS_ERR(ns_private)) {
846 mntput(mnt);
847 return ERR_CAST(ns_private);
848 }
849
850 /*
851 * This will trigger the automount of the subvol so we can just
852 * drop the mnt we have here and return the dentry that we
853 * found.
854 */
855 error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
856 LOOKUP_FOLLOW, &path);
857 put_mnt_ns(ns_private);
858 if (error)
859 return ERR_PTR(error);
860
861 if (!is_subvolume_inode(path.dentry->d_inode)) {
862 path_put(&path);
863 mntput(mnt);
864 error = -EINVAL;
865 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
866 subvol_name);
867 return ERR_PTR(-EINVAL);
868 }
869
870 /* Get a ref to the sb and the dentry we found and return it */
871 s = path.mnt->mnt_sb;
872 atomic_inc(&s->s_active);
873 root = dget(path.dentry);
874 path_put(&path);
875 down_write(&s->s_umount);
876
877 return root;
878}
756 879
757/* 880/*
758 * Find a superblock for the given device / mount point. 881 * Find a superblock for the given device / mount point.
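To make the setup_root_args()/mount_subvol() addition concrete: a mount with "subvol=snap,compress=lzo" is internally restarted as a mount of the tree root with "subvolid=0,compress=lzo", and the requested subvolume is then reached by a path walk. A simplified userspace sketch of the string rewrite (this models the helper, it is not the kernel function itself):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* simplified model of setup_root_args(): drop "subvol=<name>" and
 * substitute "subvolid=0", keeping every other option as-is */
static char *rewrite_args(const char *args)
{
	const char *pos = strstr(args, "subvol=");
	const char *rest;
	char *ret;

	if (!pos)
		return NULL;
	rest = strchr(pos, ',');            /* options after subvol=, if any */

	ret = calloc(strlen(args) + 11, 1); /* generous: room for "subvolid=0" */
	if (!ret)
		return NULL;
	strncat(ret, args, pos - args);     /* everything before subvol= */
	strcat(ret, "subvolid=0");
	if (rest)
		strcat(ret, rest);          /* everything after the subvol name */
	return ret;
}

int main(void)
{
	char *s = rewrite_args("subvol=snap,compress=lzo");

	printf("%s\n", s ? s : "(error)"); /* prints: subvolid=0,compress=lzo */
	free(s);
	return 0;
}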
@@ -784,13 +907,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
784 if (error) 907 if (error)
785 return ERR_PTR(error); 908 return ERR_PTR(error);
786 909
910 if (subvol_name) {
911 root = mount_subvol(subvol_name, flags, device_name, data);
912 kfree(subvol_name);
913 return root;
914 }
915
787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 916 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
788 if (error) 917 if (error)
789 goto error_free_subvol_name; 918 return ERR_PTR(error);
790 919
791 error = btrfs_open_devices(fs_devices, mode, fs_type); 920 error = btrfs_open_devices(fs_devices, mode, fs_type);
792 if (error) 921 if (error)
793 goto error_free_subvol_name; 922 return ERR_PTR(error);
794 923
795 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { 924 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
796 error = -EACCES; 925 error = -EACCES;
@@ -813,88 +942,57 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
813 fs_info->fs_devices = fs_devices; 942 fs_info->fs_devices = fs_devices;
814 tree_root->fs_info = fs_info; 943 tree_root->fs_info = fs_info;
815 944
945 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
946 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
947 if (!fs_info->super_copy || !fs_info->super_for_commit) {
948 error = -ENOMEM;
949 goto error_close_devices;
950 }
951
816 bdev = fs_devices->latest_bdev; 952 bdev = fs_devices->latest_bdev;
817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 953 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
818 if (IS_ERR(s)) 954 if (IS_ERR(s)) {
819 goto error_s; 955 error = PTR_ERR(s);
956 goto error_close_devices;
957 }
820 958
821 if (s->s_root) { 959 if (s->s_root) {
822 if ((flags ^ s->s_flags) & MS_RDONLY) { 960 if ((flags ^ s->s_flags) & MS_RDONLY) {
823 deactivate_locked_super(s); 961 deactivate_locked_super(s);
824 error = -EBUSY; 962 return ERR_PTR(-EBUSY);
825 goto error_close_devices;
826 } 963 }
827 964
828 btrfs_close_devices(fs_devices); 965 btrfs_close_devices(fs_devices);
829 kfree(fs_info); 966 free_fs_info(fs_info);
830 kfree(tree_root); 967 kfree(tree_root);
831 } else { 968 } else {
832 char b[BDEVNAME_SIZE]; 969 char b[BDEVNAME_SIZE];
833 970
834 s->s_flags = flags | MS_NOSEC; 971 s->s_flags = flags | MS_NOSEC;
835 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 972 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
973 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
836 error = btrfs_fill_super(s, fs_devices, data, 974 error = btrfs_fill_super(s, fs_devices, data,
837 flags & MS_SILENT ? 1 : 0); 975 flags & MS_SILENT ? 1 : 0);
838 if (error) { 976 if (error) {
839 deactivate_locked_super(s); 977 deactivate_locked_super(s);
840 goto error_free_subvol_name; 978 return ERR_PTR(error);
841 } 979 }
842 980
843 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
844 s->s_flags |= MS_ACTIVE; 981 s->s_flags |= MS_ACTIVE;
845 } 982 }
846 983
847 /* if they gave us a subvolume name bind mount into that */ 984 root = get_default_root(s, subvol_objectid);
848 if (strcmp(subvol_name, ".")) { 985 if (IS_ERR(root)) {
849 struct dentry *new_root; 986 deactivate_locked_super(s);
850 987 return root;
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
858 mutex_lock(&root->d_inode->i_mutex);
859 new_root = lookup_one_len(subvol_name, root,
860 strlen(subvol_name));
861 mutex_unlock(&root->d_inode->i_mutex);
862
863 if (IS_ERR(new_root)) {
864 dput(root);
865 deactivate_locked_super(s);
866 error = PTR_ERR(new_root);
867 goto error_free_subvol_name;
868 }
869 if (!new_root->d_inode) {
870 dput(root);
871 dput(new_root);
872 deactivate_locked_super(s);
873 error = -ENXIO;
874 goto error_free_subvol_name;
875 }
876 dput(root);
877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
885 } 988 }
886 989
887 kfree(subvol_name);
888 return root; 990 return root;
889 991
890error_s:
891 error = PTR_ERR(s);
892error_close_devices: 992error_close_devices:
893 btrfs_close_devices(fs_devices); 993 btrfs_close_devices(fs_devices);
894 kfree(fs_info); 994 free_fs_info(fs_info);
895 kfree(tree_root); 995 kfree(tree_root);
896error_free_subvol_name:
897 kfree(subvol_name);
898 return ERR_PTR(error); 996 return ERR_PTR(error);
899} 997}
900 998
@@ -919,7 +1017,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
919 if (root->fs_info->fs_devices->rw_devices == 0) 1017 if (root->fs_info->fs_devices->rw_devices == 0)
920 return -EACCES; 1018 return -EACCES;
921 1019
922 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 1020 if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
923 return -EINVAL; 1021 return -EINVAL;
924 1022
925 ret = btrfs_cleanup_fs_roots(root->fs_info); 1023 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -1085,7 +1183,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1085static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1183static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1086{ 1184{
1087 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1185 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
1088 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1186 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1089 struct list_head *head = &root->fs_info->space_info; 1187 struct list_head *head = &root->fs_info->space_info;
1090 struct btrfs_space_info *found; 1188 struct btrfs_space_info *found;
1091 u64 total_used = 0; 1189 u64 total_used = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a155..960835eaf4da 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
55 struct btrfs_transaction *cur_trans; 55 struct btrfs_transaction *cur_trans;
56 56
57 spin_lock(&root->fs_info->trans_lock); 57 spin_lock(&root->fs_info->trans_lock);
58loop:
58 if (root->fs_info->trans_no_join) { 59 if (root->fs_info->trans_no_join) {
59 if (!nofail) { 60 if (!nofail) {
60 spin_unlock(&root->fs_info->trans_lock); 61 spin_unlock(&root->fs_info->trans_lock);
@@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
75 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 76 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
76 if (!cur_trans) 77 if (!cur_trans)
77 return -ENOMEM; 78 return -ENOMEM;
79
78 spin_lock(&root->fs_info->trans_lock); 80 spin_lock(&root->fs_info->trans_lock);
79 if (root->fs_info->running_transaction) { 81 if (root->fs_info->running_transaction) {
82 /*
83 * someone started a transaction after we unlocked. Make sure
84 * to redo the trans_no_join checks above
85 */
80 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 86 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
81 cur_trans = root->fs_info->running_transaction; 87 cur_trans = root->fs_info->running_transaction;
82 atomic_inc(&cur_trans->use_count); 88 goto loop;
83 atomic_inc(&cur_trans->num_writers);
84 cur_trans->num_joined++;
85 spin_unlock(&root->fs_info->trans_lock);
86 return 0;
87 } 89 }
90
88 atomic_set(&cur_trans->num_writers, 1); 91 atomic_set(&cur_trans->num_writers, 1);
89 cur_trans->num_joined = 0; 92 cur_trans->num_joined = 0;
90 init_waitqueue_head(&cur_trans->writer_wait); 93 init_waitqueue_head(&cur_trans->writer_wait);
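The join_transaction() rework above replaces the old "someone else won the race" branch with a retry: after dropping trans_lock to allocate, it re-takes the lock and redoes the trans_no_join and running_transaction checks from the top via goto loop. A compact pthread model of that unlock, allocate, relock, recheck pattern (all names in the sketch are invented):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t trans_lock = PTHREAD_MUTEX_INITIALIZER;
static struct transaction { int id; } *running_transaction;
static int trans_no_join;

static struct transaction *join_transaction(void)
{
	struct transaction *cur;

	pthread_mutex_lock(&trans_lock);
loop:
	if (trans_no_join) {                 /* re-checked after every retry */
		pthread_mutex_unlock(&trans_lock);
		return NULL;
	}
	if (running_transaction) {
		cur = running_transaction;
		pthread_mutex_unlock(&trans_lock);
		return cur;
	}
	pthread_mutex_unlock(&trans_lock);   /* can't allocate under the lock */
	cur = calloc(1, sizeof(*cur));
	pthread_mutex_lock(&trans_lock);
	if (running_transaction) {           /* lost the race: redo the checks */
		free(cur);
		goto loop;
	}
	running_transaction = cur;
	pthread_mutex_unlock(&trans_lock);
	return cur;
}

int main(void)
{
	printf("joined %p\n", (void *)join_transaction());
	return 0;
}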
@@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
275 */ 278 */
276 if (num_items > 0 && root != root->fs_info->chunk_root) { 279 if (num_items > 0 && root != root->fs_info->chunk_root) {
277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 280 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278 ret = btrfs_block_rsv_add(NULL, root, 281 ret = btrfs_block_rsv_add(root,
279 &root->fs_info->trans_block_rsv, 282 &root->fs_info->trans_block_rsv,
280 num_bytes); 283 num_bytes);
281 if (ret) 284 if (ret)
@@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 421 struct btrfs_root *root)
419{ 422{
420 int ret; 423 int ret;
421 ret = btrfs_block_rsv_check(trans, root, 424
422 &root->fs_info->global_block_rsv, 0, 5); 425 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
423 return ret ? 1 : 0; 426 return ret ? 1 : 0;
424} 427}
425 428
@@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
427 struct btrfs_root *root) 430 struct btrfs_root *root)
428{ 431{
429 struct btrfs_transaction *cur_trans = trans->transaction; 432 struct btrfs_transaction *cur_trans = trans->transaction;
433 struct btrfs_block_rsv *rsv = trans->block_rsv;
430 int updates; 434 int updates;
431 435
432 smp_mb(); 436 smp_mb();
433 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 437 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
434 return 1; 438 return 1;
435 439
440 /*
441 * We need to do this in case we're deleting csums so the global block
 442 * rsv gets used instead of the csum block rsv.
443 */
444 trans->block_rsv = NULL;
445
436 updates = trans->delayed_ref_updates; 446 updates = trans->delayed_ref_updates;
437 trans->delayed_ref_updates = 0; 447 trans->delayed_ref_updates = 0;
438 if (updates) 448 if (updates)
439 btrfs_run_delayed_refs(trans, root, updates); 449 btrfs_run_delayed_refs(trans, root, updates);
440 450
451 trans->block_rsv = rsv;
452
441 return should_end_transaction(trans, root); 453 return should_end_transaction(trans, root);
442} 454}
443 455
@@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
453 return 0; 465 return 0;
454 } 466 }
455 467
468 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL;
456 while (count < 4) { 470 while (count < 4) {
457 unsigned long cur = trans->delayed_ref_updates; 471 unsigned long cur = trans->delayed_ref_updates;
458 trans->delayed_ref_updates = 0; 472 trans->delayed_ref_updates = 0;
@@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
473 count++; 487 count++;
474 } 488 }
475 489
476 btrfs_trans_release_metadata(trans, root);
477
478 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 490 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
479 should_end_transaction(trans, root)) { 491 should_end_transaction(trans, root)) {
480 trans->transaction->blocked = 1; 492 trans->transaction->blocked = 1;
@@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562int btrfs_write_marked_extents(struct btrfs_root *root, 574int btrfs_write_marked_extents(struct btrfs_root *root,
563 struct extent_io_tree *dirty_pages, int mark) 575 struct extent_io_tree *dirty_pages, int mark)
564{ 576{
565 int ret;
566 int err = 0; 577 int err = 0;
567 int werr = 0; 578 int werr = 0;
568 struct page *page; 579 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
569 struct inode *btree_inode = root->fs_info->btree_inode;
570 u64 start = 0; 580 u64 start = 0;
571 u64 end; 581 u64 end;
572 unsigned long index;
573
574 while (1) {
575 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
576 mark);
577 if (ret)
578 break;
579 while (start <= end) {
580 cond_resched();
581
582 index = start >> PAGE_CACHE_SHIFT;
583 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
584 page = find_get_page(btree_inode->i_mapping, index);
585 if (!page)
586 continue;
587
588 btree_lock_page_hook(page);
589 if (!page->mapping) {
590 unlock_page(page);
591 page_cache_release(page);
592 continue;
593 }
594 582
595 if (PageWriteback(page)) { 583 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
596 if (PageDirty(page)) 584 mark)) {
597 wait_on_page_writeback(page); 585 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
598 else { 586 GFP_NOFS);
599 unlock_page(page); 587 err = filemap_fdatawrite_range(mapping, start, end);
600 page_cache_release(page); 588 if (err)
601 continue; 589 werr = err;
602 } 590 cond_resched();
603 } 591 start = end + 1;
604 err = write_one_page(page, 0);
605 if (err)
606 werr = err;
607 page_cache_release(page);
608 }
609 } 592 }
610 if (err) 593 if (err)
611 werr = err; 594 werr = err;
@@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
621int btrfs_wait_marked_extents(struct btrfs_root *root, 604int btrfs_wait_marked_extents(struct btrfs_root *root,
622 struct extent_io_tree *dirty_pages, int mark) 605 struct extent_io_tree *dirty_pages, int mark)
623{ 606{
624 int ret;
625 int err = 0; 607 int err = 0;
626 int werr = 0; 608 int werr = 0;
627 struct page *page; 609 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
628 struct inode *btree_inode = root->fs_info->btree_inode;
629 u64 start = 0; 610 u64 start = 0;
630 u64 end; 611 u64 end;
631 unsigned long index;
632
633 while (1) {
634 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
635 mark);
636 if (ret)
637 break;
638 612
639 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 613 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
640 while (start <= end) { 614 EXTENT_NEED_WAIT)) {
641 index = start >> PAGE_CACHE_SHIFT; 615 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
642 start = (u64)(index + 1) << PAGE_CACHE_SHIFT; 616 err = filemap_fdatawait_range(mapping, start, end);
643 page = find_get_page(btree_inode->i_mapping, index); 617 if (err)
644 if (!page) 618 werr = err;
645 continue; 619 cond_resched();
646 if (PageDirty(page)) { 620 start = end + 1;
647 btree_lock_page_hook(page);
648 wait_on_page_writeback(page);
649 err = write_one_page(page, 0);
650 if (err)
651 werr = err;
652 }
653 wait_on_page_writeback(page);
654 page_cache_release(page);
655 cond_resched();
656 }
657 } 621 }
658 if (err) 622 if (err)
659 werr = err; 623 werr = err;
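The rewritten helpers above stop walking btree pages by hand: the write pass converts each dirty range from the transaction's mark to EXTENT_NEED_WAIT and starts writeback with filemap_fdatawrite_range(), and the wait pass later clears EXTENT_NEED_WAIT and calls filemap_fdatawait_range() on the same ranges. A toy two-pass model over an array of ranges (userspace C, no extent_io tree involved):

#include <stdio.h>

enum { MARK_DIRTY = 1, MARK_NEED_WAIT = 2 };

struct range { unsigned long long start, end; int bits; };

int main(void)
{
	struct range ranges[] = {
		{ 0,    4095,  MARK_DIRTY },
		{ 8192, 12287, MARK_DIRTY },
	};
	int n = sizeof(ranges) / sizeof(ranges[0]);
	int i;

	/* pass 1: start writeback, flip DIRTY to NEED_WAIT */
	for (i = 0; i < n; i++) {
		if (!(ranges[i].bits & MARK_DIRTY))
			continue;
		printf("fdatawrite [%llu, %llu]\n", ranges[i].start, ranges[i].end);
		ranges[i].bits = MARK_NEED_WAIT;
	}

	/* pass 2: wait on exactly the ranges pass 1 submitted */
	for (i = 0; i < n; i++) {
		if (!(ranges[i].bits & MARK_NEED_WAIT))
			continue;
		printf("fdatawait  [%llu, %llu]\n", ranges[i].start, ranges[i].end);
		ranges[i].bits = 0;
	}
	return 0;
}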
@@ -673,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
673 637
674 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 638 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
675 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 639 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
676 return ret || ret2; 640
641 if (ret)
642 return ret;
643 if (ret2)
644 return ret2;
645 return 0;
677} 646}
678 647
679int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 648int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@@ -911,10 +880,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
911 } 880 }
912 881
913 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 882 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
914 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
915 883
916 if (to_reserve > 0) { 884 if (to_reserve > 0) {
917 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 885 ret = btrfs_block_rsv_add(root, &pending->block_rsv,
918 to_reserve); 886 to_reserve);
919 if (ret) { 887 if (ret) {
920 pending->error = ret; 888 pending->error = ret;
@@ -1002,7 +970,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1002 BUG_ON(IS_ERR(pending->snap)); 970 BUG_ON(IS_ERR(pending->snap));
1003 971
1004 btrfs_reloc_post_snapshot(trans, pending); 972 btrfs_reloc_post_snapshot(trans, pending);
1005 btrfs_orphan_post_snapshot(trans, pending);
1006fail: 973fail:
1007 kfree(new_root_item); 974 kfree(new_root_item);
1008 trans->block_rsv = rsv; 975 trans->block_rsv = rsv;
@@ -1032,7 +999,7 @@ static void update_super_roots(struct btrfs_root *root)
1032 struct btrfs_root_item *root_item; 999 struct btrfs_root_item *root_item;
1033 struct btrfs_super_block *super; 1000 struct btrfs_super_block *super;
1034 1001
1035 super = &root->fs_info->super_copy; 1002 super = root->fs_info->super_copy;
1036 1003
1037 root_item = &root->fs_info->chunk_root->root_item; 1004 root_item = &root->fs_info->chunk_root->root_item;
1038 super->chunk_root = root_item->bytenr; 1005 super->chunk_root = root_item->bytenr;
@@ -1043,7 +1010,7 @@ static void update_super_roots(struct btrfs_root *root)
1043 super->root = root_item->bytenr; 1010 super->root = root_item->bytenr;
1044 super->generation = root_item->generation; 1011 super->generation = root_item->generation;
1045 super->root_level = root_item->level; 1012 super->root_level = root_item->level;
1046 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 1013 if (btrfs_test_opt(root, SPACE_CACHE))
1047 super->cache_generation = root_item->generation; 1014 super->cache_generation = root_item->generation;
1048} 1015}
1049 1016
@@ -1168,14 +1135,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1168 1135
1169 btrfs_run_ordered_operations(root, 0); 1136 btrfs_run_ordered_operations(root, 0);
1170 1137
1138 btrfs_trans_release_metadata(trans, root);
1139 trans->block_rsv = NULL;
1140
1171 /* make a pass through all the delayed refs we have so far 1141 /* make a pass through all the delayed refs we have so far
 1172 * any running procs may add more while we are here 1142 * any running procs may add more while we are here
1173 */ 1143 */
1174 ret = btrfs_run_delayed_refs(trans, root, 0); 1144 ret = btrfs_run_delayed_refs(trans, root, 0);
1175 BUG_ON(ret); 1145 BUG_ON(ret);
1176 1146
1177 btrfs_trans_release_metadata(trans, root);
1178
1179 cur_trans = trans->transaction; 1147 cur_trans = trans->transaction;
1180 /* 1148 /*
1181 * set the flushing flag so procs in this transaction have to 1149 * set the flushing flag so procs in this transaction have to
@@ -1341,12 +1309,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1341 update_super_roots(root); 1309 update_super_roots(root);
1342 1310
1343 if (!root->fs_info->log_root_recovering) { 1311 if (!root->fs_info->log_root_recovering) {
1344 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1312 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
1345 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); 1313 btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1346 } 1314 }
1347 1315
1348 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, 1316 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1349 sizeof(root->fs_info->super_copy)); 1317 sizeof(*root->fs_info->super_copy));
1350 1318
1351 trans->transaction->blocked = 0; 1319 trans->transaction->blocked = 0;
1352 spin_lock(&root->fs_info->trans_lock); 1320 spin_lock(&root->fs_info->trans_lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 0618aa39740b..3568374d419d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
276 struct walk_control *wc, u64 gen) 276 struct walk_control *wc, u64 gen)
277{ 277{
278 if (wc->pin) 278 if (wc->pin)
279 btrfs_pin_extent(log->fs_info->extent_root, 279 btrfs_pin_extent_for_log_replay(wc->trans,
280 eb->start, eb->len, 0); 280 log->fs_info->extent_root,
281 eb->start, eb->len);
281 282
282 if (btrfs_buffer_uptodate(eb, gen)) { 283 if (btrfs_buffer_uptodate(eb, gen)) {
283 if (wc->write) 284 if (wc->write)
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1760 1761
1761 WARN_ON(root_owner != 1762 WARN_ON(root_owner !=
1762 BTRFS_TREE_LOG_OBJECTID); 1763 BTRFS_TREE_LOG_OBJECTID);
1763 ret = btrfs_free_reserved_extent(root, 1764 ret = btrfs_free_and_pin_reserved_extent(root,
1764 bytenr, blocksize); 1765 bytenr, blocksize);
1765 BUG_ON(ret); 1766 BUG_ON(ret);
1766 } 1767 }
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1828 btrfs_tree_unlock(next); 1829 btrfs_tree_unlock(next);
1829 1830
1830 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1831 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1831 ret = btrfs_free_reserved_extent(root, 1832 ret = btrfs_free_and_pin_reserved_extent(root,
1832 path->nodes[*level]->start, 1833 path->nodes[*level]->start,
1833 path->nodes[*level]->len); 1834 path->nodes[*level]->len);
1834 BUG_ON(ret); 1835 BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1897 1898
1898 WARN_ON(log->root_key.objectid != 1899 WARN_ON(log->root_key.objectid !=
1899 BTRFS_TREE_LOG_OBJECTID); 1900 BTRFS_TREE_LOG_OBJECTID);
1900 ret = btrfs_free_reserved_extent(log, next->start, 1901 ret = btrfs_free_and_pin_reserved_extent(log, next->start,
1901 next->len); 1902 next->len);
1902 BUG_ON(ret); 1903 BUG_ON(ret);
1903 } 1904 }
@@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2013 /* wait for previous tree log sync to complete */ 2014 /* wait for previous tree log sync to complete */
2014 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2015 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2015 wait_log_commit(trans, root, root->log_transid - 1); 2016 wait_log_commit(trans, root, root->log_transid - 1);
2016
2017 while (1) { 2017 while (1) {
2018 unsigned long batch = root->log_batch; 2018 unsigned long batch = root->log_batch;
2019 if (root->log_multiple_pids) { 2019 /* when we're on an ssd, just kick the log commit out */
2020 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2020 mutex_unlock(&root->log_mutex); 2021 mutex_unlock(&root->log_mutex);
2021 schedule_timeout_uninterruptible(1); 2022 schedule_timeout_uninterruptible(1);
2022 mutex_lock(&root->log_mutex); 2023 mutex_lock(&root->log_mutex);
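The hunk above adds one condition to the log-commit batching heuristic: with multiple writers, btrfs_sync_log() used to drop the mutex and sleep briefly so more writers could join the same commit; on SSD that extra latency buys little, so the wait is skipped. A one-function sketch of the decision (names invented):

#include <stdio.h>

/* should we yield and let more writers join this log commit? */
static int should_batch_log_commit(int on_ssd, int log_multiple_pids)
{
	return !on_ssd && log_multiple_pids;
}

int main(void)
{
	printf("rotational, many writers: %d\n", should_batch_log_commit(0, 1));
	printf("ssd, many writers:        %d\n", should_batch_log_commit(1, 1));
	return 0;
}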
@@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2117 BUG_ON(ret); 2118 BUG_ON(ret);
2118 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2119 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2119 2120
2120 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2121 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2121 log_root_tree->node->start); 2122 log_root_tree->node->start);
2122 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2123 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2123 btrfs_header_level(log_root_tree->node)); 2124 btrfs_header_level(log_root_tree->node));
2124 2125
2125 log_root_tree->log_batch = 0; 2126 log_root_tree->log_batch = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da61..f8e2943101a1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path,
366 } 366 }
367 INIT_LIST_HEAD(&device->dev_alloc_list); 367 INIT_LIST_HEAD(&device->dev_alloc_list);
368 368
369 /* init readahead state */
370 spin_lock_init(&device->reada_lock);
371 device->reada_curr_zone = NULL;
372 atomic_set(&device->reada_in_flight, 0);
373 device->reada_next = 0;
374 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
375 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
376
369 mutex_lock(&fs_devices->device_list_mutex); 377 mutex_lock(&fs_devices->device_list_mutex);
370 list_add_rcu(&device->dev_list, &fs_devices->devices); 378 list_add_rcu(&device->dev_list, &fs_devices->devices);
371 mutex_unlock(&fs_devices->device_list_mutex); 379 mutex_unlock(&fs_devices->device_list_mutex);
@@ -597,10 +605,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
597 set_blocksize(bdev, 4096); 605 set_blocksize(bdev, 4096);
598 606
599 bh = btrfs_read_dev_super(bdev); 607 bh = btrfs_read_dev_super(bdev);
600 if (!bh) { 608 if (!bh)
601 ret = -EINVAL;
602 goto error_close; 609 goto error_close;
603 }
604 610
605 disk_super = (struct btrfs_super_block *)bh->b_data; 611 disk_super = (struct btrfs_super_block *)bh->b_data;
606 devid = btrfs_stack_device_id(&disk_super->dev_item); 612 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +661,7 @@ error:
655 continue; 661 continue;
656 } 662 }
657 if (fs_devices->open_devices == 0) { 663 if (fs_devices->open_devices == 0) {
658 ret = -EIO; 664 ret = -EINVAL;
659 goto out; 665 goto out;
660 } 666 }
661 fs_devices->seeding = seeding; 667 fs_devices->seeding = seeding;
@@ -1013,8 +1019,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1013 } 1019 }
1014 BUG_ON(ret); 1020 BUG_ON(ret);
1015 1021
1016 if (device->bytes_used > 0) 1022 if (device->bytes_used > 0) {
1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1023 u64 len = btrfs_dev_extent_length(leaf, extent);
1024 device->bytes_used -= len;
1025 spin_lock(&root->fs_info->free_chunk_lock);
1026 root->fs_info->free_chunk_space += len;
1027 spin_unlock(&root->fs_info->free_chunk_lock);
1028 }
1018 ret = btrfs_del_item(trans, root, path); 1029 ret = btrfs_del_item(trans, root, path);
1019 1030
1020out: 1031out:
@@ -1356,6 +1367,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1356 if (ret) 1367 if (ret)
1357 goto error_undo; 1368 goto error_undo;
1358 1369
1370 spin_lock(&root->fs_info->free_chunk_lock);
1371 root->fs_info->free_chunk_space = device->total_bytes -
1372 device->bytes_used;
1373 spin_unlock(&root->fs_info->free_chunk_lock);
1374
1359 device->in_fs_metadata = 0; 1375 device->in_fs_metadata = 0;
1360 btrfs_scrub_cancel_dev(root, device); 1376 btrfs_scrub_cancel_dev(root, device);
1361 1377
@@ -1387,8 +1403,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1387 call_rcu(&device->rcu, free_device); 1403 call_rcu(&device->rcu, free_device);
1388 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1404 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1389 1405
1390 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1406 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1391 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1407 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1392 1408
1393 if (cur_devices->open_devices == 0) { 1409 if (cur_devices->open_devices == 0) {
1394 struct btrfs_fs_devices *fs_devices; 1410 struct btrfs_fs_devices *fs_devices;
@@ -1450,7 +1466,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1450 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1466 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1451 struct btrfs_fs_devices *old_devices; 1467 struct btrfs_fs_devices *old_devices;
1452 struct btrfs_fs_devices *seed_devices; 1468 struct btrfs_fs_devices *seed_devices;
1453 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1469 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1454 struct btrfs_device *device; 1470 struct btrfs_device *device;
1455 u64 super_flags; 1471 u64 super_flags;
1456 1472
@@ -1691,15 +1707,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1691 root->fs_info->fs_devices->num_can_discard++; 1707 root->fs_info->fs_devices->num_can_discard++;
1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1708 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 1709
1710 spin_lock(&root->fs_info->free_chunk_lock);
1711 root->fs_info->free_chunk_space += device->total_bytes;
1712 spin_unlock(&root->fs_info->free_chunk_lock);
1713
1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1714 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 root->fs_info->fs_devices->rotating = 1; 1715 root->fs_info->fs_devices->rotating = 1;
1696 1716
1697 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1717 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1698 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1718 btrfs_set_super_total_bytes(root->fs_info->super_copy,
1699 total_bytes + device->total_bytes); 1719 total_bytes + device->total_bytes);
1700 1720
1701 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1721 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1702 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1722 btrfs_set_super_num_devices(root->fs_info->super_copy,
1703 total_bytes + 1); 1723 total_bytes + 1);
1704 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1724 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1705 1725
@@ -1790,7 +1810,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1790 struct btrfs_device *device, u64 new_size) 1810 struct btrfs_device *device, u64 new_size)
1791{ 1811{
1792 struct btrfs_super_block *super_copy = 1812 struct btrfs_super_block *super_copy =
1793 &device->dev_root->fs_info->super_copy; 1813 device->dev_root->fs_info->super_copy;
1794 u64 old_total = btrfs_super_total_bytes(super_copy); 1814 u64 old_total = btrfs_super_total_bytes(super_copy);
1795 u64 diff = new_size - device->total_bytes; 1815 u64 diff = new_size - device->total_bytes;
1796 1816
@@ -1849,7 +1869,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1869static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1850 chunk_offset) 1870 chunk_offset)
1851{ 1871{
1852 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1872 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1853 struct btrfs_disk_key *disk_key; 1873 struct btrfs_disk_key *disk_key;
1854 struct btrfs_chunk *chunk; 1874 struct btrfs_chunk *chunk;
1855 u8 *ptr; 1875 u8 *ptr;
@@ -2175,7 +2195,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2175 bool retried = false; 2195 bool retried = false;
2176 struct extent_buffer *l; 2196 struct extent_buffer *l;
2177 struct btrfs_key key; 2197 struct btrfs_key key;
2178 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2198 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2179 u64 old_total = btrfs_super_total_bytes(super_copy); 2199 u64 old_total = btrfs_super_total_bytes(super_copy);
2180 u64 old_size = device->total_bytes; 2200 u64 old_size = device->total_bytes;
2181 u64 diff = device->total_bytes - new_size; 2201 u64 diff = device->total_bytes - new_size;
@@ -2192,8 +2212,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2192 lock_chunks(root); 2212 lock_chunks(root);
2193 2213
2194 device->total_bytes = new_size; 2214 device->total_bytes = new_size;
2195 if (device->writeable) 2215 if (device->writeable) {
2196 device->fs_devices->total_rw_bytes -= diff; 2216 device->fs_devices->total_rw_bytes -= diff;
2217 spin_lock(&root->fs_info->free_chunk_lock);
2218 root->fs_info->free_chunk_space -= diff;
2219 spin_unlock(&root->fs_info->free_chunk_lock);
2220 }
2197 unlock_chunks(root); 2221 unlock_chunks(root);
2198 2222
2199again: 2223again:
@@ -2257,6 +2281,9 @@ again:
2257 device->total_bytes = old_size; 2281 device->total_bytes = old_size;
2258 if (device->writeable) 2282 if (device->writeable)
2259 device->fs_devices->total_rw_bytes += diff; 2283 device->fs_devices->total_rw_bytes += diff;
2284 spin_lock(&root->fs_info->free_chunk_lock);
2285 root->fs_info->free_chunk_space += diff;
2286 spin_unlock(&root->fs_info->free_chunk_lock);
2260 unlock_chunks(root); 2287 unlock_chunks(root);
2261 goto done; 2288 goto done;
2262 } 2289 }
@@ -2292,7 +2319,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2292 struct btrfs_key *key, 2319 struct btrfs_key *key,
2293 struct btrfs_chunk *chunk, int item_size) 2320 struct btrfs_chunk *chunk, int item_size)
2294{ 2321{
2295 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2322 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2296 struct btrfs_disk_key disk_key; 2323 struct btrfs_disk_key disk_key;
2297 u32 array_size; 2324 u32 array_size;
2298 u8 *ptr; 2325 u8 *ptr;
@@ -2615,6 +2642,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2615 index++; 2642 index++;
2616 } 2643 }
2617 2644
2645 spin_lock(&extent_root->fs_info->free_chunk_lock);
2646 extent_root->fs_info->free_chunk_space -= (stripe_size *
2647 map->num_stripes);
2648 spin_unlock(&extent_root->fs_info->free_chunk_lock);
2649
2618 index = 0; 2650 index = 0;
2619 stripe = &chunk->stripe; 2651 stripe = &chunk->stripe;
2620 while (index < map->num_stripes) { 2652 while (index < map->num_stripes) {
@@ -2848,7 +2880,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2848 2880
2849static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2881static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2850 u64 logical, u64 *length, 2882 u64 logical, u64 *length,
2851 struct btrfs_multi_bio **multi_ret, 2883 struct btrfs_bio **bbio_ret,
2852 int mirror_num) 2884 int mirror_num)
2853{ 2885{
2854 struct extent_map *em; 2886 struct extent_map *em;
@@ -2866,18 +2898,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2866 int i; 2898 int i;
2867 int num_stripes; 2899 int num_stripes;
2868 int max_errors = 0; 2900 int max_errors = 0;
2869 struct btrfs_multi_bio *multi = NULL; 2901 struct btrfs_bio *bbio = NULL;
2870 2902
2871 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2903 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2872 stripes_allocated = 1; 2904 stripes_allocated = 1;
2873again: 2905again:
2874 if (multi_ret) { 2906 if (bbio_ret) {
2875 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2907 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2876 GFP_NOFS); 2908 GFP_NOFS);
2877 if (!multi) 2909 if (!bbio)
2878 return -ENOMEM; 2910 return -ENOMEM;
2879 2911
2880 atomic_set(&multi->error, 0); 2912 atomic_set(&bbio->error, 0);
2881 } 2913 }
2882 2914
2883 read_lock(&em_tree->lock); 2915 read_lock(&em_tree->lock);
@@ -2898,7 +2930,7 @@ again:
2898 if (mirror_num > map->num_stripes) 2930 if (mirror_num > map->num_stripes)
2899 mirror_num = 0; 2931 mirror_num = 0;
2900 2932
2901 /* if our multi bio struct is too small, back off and try again */ 2933 /* if our btrfs_bio struct is too small, back off and try again */
2902 if (rw & REQ_WRITE) { 2934 if (rw & REQ_WRITE) {
2903 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2935 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2904 BTRFS_BLOCK_GROUP_DUP)) { 2936 BTRFS_BLOCK_GROUP_DUP)) {
@@ -2917,11 +2949,11 @@ again:
2917 stripes_required = map->num_stripes; 2949 stripes_required = map->num_stripes;
2918 } 2950 }
2919 } 2951 }
2920 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2952 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2921 stripes_allocated < stripes_required) { 2953 stripes_allocated < stripes_required) {
2922 stripes_allocated = map->num_stripes; 2954 stripes_allocated = map->num_stripes;
2923 free_extent_map(em); 2955 free_extent_map(em);
2924 kfree(multi); 2956 kfree(bbio);
2925 goto again; 2957 goto again;
2926 } 2958 }
2927 stripe_nr = offset; 2959 stripe_nr = offset;
@@ -2950,7 +2982,7 @@ again:
2950 *length = em->len - offset; 2982 *length = em->len - offset;
2951 } 2983 }
2952 2984
2953 if (!multi_ret) 2985 if (!bbio_ret)
2954 goto out; 2986 goto out;
2955 2987
2956 num_stripes = 1; 2988 num_stripes = 1;
@@ -2975,13 +3007,17 @@ again:
2975 stripe_index = find_live_mirror(map, 0, 3007 stripe_index = find_live_mirror(map, 0,
2976 map->num_stripes, 3008 map->num_stripes,
2977 current->pid % map->num_stripes); 3009 current->pid % map->num_stripes);
3010 mirror_num = stripe_index + 1;
2978 } 3011 }
2979 3012
2980 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3013 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2981 if (rw & (REQ_WRITE | REQ_DISCARD)) 3014 if (rw & (REQ_WRITE | REQ_DISCARD)) {
2982 num_stripes = map->num_stripes; 3015 num_stripes = map->num_stripes;
2983 else if (mirror_num) 3016 } else if (mirror_num) {
2984 stripe_index = mirror_num - 1; 3017 stripe_index = mirror_num - 1;
3018 } else {
3019 mirror_num = 1;
3020 }
2985 3021
2986 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3022 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2987 int factor = map->num_stripes / map->sub_stripes; 3023 int factor = map->num_stripes / map->sub_stripes;
@@ -3001,6 +3037,7 @@ again:
3001 stripe_index = find_live_mirror(map, stripe_index, 3037 stripe_index = find_live_mirror(map, stripe_index,
3002 map->sub_stripes, stripe_index + 3038 map->sub_stripes, stripe_index +
3003 current->pid % map->sub_stripes); 3039 current->pid % map->sub_stripes);
3040 mirror_num = stripe_index + 1;
3004 } 3041 }
3005 } else { 3042 } else {
3006 /* 3043 /*
@@ -3009,15 +3046,16 @@ again:
3009 * stripe_index is the number of our device in the stripe array 3046 * stripe_index is the number of our device in the stripe array
3010 */ 3047 */
3011 stripe_index = do_div(stripe_nr, map->num_stripes); 3048 stripe_index = do_div(stripe_nr, map->num_stripes);
3049 mirror_num = stripe_index + 1;
3012 } 3050 }
3013 BUG_ON(stripe_index >= map->num_stripes); 3051 BUG_ON(stripe_index >= map->num_stripes);
3014 3052
3015 if (rw & REQ_DISCARD) { 3053 if (rw & REQ_DISCARD) {
3016 for (i = 0; i < num_stripes; i++) { 3054 for (i = 0; i < num_stripes; i++) {
3017 multi->stripes[i].physical = 3055 bbio->stripes[i].physical =
3018 map->stripes[stripe_index].physical + 3056 map->stripes[stripe_index].physical +
3019 stripe_offset + stripe_nr * map->stripe_len; 3057 stripe_offset + stripe_nr * map->stripe_len;
3020 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3058 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3021 3059
3022 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3060 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3023 u64 stripes; 3061 u64 stripes;
@@ -3038,16 +3076,16 @@ again:
3038 } 3076 }
3039 stripes = stripe_nr_end - 1 - j; 3077 stripes = stripe_nr_end - 1 - j;
3040 do_div(stripes, map->num_stripes); 3078 do_div(stripes, map->num_stripes);
3041 multi->stripes[i].length = map->stripe_len * 3079 bbio->stripes[i].length = map->stripe_len *
3042 (stripes - stripe_nr + 1); 3080 (stripes - stripe_nr + 1);
3043 3081
3044 if (i == 0) { 3082 if (i == 0) {
3045 multi->stripes[i].length -= 3083 bbio->stripes[i].length -=
3046 stripe_offset; 3084 stripe_offset;
3047 stripe_offset = 0; 3085 stripe_offset = 0;
3048 } 3086 }
3049 if (stripe_index == last_stripe) 3087 if (stripe_index == last_stripe)
3050 multi->stripes[i].length -= 3088 bbio->stripes[i].length -=
3051 stripe_end_offset; 3089 stripe_end_offset;
3052 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3090 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3053 u64 stripes; 3091 u64 stripes;
@@ -3072,11 +3110,11 @@ again:
3072 } 3110 }
3073 stripes = stripe_nr_end - 1 - j; 3111 stripes = stripe_nr_end - 1 - j;
3074 do_div(stripes, factor); 3112 do_div(stripes, factor);
3075 multi->stripes[i].length = map->stripe_len * 3113 bbio->stripes[i].length = map->stripe_len *
3076 (stripes - stripe_nr + 1); 3114 (stripes - stripe_nr + 1);
3077 3115
3078 if (i < map->sub_stripes) { 3116 if (i < map->sub_stripes) {
3079 multi->stripes[i].length -= 3117 bbio->stripes[i].length -=
3080 stripe_offset; 3118 stripe_offset;
3081 if (i == map->sub_stripes - 1) 3119 if (i == map->sub_stripes - 1)
3082 stripe_offset = 0; 3120 stripe_offset = 0;
@@ -3084,11 +3122,11 @@ again:
3084 if (stripe_index >= last_stripe && 3122 if (stripe_index >= last_stripe &&
3085 stripe_index <= (last_stripe + 3123 stripe_index <= (last_stripe +
3086 map->sub_stripes - 1)) { 3124 map->sub_stripes - 1)) {
3087 multi->stripes[i].length -= 3125 bbio->stripes[i].length -=
3088 stripe_end_offset; 3126 stripe_end_offset;
3089 } 3127 }
3090 } else 3128 } else
3091 multi->stripes[i].length = *length; 3129 bbio->stripes[i].length = *length;
3092 3130
3093 stripe_index++; 3131 stripe_index++;
3094 if (stripe_index == map->num_stripes) { 3132 if (stripe_index == map->num_stripes) {
@@ -3099,19 +3137,20 @@ again:
3099 } 3137 }
3100 } else { 3138 } else {
3101 for (i = 0; i < num_stripes; i++) { 3139 for (i = 0; i < num_stripes; i++) {
3102 multi->stripes[i].physical = 3140 bbio->stripes[i].physical =
3103 map->stripes[stripe_index].physical + 3141 map->stripes[stripe_index].physical +
3104 stripe_offset + 3142 stripe_offset +
3105 stripe_nr * map->stripe_len; 3143 stripe_nr * map->stripe_len;
3106 multi->stripes[i].dev = 3144 bbio->stripes[i].dev =
3107 map->stripes[stripe_index].dev; 3145 map->stripes[stripe_index].dev;
3108 stripe_index++; 3146 stripe_index++;
3109 } 3147 }
3110 } 3148 }
3111 if (multi_ret) { 3149 if (bbio_ret) {
3112 *multi_ret = multi; 3150 *bbio_ret = bbio;
3113 multi->num_stripes = num_stripes; 3151 bbio->num_stripes = num_stripes;
3114 multi->max_errors = max_errors; 3152 bbio->max_errors = max_errors;
3153 bbio->mirror_num = mirror_num;
3115 } 3154 }
3116out: 3155out:
3117 free_extent_map(em); 3156 free_extent_map(em);
@@ -3120,9 +3159,9 @@ out:
3120 3159
3121int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3160int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3122 u64 logical, u64 *length, 3161 u64 logical, u64 *length,
3123 struct btrfs_multi_bio **multi_ret, int mirror_num) 3162 struct btrfs_bio **bbio_ret, int mirror_num)
3124{ 3163{
3125 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3164 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3126 mirror_num); 3165 mirror_num);
3127} 3166}
3128 3167
@@ -3191,28 +3230,30 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3191 return 0; 3230 return 0;
3192} 3231}
3193 3232
3194static void end_bio_multi_stripe(struct bio *bio, int err) 3233static void btrfs_end_bio(struct bio *bio, int err)
3195{ 3234{
3196 struct btrfs_multi_bio *multi = bio->bi_private; 3235 struct btrfs_bio *bbio = bio->bi_private;
3197 int is_orig_bio = 0; 3236 int is_orig_bio = 0;
3198 3237
3199 if (err) 3238 if (err)
3200 atomic_inc(&multi->error); 3239 atomic_inc(&bbio->error);
3201 3240
3202 if (bio == multi->orig_bio) 3241 if (bio == bbio->orig_bio)
3203 is_orig_bio = 1; 3242 is_orig_bio = 1;
3204 3243
3205 if (atomic_dec_and_test(&multi->stripes_pending)) { 3244 if (atomic_dec_and_test(&bbio->stripes_pending)) {
3206 if (!is_orig_bio) { 3245 if (!is_orig_bio) {
3207 bio_put(bio); 3246 bio_put(bio);
3208 bio = multi->orig_bio; 3247 bio = bbio->orig_bio;
3209 } 3248 }
3210 bio->bi_private = multi->private; 3249 bio->bi_private = bbio->private;
3211 bio->bi_end_io = multi->end_io; 3250 bio->bi_end_io = bbio->end_io;
3251 bio->bi_bdev = (struct block_device *)
3252 (unsigned long)bbio->mirror_num;
3212 /* only send an error to the higher layers if it is 3253 /* only send an error to the higher layers if it is
3213 * beyond the tolerance of the multi-bio 3254 * beyond the tolerance of the multi-bio
3214 */ 3255 */
3215 if (atomic_read(&multi->error) > multi->max_errors) { 3256 if (atomic_read(&bbio->error) > bbio->max_errors) {
3216 err = -EIO; 3257 err = -EIO;
3217 } else if (err) { 3258 } else if (err) {
3218 /* 3259 /*
@@ -3222,7 +3263,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
3222 set_bit(BIO_UPTODATE, &bio->bi_flags); 3263 set_bit(BIO_UPTODATE, &bio->bi_flags);
3223 err = 0; 3264 err = 0;
3224 } 3265 }
3225 kfree(multi); 3266 kfree(bbio);
3226 3267
3227 bio_endio(bio, err); 3268 bio_endio(bio, err);
3228 } else if (!is_orig_bio) { 3269 } else if (!is_orig_bio) {
@@ -3302,20 +3343,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3302 u64 logical = (u64)bio->bi_sector << 9; 3343 u64 logical = (u64)bio->bi_sector << 9;
3303 u64 length = 0; 3344 u64 length = 0;
3304 u64 map_length; 3345 u64 map_length;
3305 struct btrfs_multi_bio *multi = NULL;
3306 int ret; 3346 int ret;
3307 int dev_nr = 0; 3347 int dev_nr = 0;
3308 int total_devs = 1; 3348 int total_devs = 1;
3349 struct btrfs_bio *bbio = NULL;
3309 3350
3310 length = bio->bi_size; 3351 length = bio->bi_size;
3311 map_tree = &root->fs_info->mapping_tree; 3352 map_tree = &root->fs_info->mapping_tree;
3312 map_length = length; 3353 map_length = length;
3313 3354
3314 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 3355 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
3315 mirror_num); 3356 mirror_num);
3316 BUG_ON(ret); 3357 BUG_ON(ret);
3317 3358
3318 total_devs = multi->num_stripes; 3359 total_devs = bbio->num_stripes;
3319 if (map_length < length) { 3360 if (map_length < length) {
3320 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3361 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3321 "len %llu\n", (unsigned long long)logical, 3362 "len %llu\n", (unsigned long long)logical,
@@ -3323,25 +3364,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3323 (unsigned long long)map_length); 3364 (unsigned long long)map_length);
3324 BUG(); 3365 BUG();
3325 } 3366 }
3326 multi->end_io = first_bio->bi_end_io; 3367
3327 multi->private = first_bio->bi_private; 3368 bbio->orig_bio = first_bio;
3328 multi->orig_bio = first_bio; 3369 bbio->private = first_bio->bi_private;
3329 atomic_set(&multi->stripes_pending, multi->num_stripes); 3370 bbio->end_io = first_bio->bi_end_io;
3371 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
3330 3372
3331 while (dev_nr < total_devs) { 3373 while (dev_nr < total_devs) {
3332 if (total_devs > 1) { 3374 if (dev_nr < total_devs - 1) {
3333 if (dev_nr < total_devs - 1) { 3375 bio = bio_clone(first_bio, GFP_NOFS);
3334 bio = bio_clone(first_bio, GFP_NOFS); 3376 BUG_ON(!bio);
3335 BUG_ON(!bio); 3377 } else {
3336 } else { 3378 bio = first_bio;
3337 bio = first_bio;
3338 }
3339 bio->bi_private = multi;
3340 bio->bi_end_io = end_bio_multi_stripe;
3341 } 3379 }
3342 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3380 bio->bi_private = bbio;
3343 dev = multi->stripes[dev_nr].dev; 3381 bio->bi_end_io = btrfs_end_bio;
3382 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
3383 dev = bbio->stripes[dev_nr].dev;
3344 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3384 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
 3385 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
3386 "(%s id %llu), size=%u\n", rw,
3387 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
3388 dev->name, dev->devid, bio->bi_size);
3345 bio->bi_bdev = dev->bdev; 3389 bio->bi_bdev = dev->bdev;
3346 if (async_submit) 3390 if (async_submit)
3347 schedule_bio(root, dev, rw, bio); 3391 schedule_bio(root, dev, rw, bio);
@@ -3354,8 +3398,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3354 } 3398 }
3355 dev_nr++; 3399 dev_nr++;
3356 } 3400 }
3357 if (total_devs == 1)
3358 kfree(multi);
3359 return 0; 3401 return 0;
3360} 3402}
3361 3403
@@ -3616,15 +3658,20 @@ static int read_one_dev(struct btrfs_root *root,
3616 fill_device_from_item(leaf, dev_item, device); 3658 fill_device_from_item(leaf, dev_item, device);
3617 device->dev_root = root->fs_info->dev_root; 3659 device->dev_root = root->fs_info->dev_root;
3618 device->in_fs_metadata = 1; 3660 device->in_fs_metadata = 1;
3619 if (device->writeable) 3661 if (device->writeable) {
3620 device->fs_devices->total_rw_bytes += device->total_bytes; 3662 device->fs_devices->total_rw_bytes += device->total_bytes;
3663 spin_lock(&root->fs_info->free_chunk_lock);
3664 root->fs_info->free_chunk_space += device->total_bytes -
3665 device->bytes_used;
3666 spin_unlock(&root->fs_info->free_chunk_lock);
3667 }
3621 ret = 0; 3668 ret = 0;
3622 return ret; 3669 return ret;
3623} 3670}
3624 3671
3625int btrfs_read_sys_array(struct btrfs_root *root) 3672int btrfs_read_sys_array(struct btrfs_root *root)
3626{ 3673{
3627 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3674 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3628 struct extent_buffer *sb; 3675 struct extent_buffer *sb;
3629 struct btrfs_disk_key *disk_key; 3676 struct btrfs_disk_key *disk_key;
3630 struct btrfs_chunk *chunk; 3677 struct btrfs_chunk *chunk;
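
For orientation (not part of the patch): a minimal sketch of what a read-side caller of the renamed mapping API looks like after this change. The wrapper name is invented; btrfs_map_block() and struct btrfs_bio are the ones defined in this series, and error handling is trimmed.

#include <linux/kernel.h>
#include <linux/slab.h>
#include "volumes.h"	/* struct btrfs_bio, btrfs_map_block() -- sketch only */

/* Hypothetical caller: map one logical range and report the chosen mirror. */
static int sketch_lookup_mirror(struct btrfs_mapping_tree *map_tree,
				u64 logical, u64 len)
{
	struct btrfs_bio *bbio = NULL;	/* formerly struct btrfs_multi_bio */
	u64 length = len;
	int ret;

	ret = btrfs_map_block(map_tree, READ, logical, &length, &bbio, 0);
	if (ret)
		return ret;

	/* mirror_num is new in this patch: which copy the mapping picked. */
	pr_debug("logical %llu: %d stripe(s), mirror %d\n",
		 (unsigned long long)logical, bbio->num_stripes,
		 bbio->mirror_num);

	kfree(bbio);	/* read-side callers free the btrfs_bio themselves */
	return 0;
}
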
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6d866db4e177..ab5b1c49f352 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,14 @@ struct btrfs_device {
92 struct btrfs_work work; 92 struct btrfs_work work;
93 struct rcu_head rcu; 93 struct rcu_head rcu;
94 struct work_struct rcu_work; 94 struct work_struct rcu_work;
95
96 /* readahead state */
97 spinlock_t reada_lock;
98 atomic_t reada_in_flight;
99 u64 reada_next;
100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents;
95}; 103};
96 104
97struct btrfs_fs_devices { 105struct btrfs_fs_devices {
@@ -136,7 +144,10 @@ struct btrfs_bio_stripe {
136 u64 length; /* only used for discard mappings */ 144 u64 length; /* only used for discard mappings */
137}; 145};
138 146
139struct btrfs_multi_bio { 147struct btrfs_bio;
148typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
149
150struct btrfs_bio {
140 atomic_t stripes_pending; 151 atomic_t stripes_pending;
141 bio_end_io_t *end_io; 152 bio_end_io_t *end_io;
142 struct bio *orig_bio; 153 struct bio *orig_bio;
@@ -144,6 +155,7 @@ struct btrfs_multi_bio {
144 atomic_t error; 155 atomic_t error;
145 int max_errors; 156 int max_errors;
146 int num_stripes; 157 int num_stripes;
158 int mirror_num;
147 struct btrfs_bio_stripe stripes[]; 159 struct btrfs_bio_stripe stripes[];
148}; 160};
149 161
@@ -171,7 +183,7 @@ struct map_lookup {
171int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 183int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
172 u64 end, u64 *length); 184 u64 end, u64 *length);
173 185
174#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 186#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
175 (sizeof(struct btrfs_bio_stripe) * (n))) 187 (sizeof(struct btrfs_bio_stripe) * (n)))
176 188
177int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 189int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -180,7 +192,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
180 u64 chunk_offset, u64 start, u64 num_bytes); 192 u64 chunk_offset, u64 start, u64 num_bytes);
181int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 193int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
182 u64 logical, u64 *length, 194 u64 logical, u64 *length,
183 struct btrfs_multi_bio **multi_ret, int mirror_num); 195 struct btrfs_bio **bbio_ret, int mirror_num);
184int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 196int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
185 u64 chunk_start, u64 physical, u64 devid, 197 u64 chunk_start, u64 physical, u64 devid,
186 u64 **logical, int *naddrs, int *stripe_len); 198 u64 **logical, int *naddrs, int *stripe_len);
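
Related sketch, under the same caveats: btrfs_end_bio() above stashes the mirror number in bi_bdev just before completing the original bio, so an end_io handler further up the stack can recover it as below (the extent_io read path in this series does the equivalent).

static void sketch_read_end_io(struct bio *bio, int err)
{
	/*
	 * btrfs_end_bio() overwrote bio->bi_bdev with bbio->mirror_num
	 * (cast through unsigned long) before calling bio_endio(), so the
	 * value read back here is a mirror index, not a block device.
	 */
	int mirror = (int)(unsigned long)bio->bi_bdev;

	pr_debug("read done, err %d, served by mirror %d\n", err, mirror);
}
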
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 426aa464f1af..3848b04e310e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
127again: 127again:
128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), 128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
129 name, name_len, value, size); 129 name, name_len, value, size);
130 /*
131 * If we're setting an xattr to a new value but the new value is say
132 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
133 * back from split_leaf. This is because it thinks we'll be extending
134 * the existing item size, but we're asking for enough space to add the
135 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
136 * the rest of the function figure it out.
137 */
138 if (ret == -EOVERFLOW)
139 ret = -EEXIST;
140
130 if (ret == -EEXIST) { 141 if (ret == -EEXIST) {
131 if (flags & XATTR_CREATE) 142 if (flags & XATTR_CREATE)
132 goto out; 143 goto out;
diff --git a/fs/buffer.c b/fs/buffer.c
index 70a19745cb61..19d8eb7fdc81 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -288,7 +288,7 @@ static void free_more_memory(void)
288 struct zone *zone; 288 struct zone *zone;
289 int nid; 289 int nid;
290 290
291 wakeup_flusher_threads(1024); 291 wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
292 yield(); 292 yield();
293 293
294 for_each_online_node(nid) { 294 for_each_online_node(nid) {
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 15b21e35078a..0f327c6c9679 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -487,17 +487,15 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
487 ci->i_rdcache_gen++; 487 ci->i_rdcache_gen++;
488 488
489 /* 489 /*
490 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we 490 * if we are newly issued FILE_SHARED, clear D_COMPLETE; we
491 * don't know what happened to this directory while we didn't 491 * don't know what happened to this directory while we didn't
492 * have the cap. 492 * have the cap.
493 */ 493 */
494 if ((issued & CEPH_CAP_FILE_SHARED) && 494 if ((issued & CEPH_CAP_FILE_SHARED) &&
495 (had & CEPH_CAP_FILE_SHARED) == 0) { 495 (had & CEPH_CAP_FILE_SHARED) == 0) {
496 ci->i_shared_gen++; 496 ci->i_shared_gen++;
497 if (S_ISDIR(ci->vfs_inode.i_mode)) { 497 if (S_ISDIR(ci->vfs_inode.i_mode))
498 dout(" marking %p NOT complete\n", &ci->vfs_inode); 498 ceph_dir_clear_complete(&ci->vfs_inode);
499 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
500 }
501 } 499 }
502} 500}
503 501
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 382abc9a6a54..2abd0dfad7f8 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -108,7 +108,7 @@ static unsigned fpos_off(loff_t p)
108 * falling back to a "normal" sync readdir if any dentries in the dir 108 * falling back to a "normal" sync readdir if any dentries in the dir
109 * are dropped. 109 * are dropped.
110 * 110 *
111 * I_COMPLETE tells indicates we have all dentries in the dir. It is 111 * D_COMPLETE tells indicates we have all dentries in the dir. It is
112 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 112 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
113 * the MDS if/when the directory is modified). 113 * the MDS if/when the directory is modified).
114 */ 114 */
@@ -199,8 +199,8 @@ more:
199 filp->f_pos++; 199 filp->f_pos++;
200 200
201 /* make sure a dentry wasn't dropped while we didn't have parent lock */ 201 /* make sure a dentry wasn't dropped while we didn't have parent lock */
202 if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { 202 if (!ceph_dir_test_complete(dir)) {
203 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); 203 dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
204 err = -EAGAIN; 204 err = -EAGAIN;
205 goto out; 205 goto out;
206 } 206 }
@@ -285,7 +285,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
285 if ((filp->f_pos == 2 || fi->dentry) && 285 if ((filp->f_pos == 2 || fi->dentry) &&
286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
287 ceph_snap(inode) != CEPH_SNAPDIR && 287 ceph_snap(inode) != CEPH_SNAPDIR &&
288 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 288 ceph_dir_test_complete(inode) &&
289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
290 spin_unlock(&inode->i_lock); 290 spin_unlock(&inode->i_lock);
291 err = __dcache_readdir(filp, dirent, filldir); 291 err = __dcache_readdir(filp, dirent, filldir);
@@ -351,7 +351,7 @@ more:
351 351
352 if (!req->r_did_prepopulate) { 352 if (!req->r_did_prepopulate) {
353 dout("readdir !did_prepopulate"); 353 dout("readdir !did_prepopulate");
354 fi->dir_release_count--; /* preclude I_COMPLETE */ 354 fi->dir_release_count--; /* preclude D_COMPLETE */
355 } 355 }
356 356
357 /* note next offset and last dentry name */ 357 /* note next offset and last dentry name */
@@ -430,8 +430,7 @@ more:
430 */ 430 */
431 spin_lock(&inode->i_lock); 431 spin_lock(&inode->i_lock);
432 if (ci->i_release_count == fi->dir_release_count) { 432 if (ci->i_release_count == fi->dir_release_count) {
433 dout(" marking %p complete\n", inode); 433 ceph_dir_set_complete(inode);
434 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
435 ci->i_max_offset = filp->f_pos; 434 ci->i_max_offset = filp->f_pos;
436 } 435 }
437 spin_unlock(&inode->i_lock); 436 spin_unlock(&inode->i_lock);
@@ -614,7 +613,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
614 fsc->mount_options->snapdir_name, 613 fsc->mount_options->snapdir_name,
615 dentry->d_name.len) && 614 dentry->d_name.len) &&
616 !is_root_ceph_dentry(dir, dentry) && 615 !is_root_ceph_dentry(dir, dentry) &&
617 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 616 ceph_dir_test_complete(dir) &&
618 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 617 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
619 spin_unlock(&dir->i_lock); 618 spin_unlock(&dir->i_lock);
620 dout(" dir %p complete, -ENOENT\n", dir); 619 dout(" dir %p complete, -ENOENT\n", dir);
@@ -934,7 +933,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
934 */ 933 */
935 934
936 /* d_move screws up d_subdirs order */ 935 /* d_move screws up d_subdirs order */
937 ceph_i_clear(new_dir, CEPH_I_COMPLETE); 936 ceph_dir_clear_complete(new_dir);
938 937
939 d_move(old_dentry, new_dentry); 938 d_move(old_dentry, new_dentry);
940 939
@@ -1092,7 +1091,75 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1092 return 1; 1091 return 1;
1093} 1092}
1094 1093
1094/*
1095 * Set/clear/test dir complete flag on the dir's dentry.
1096 */
1097static struct dentry * __d_find_any_alias(struct inode *inode)
1098{
1099 struct dentry *alias;
1100
1101 if (list_empty(&inode->i_dentry))
1102 return NULL;
1103 alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
1104 return alias;
1105}
1106
1107void ceph_dir_set_complete(struct inode *inode)
1108{
1109 struct dentry *dentry = __d_find_any_alias(inode);
1110
1111 if (dentry && ceph_dentry(dentry)) {
1112 dout(" marking %p (%p) complete\n", inode, dentry);
1113 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1114 }
1115}
1116
1117void ceph_dir_clear_complete(struct inode *inode)
1118{
1119 struct dentry *dentry = __d_find_any_alias(inode);
1120
1121 if (dentry && ceph_dentry(dentry)) {
1122 dout(" marking %p (%p) NOT complete\n", inode, dentry);
1123 clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1124 }
1125}
1126
1127bool ceph_dir_test_complete(struct inode *inode)
1128{
1129 struct dentry *dentry = __d_find_any_alias(inode);
1130
1131 if (dentry && ceph_dentry(dentry))
1132 return test_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1133 return false;
1134}
1135
1136/*
1137 * When the VFS prunes a dentry from the cache, we need to clear the
1138 * complete flag on the parent directory.
1139 *
1140 * Called under dentry->d_lock.
1141 */
1142static void ceph_d_prune(struct dentry *dentry)
1143{
1144 struct ceph_dentry_info *di;
1145
 1146 dout("d_prune %p\n", dentry);
1147
1148 /* do we have a valid parent? */
1149 if (!dentry->d_parent || IS_ROOT(dentry))
1150 return;
1095 1151
1152 /* if we are not hashed, we don't affect D_COMPLETE */
1153 if (d_unhashed(dentry))
1154 return;
1155
1156 /*
1157 * we hold d_lock, so d_parent is stable, and d_fsdata is never
1158 * cleared until d_release
1159 */
1160 di = ceph_dentry(dentry->d_parent);
1161 clear_bit(CEPH_D_COMPLETE, &di->flags);
1162}
1096 1163
1097/* 1164/*
1098 * read() on a dir. This weird interface hack only works if mounted 1165 * read() on a dir. This weird interface hack only works if mounted
@@ -1306,6 +1373,7 @@ const struct inode_operations ceph_dir_iops = {
1306const struct dentry_operations ceph_dentry_ops = { 1373const struct dentry_operations ceph_dentry_ops = {
1307 .d_revalidate = ceph_d_revalidate, 1374 .d_revalidate = ceph_d_revalidate,
1308 .d_release = ceph_d_release, 1375 .d_release = ceph_d_release,
1376 .d_prune = ceph_d_prune,
1309}; 1377};
1310 1378
1311const struct dentry_operations ceph_snapdir_dentry_ops = { 1379const struct dentry_operations ceph_snapdir_dentry_ops = {
@@ -1315,4 +1383,5 @@ const struct dentry_operations ceph_snapdir_dentry_ops = {
1315 1383
1316const struct dentry_operations ceph_snap_dentry_ops = { 1384const struct dentry_operations ceph_snap_dentry_ops = {
1317 .d_release = ceph_d_release, 1385 .d_release = ceph_d_release,
1386 .d_prune = ceph_d_prune,
1318}; 1387};
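
A rough sketch of how the new helpers are meant to be used together, mirroring the readdir fast-path check earlier in this file; the wrapper name is invented and, as elsewhere in this series, the dir's i_lock still guards the cap check.

/* Hypothetical helper: is the dcache-only readdir fast path usable? */
static bool sketch_can_use_dcache_readdir(struct inode *dir)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	bool ok;

	spin_lock(&dir->i_lock);
	/* D_COMPLETE is only trustworthy while we hold FILE_SHARED caps. */
	ok = ceph_dir_test_complete(dir) &&
	     __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
	spin_unlock(&dir->i_lock);

	return ok;
}
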
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1616a0d37cbd..e392bfce84a3 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -771,9 +771,9 @@ no_change:
771 ceph_snap(inode) == CEPH_NOSNAP && 771 ceph_snap(inode) == CEPH_NOSNAP &&
772 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && 772 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
773 (issued & CEPH_CAP_FILE_EXCL) == 0 && 773 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
774 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 774 !ceph_dir_test_complete(inode)) {
775 dout(" marking %p complete (empty)\n", inode); 775 dout(" marking %p complete (empty)\n", inode);
776 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ 776 ceph_dir_set_complete(inode);
777 ci->i_max_offset = 2; 777 ci->i_max_offset = 2;
778 } 778 }
779 779
@@ -856,7 +856,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
856 di = ceph_dentry(dn); 856 di = ceph_dentry(dn);
857 857
858 spin_lock(&inode->i_lock); 858 spin_lock(&inode->i_lock);
859 if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 859 if (!ceph_dir_test_complete(inode)) {
860 spin_unlock(&inode->i_lock); 860 spin_unlock(&inode->i_lock);
861 return; 861 return;
862 } 862 }
@@ -1056,7 +1056,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1056 * d_move() puts the renamed dentry at the end of 1056 * d_move() puts the renamed dentry at the end of
1057 * d_subdirs. We need to assign it an appropriate 1057 * d_subdirs. We need to assign it an appropriate
1058 * directory offset so we can behave when holding 1058 * directory offset so we can behave when holding
1059 * I_COMPLETE. 1059 * D_COMPLETE.
1060 */ 1060 */
1061 ceph_set_dentry_offset(req->r_old_dentry); 1061 ceph_set_dentry_offset(req->r_old_dentry);
1062 dout("dn %p gets new offset %lld\n", req->r_old_dentry, 1062 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1d72f15fe9f4..264ab701154f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -619,7 +619,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
619 * 619 *
620 * Called under mdsc->mutex. 620 * Called under mdsc->mutex.
621 */ 621 */
622struct dentry *get_nonsnap_parent(struct dentry *dentry) 622static struct dentry *get_nonsnap_parent(struct dentry *dentry)
623{ 623{
624 /* 624 /*
625 * we don't need to worry about protecting the d_parent access 625 * we don't need to worry about protecting the d_parent access
@@ -2002,7 +2002,7 @@ out:
2002} 2002}
2003 2003
2004/* 2004/*
2005 * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS 2005 * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS
2006 * namespace request. 2006 * namespace request.
2007 */ 2007 */
2008void ceph_invalidate_dir_request(struct ceph_mds_request *req) 2008void ceph_invalidate_dir_request(struct ceph_mds_request *req)
@@ -2010,9 +2010,9 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
2010 struct inode *inode = req->r_locked_dir; 2010 struct inode *inode = req->r_locked_dir;
2011 struct ceph_inode_info *ci = ceph_inode(inode); 2011 struct ceph_inode_info *ci = ceph_inode(inode);
2012 2012
2013 dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); 2013 dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode);
2014 spin_lock(&inode->i_lock); 2014 spin_lock(&inode->i_lock);
2015 ci->i_ceph_flags &= ~CEPH_I_COMPLETE; 2015 ceph_dir_clear_complete(inode);
2016 ci->i_release_count++; 2016 ci->i_release_count++;
2017 spin_unlock(&inode->i_lock); 2017 spin_unlock(&inode->i_lock);
2018 2018
@@ -3154,7 +3154,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3154/* 3154/*
3155 * true if all sessions are closed, or we force unmount 3155 * true if all sessions are closed, or we force unmount
3156 */ 3156 */
3157bool done_closing_sessions(struct ceph_mds_client *mdsc) 3157static bool done_closing_sessions(struct ceph_mds_client *mdsc)
3158{ 3158{
3159 int i, n = 0; 3159 int i, n = 0;
3160 3160
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 788f5ad8e66d..a90846fac759 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -426,7 +426,7 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
426/* 426/*
427 * create a new fs client 427 * create a new fs client
428 */ 428 */
429struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, 429static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
430 struct ceph_options *opt) 430 struct ceph_options *opt)
431{ 431{
432 struct ceph_fs_client *fsc; 432 struct ceph_fs_client *fsc;
@@ -502,7 +502,7 @@ fail:
502 return ERR_PTR(err); 502 return ERR_PTR(err);
503} 503}
504 504
505void destroy_fs_client(struct ceph_fs_client *fsc) 505static void destroy_fs_client(struct ceph_fs_client *fsc)
506{ 506{
507 dout("destroy_fs_client %p\n", fsc); 507 dout("destroy_fs_client %p\n", fsc);
508 508
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b01442aaf278..01bf189e08a9 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -203,6 +203,7 @@ struct ceph_inode_xattr {
203 * Ceph dentry state 203 * Ceph dentry state
204 */ 204 */
205struct ceph_dentry_info { 205struct ceph_dentry_info {
206 unsigned long flags;
206 struct ceph_mds_session *lease_session; 207 struct ceph_mds_session *lease_session;
207 u32 lease_gen, lease_shared_gen; 208 u32 lease_gen, lease_shared_gen;
208 u32 lease_seq; 209 u32 lease_seq;
@@ -213,6 +214,18 @@ struct ceph_dentry_info {
213 u64 offset; 214 u64 offset;
214}; 215};
215 216
217/*
218 * dentry flags
219 *
220 * The locking for D_COMPLETE is a bit odd:
221 * - we can clear it at almost any time (see ceph_d_prune)
222 * - it is only meaningful if:
223 * - we hold dir inode i_lock
224 * - we hold dir FILE_SHARED caps
225 * - the dentry D_COMPLETE is set
226 */
227#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */
228
216struct ceph_inode_xattrs_info { 229struct ceph_inode_xattrs_info {
217 /* 230 /*
218 * (still encoded) xattr blob. we avoid the overhead of parsing 231 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -251,7 +264,7 @@ struct ceph_inode_info {
251 struct timespec i_rctime; 264 struct timespec i_rctime;
252 u64 i_rbytes, i_rfiles, i_rsubdirs; 265 u64 i_rbytes, i_rfiles, i_rsubdirs;
253 u64 i_files, i_subdirs; 266 u64 i_files, i_subdirs;
254 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */ 267 u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */
255 268
256 struct rb_root i_fragtree; 269 struct rb_root i_fragtree;
257 struct mutex i_fragtree_mutex; 270 struct mutex i_fragtree_mutex;
@@ -416,7 +429,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
416/* 429/*
417 * Ceph inode. 430 * Ceph inode.
418 */ 431 */
419#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
420#define CEPH_I_NODELAY 4 /* do not delay cap release */ 432#define CEPH_I_NODELAY 4 /* do not delay cap release */
421#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ 433#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
422#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ 434#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
@@ -474,6 +486,13 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
474} 486}
475 487
476/* 488/*
489 * set/clear directory D_COMPLETE flag
490 */
491void ceph_dir_set_complete(struct inode *inode);
492void ceph_dir_clear_complete(struct inode *inode);
493bool ceph_dir_test_complete(struct inode *inode);
494
495/*
477 * caps helpers 496 * caps helpers
478 */ 497 */
479static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) 498static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c0458c543f17..d6a972df0338 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -37,6 +37,7 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38#include <asm/processor.h> 38#include <asm/processor.h>
39#include <linux/inet.h> 39#include <linux/inet.h>
40#include <linux/module.h>
40#include <net/ipv6.h> 41#include <net/ipv6.h>
41#include "cifspdu.h" 42#include "cifspdu.h"
42#include "cifsglob.h" 43#include "cifsglob.h"
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index fa9a286c8771..da42f32c49be 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -5,7 +5,7 @@
5# selected by any of the users. 5# selected by any of the users.
6config ORE 6config ORE
7 tristate 7 tristate
8 depends on EXOFS_FS 8 depends on EXOFS_FS || PNFS_OBJLAYOUT
9 select ASYNC_XOR 9 select ASYNC_XOR
10 default SCSI_OSD_ULD 10 default SCSI_OSD_ULD
11 11
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index fcfa86ae6faf..d271ad837202 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/module.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27#include <linux/lcm.h> 28#include <linux/lcm.h>
28 29
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 057b237b8b69..e6085ec192d6 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -35,6 +35,7 @@
35#include <linux/parser.h> 35#include <linux/parser.h>
36#include <linux/vfs.h> 36#include <linux/vfs.h>
37#include <linux/random.h> 37#include <linux/random.h>
38#include <linux/module.h>
38#include <linux/exportfs.h> 39#include <linux/exportfs.h>
39#include <linux/slab.h> 40#include <linux/slab.h>
40 41
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cc5a6da030a1..240f6e2dc7ee 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2372,7 +2372,7 @@ static int ext4_nonda_switch(struct super_block *sb)
2372 * start pushing delalloc when 1/2 of free blocks are dirty. 2372 * start pushing delalloc when 1/2 of free blocks are dirty.
2373 */ 2373 */
2374 if (free_blocks < 2 * dirty_blocks) 2374 if (free_blocks < 2 * dirty_blocks)
2375 writeback_inodes_sb_if_idle(sb); 2375 writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
2376 2376
2377 return 0; 2377 return 0;
2378} 2378}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 04cf3b91e501..73c3992b2bb4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -41,11 +41,23 @@ struct wb_writeback_work {
41 unsigned int for_kupdate:1; 41 unsigned int for_kupdate:1;
42 unsigned int range_cyclic:1; 42 unsigned int range_cyclic:1;
43 unsigned int for_background:1; 43 unsigned int for_background:1;
44 enum wb_reason reason; /* why was writeback initiated? */
44 45
45 struct list_head list; /* pending work list */ 46 struct list_head list; /* pending work list */
46 struct completion *done; /* set if the caller waits */ 47 struct completion *done; /* set if the caller waits */
47}; 48};
48 49
50const char *wb_reason_name[] = {
51 [WB_REASON_BACKGROUND] = "background",
52 [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages",
53 [WB_REASON_SYNC] = "sync",
54 [WB_REASON_PERIODIC] = "periodic",
55 [WB_REASON_LAPTOP_TIMER] = "laptop_timer",
56 [WB_REASON_FREE_MORE_MEM] = "free_more_memory",
57 [WB_REASON_FS_FREE_SPACE] = "fs_free_space",
58 [WB_REASON_FORKER_THREAD] = "forker_thread"
59};
60
49/* 61/*
50 * Include the creation of the trace points after defining the 62 * Include the creation of the trace points after defining the
51 * wb_writeback_work structure so that the definition remains local to this 63 * wb_writeback_work structure so that the definition remains local to this
@@ -115,7 +127,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
115 127
116static void 128static void
117__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 129__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
118 bool range_cyclic) 130 bool range_cyclic, enum wb_reason reason)
119{ 131{
120 struct wb_writeback_work *work; 132 struct wb_writeback_work *work;
121 133
@@ -135,6 +147,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
135 work->sync_mode = WB_SYNC_NONE; 147 work->sync_mode = WB_SYNC_NONE;
136 work->nr_pages = nr_pages; 148 work->nr_pages = nr_pages;
137 work->range_cyclic = range_cyclic; 149 work->range_cyclic = range_cyclic;
150 work->reason = reason;
138 151
139 bdi_queue_work(bdi, work); 152 bdi_queue_work(bdi, work);
140} 153}
@@ -150,9 +163,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
150 * completion. Caller need not hold sb s_umount semaphore. 163 * completion. Caller need not hold sb s_umount semaphore.
151 * 164 *
152 */ 165 */
153void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 166void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
167 enum wb_reason reason)
154{ 168{
155 __bdi_start_writeback(bdi, nr_pages, true); 169 __bdi_start_writeback(bdi, nr_pages, true, reason);
156} 170}
157 171
158/** 172/**
@@ -251,7 +265,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
251 */ 265 */
252static int move_expired_inodes(struct list_head *delaying_queue, 266static int move_expired_inodes(struct list_head *delaying_queue,
253 struct list_head *dispatch_queue, 267 struct list_head *dispatch_queue,
254 unsigned long *older_than_this) 268 struct wb_writeback_work *work)
255{ 269{
256 LIST_HEAD(tmp); 270 LIST_HEAD(tmp);
257 struct list_head *pos, *node; 271 struct list_head *pos, *node;
@@ -262,8 +276,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
262 276
263 while (!list_empty(delaying_queue)) { 277 while (!list_empty(delaying_queue)) {
264 inode = wb_inode(delaying_queue->prev); 278 inode = wb_inode(delaying_queue->prev);
265 if (older_than_this && 279 if (work->older_than_this &&
266 inode_dirtied_after(inode, *older_than_this)) 280 inode_dirtied_after(inode, *work->older_than_this))
267 break; 281 break;
268 if (sb && sb != inode->i_sb) 282 if (sb && sb != inode->i_sb)
269 do_sb_sort = 1; 283 do_sb_sort = 1;
@@ -302,13 +316,13 @@ out:
302 * | 316 * |
303 * +--> dequeue for IO 317 * +--> dequeue for IO
304 */ 318 */
305static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 319static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
306{ 320{
307 int moved; 321 int moved;
308 assert_spin_locked(&wb->list_lock); 322 assert_spin_locked(&wb->list_lock);
309 list_splice_init(&wb->b_more_io, &wb->b_io); 323 list_splice_init(&wb->b_more_io, &wb->b_io);
310 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 324 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
311 trace_writeback_queue_io(wb, older_than_this, moved); 325 trace_writeback_queue_io(wb, work, moved);
312} 326}
313 327
314static int write_inode(struct inode *inode, struct writeback_control *wbc) 328static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -641,31 +655,40 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
641 return wrote; 655 return wrote;
642} 656}
643 657
644long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) 658long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
659 enum wb_reason reason)
645{ 660{
646 struct wb_writeback_work work = { 661 struct wb_writeback_work work = {
647 .nr_pages = nr_pages, 662 .nr_pages = nr_pages,
648 .sync_mode = WB_SYNC_NONE, 663 .sync_mode = WB_SYNC_NONE,
649 .range_cyclic = 1, 664 .range_cyclic = 1,
665 .reason = reason,
650 }; 666 };
651 667
652 spin_lock(&wb->list_lock); 668 spin_lock(&wb->list_lock);
653 if (list_empty(&wb->b_io)) 669 if (list_empty(&wb->b_io))
654 queue_io(wb, NULL); 670 queue_io(wb, &work);
655 __writeback_inodes_wb(wb, &work); 671 __writeback_inodes_wb(wb, &work);
656 spin_unlock(&wb->list_lock); 672 spin_unlock(&wb->list_lock);
657 673
658 return nr_pages - work.nr_pages; 674 return nr_pages - work.nr_pages;
659} 675}
660 676
661static inline bool over_bground_thresh(void) 677static bool over_bground_thresh(struct backing_dev_info *bdi)
662{ 678{
663 unsigned long background_thresh, dirty_thresh; 679 unsigned long background_thresh, dirty_thresh;
664 680
665 global_dirty_limits(&background_thresh, &dirty_thresh); 681 global_dirty_limits(&background_thresh, &dirty_thresh);
666 682
667 return (global_page_state(NR_FILE_DIRTY) + 683 if (global_page_state(NR_FILE_DIRTY) +
668 global_page_state(NR_UNSTABLE_NFS) > background_thresh); 684 global_page_state(NR_UNSTABLE_NFS) > background_thresh)
685 return true;
686
687 if (bdi_stat(bdi, BDI_RECLAIMABLE) >
688 bdi_dirty_limit(bdi, background_thresh))
689 return true;
690
691 return false;
669} 692}
670 693
671/* 694/*
@@ -675,7 +698,7 @@ static inline bool over_bground_thresh(void)
675static void wb_update_bandwidth(struct bdi_writeback *wb, 698static void wb_update_bandwidth(struct bdi_writeback *wb,
676 unsigned long start_time) 699 unsigned long start_time)
677{ 700{
678 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); 701 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
679} 702}
680 703
681/* 704/*
@@ -727,7 +750,7 @@ static long wb_writeback(struct bdi_writeback *wb,
727 * For background writeout, stop when we are below the 750 * For background writeout, stop when we are below the
728 * background dirty threshold 751 * background dirty threshold
729 */ 752 */
730 if (work->for_background && !over_bground_thresh()) 753 if (work->for_background && !over_bground_thresh(wb->bdi))
731 break; 754 break;
732 755
733 if (work->for_kupdate) { 756 if (work->for_kupdate) {
@@ -738,7 +761,7 @@ static long wb_writeback(struct bdi_writeback *wb,
738 761
739 trace_writeback_start(wb->bdi, work); 762 trace_writeback_start(wb->bdi, work);
740 if (list_empty(&wb->b_io)) 763 if (list_empty(&wb->b_io))
741 queue_io(wb, work->older_than_this); 764 queue_io(wb, work);
742 if (work->sb) 765 if (work->sb)
743 progress = writeback_sb_inodes(work->sb, wb, work); 766 progress = writeback_sb_inodes(work->sb, wb, work);
744 else 767 else
@@ -811,13 +834,14 @@ static unsigned long get_nr_dirty_pages(void)
811 834
812static long wb_check_background_flush(struct bdi_writeback *wb) 835static long wb_check_background_flush(struct bdi_writeback *wb)
813{ 836{
814 if (over_bground_thresh()) { 837 if (over_bground_thresh(wb->bdi)) {
815 838
816 struct wb_writeback_work work = { 839 struct wb_writeback_work work = {
817 .nr_pages = LONG_MAX, 840 .nr_pages = LONG_MAX,
818 .sync_mode = WB_SYNC_NONE, 841 .sync_mode = WB_SYNC_NONE,
819 .for_background = 1, 842 .for_background = 1,
820 .range_cyclic = 1, 843 .range_cyclic = 1,
844 .reason = WB_REASON_BACKGROUND,
821 }; 845 };
822 846
823 return wb_writeback(wb, &work); 847 return wb_writeback(wb, &work);
@@ -851,6 +875,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
851 .sync_mode = WB_SYNC_NONE, 875 .sync_mode = WB_SYNC_NONE,
852 .for_kupdate = 1, 876 .for_kupdate = 1,
853 .range_cyclic = 1, 877 .range_cyclic = 1,
878 .reason = WB_REASON_PERIODIC,
854 }; 879 };
855 880
856 return wb_writeback(wb, &work); 881 return wb_writeback(wb, &work);
@@ -969,7 +994,7 @@ int bdi_writeback_thread(void *data)
969 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back 994 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
970 * the whole world. 995 * the whole world.
971 */ 996 */
972void wakeup_flusher_threads(long nr_pages) 997void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
973{ 998{
974 struct backing_dev_info *bdi; 999 struct backing_dev_info *bdi;
975 1000
@@ -982,7 +1007,7 @@ void wakeup_flusher_threads(long nr_pages)
982 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 1007 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
983 if (!bdi_has_dirty_io(bdi)) 1008 if (!bdi_has_dirty_io(bdi))
984 continue; 1009 continue;
985 __bdi_start_writeback(bdi, nr_pages, false); 1010 __bdi_start_writeback(bdi, nr_pages, false, reason);
986 } 1011 }
987 rcu_read_unlock(); 1012 rcu_read_unlock();
988} 1013}
@@ -1203,7 +1228,9 @@ static void wait_sb_inodes(struct super_block *sb)
1203 * on how many (if any) will be written, and this function does not wait 1228 * on how many (if any) will be written, and this function does not wait
1204 * for IO completion of submitted IO. 1229 * for IO completion of submitted IO.
1205 */ 1230 */
1206void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) 1231void writeback_inodes_sb_nr(struct super_block *sb,
1232 unsigned long nr,
1233 enum wb_reason reason)
1207{ 1234{
1208 DECLARE_COMPLETION_ONSTACK(done); 1235 DECLARE_COMPLETION_ONSTACK(done);
1209 struct wb_writeback_work work = { 1236 struct wb_writeback_work work = {
@@ -1212,6 +1239,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
1212 .tagged_writepages = 1, 1239 .tagged_writepages = 1,
1213 .done = &done, 1240 .done = &done,
1214 .nr_pages = nr, 1241 .nr_pages = nr,
1242 .reason = reason,
1215 }; 1243 };
1216 1244
1217 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1245 WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@ -1228,9 +1256,9 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
1228 * on how many (if any) will be written, and this function does not wait 1256 * on how many (if any) will be written, and this function does not wait
1229 * for IO completion of submitted IO. 1257 * for IO completion of submitted IO.
1230 */ 1258 */
1231void writeback_inodes_sb(struct super_block *sb) 1259void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
1232{ 1260{
1233 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); 1261 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
1234} 1262}
1235EXPORT_SYMBOL(writeback_inodes_sb); 1263EXPORT_SYMBOL(writeback_inodes_sb);
1236 1264
@@ -1241,11 +1269,11 @@ EXPORT_SYMBOL(writeback_inodes_sb);
1241 * Invoke writeback_inodes_sb if no writeback is currently underway. 1269 * Invoke writeback_inodes_sb if no writeback is currently underway.
1242 * Returns 1 if writeback was started, 0 if not. 1270 * Returns 1 if writeback was started, 0 if not.
1243 */ 1271 */
1244int writeback_inodes_sb_if_idle(struct super_block *sb) 1272int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
1245{ 1273{
1246 if (!writeback_in_progress(sb->s_bdi)) { 1274 if (!writeback_in_progress(sb->s_bdi)) {
1247 down_read(&sb->s_umount); 1275 down_read(&sb->s_umount);
1248 writeback_inodes_sb(sb); 1276 writeback_inodes_sb(sb, reason);
1249 up_read(&sb->s_umount); 1277 up_read(&sb->s_umount);
1250 return 1; 1278 return 1;
1251 } else 1279 } else
@@ -1262,11 +1290,12 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1262 * Returns 1 if writeback was started, 0 if not. 1290 * Returns 1 if writeback was started, 0 if not.
1263 */ 1291 */
1264int writeback_inodes_sb_nr_if_idle(struct super_block *sb, 1292int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
1265 unsigned long nr) 1293 unsigned long nr,
1294 enum wb_reason reason)
1266{ 1295{
1267 if (!writeback_in_progress(sb->s_bdi)) { 1296 if (!writeback_in_progress(sb->s_bdi)) {
1268 down_read(&sb->s_umount); 1297 down_read(&sb->s_umount);
1269 writeback_inodes_sb_nr(sb, nr); 1298 writeback_inodes_sb_nr(sb, nr, reason);
1270 up_read(&sb->s_umount); 1299 up_read(&sb->s_umount);
1271 return 1; 1300 return 1;
1272 } else 1301 } else
@@ -1290,6 +1319,7 @@ void sync_inodes_sb(struct super_block *sb)
1290 .nr_pages = LONG_MAX, 1319 .nr_pages = LONG_MAX,
1291 .range_cyclic = 0, 1320 .range_cyclic = 0,
1292 .done = &done, 1321 .done = &done,
1322 .reason = WB_REASON_SYNC,
1293 }; 1323 };
1294 1324
1295 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1325 WARN_ON(!rwsem_is_locked(&sb->s_umount));
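
Illustrative sketch only: with this change every writeback trigger has to name a reason, as the ext4 hunk above already shows. A hypothetical filesystem helper would now look like this.

#include <linux/writeback.h>

/* Hypothetical: nudge background writeback when free space gets tight. */
static void sketch_kick_writeback(struct super_block *sb)
{
	/* The reason only feeds the writeback tracepoints added alongside
	 * wb_reason_name[]; it does not change what gets written. */
	writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
}
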
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b6cca47f7b07..3426521f3205 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -47,6 +47,7 @@
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/spinlock.h> 48#include <linux/spinlock.h>
49#include <linux/stat.h> 49#include <linux/stat.h>
50#include <linux/module.h>
50 51
51#include "fuse_i.h" 52#include "fuse_i.h"
52 53
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7e823bbd2453..cb23c2be731a 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -14,6 +14,7 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/kthread.h> 16#include <linux/kthread.h>
17#include <linux/export.h>
17#include <linux/namei.h> 18#include <linux/namei.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 7da2a06508e5..f79dab83e17b 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -21,6 +21,7 @@
21 */ 21 */
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/export.h>
24#include <linux/ioprio.h> 25#include <linux/ioprio.h>
25#include <linux/blkdev.h> 26#include <linux/blkdev.h>
26#include <linux/capability.h> 27#include <linux/capability.h>
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index de4247021d25..5b6c9d1a2fb9 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -53,6 +53,78 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this,
53 return 0; 53 return 0;
54} 54}
55 55
56/*
57 * jffs2_selected_compress:
58 * @compr: Explicit compression type to use (ie, JFFS2_COMPR_ZLIB).
59 * If 0, just take the first available compression mode.
60 * @data_in: Pointer to uncompressed data
61 * @cpage_out: Pointer to returned pointer to buffer for compressed data
62 * @datalen: On entry, holds the amount of data available for compression.
63 * On exit, expected to hold the amount of data actually compressed.
64 * @cdatalen: On entry, holds the amount of space available for compressed
65 * data. On exit, expected to hold the actual size of the compressed
66 * data.
67 *
68 * Returns: the compression type used. Zero is used to show that the data
69 * could not be compressed; probably because we couldn't find the requested
70 * compression mode.
71 */
72static int jffs2_selected_compress(u8 compr, unsigned char *data_in,
73 unsigned char **cpage_out, u32 *datalen, u32 *cdatalen)
74{
75 struct jffs2_compressor *this;
76 int err, ret = JFFS2_COMPR_NONE;
77 uint32_t orig_slen, orig_dlen;
78 char *output_buf;
79
80 output_buf = kmalloc(*cdatalen, GFP_KERNEL);
81 if (!output_buf) {
82 printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n");
83 return ret;
84 }
85 orig_slen = *datalen;
86 orig_dlen = *cdatalen;
87 spin_lock(&jffs2_compressor_list_lock);
88 list_for_each_entry(this, &jffs2_compressor_list, list) {
89 /* Skip decompress-only and disabled modules */
90 if (!this->compress || this->disabled)
91 continue;
92
93 /* Skip if not the desired compression type */
94 if (compr && (compr != this->compr))
95 continue;
96
97 /*
98 * Either compression type was unspecified, or we found our
99 * compressor; either way, we're good to go.
100 */
101 this->usecount++;
102 spin_unlock(&jffs2_compressor_list_lock);
103
104 *datalen = orig_slen;
105 *cdatalen = orig_dlen;
106 err = this->compress(data_in, output_buf, datalen, cdatalen);
107
108 spin_lock(&jffs2_compressor_list_lock);
109 this->usecount--;
110 if (!err) {
111 /* Success */
112 ret = this->compr;
113 this->stat_compr_blocks++;
114 this->stat_compr_orig_size += *datalen;
115 this->stat_compr_new_size += *cdatalen;
116 break;
117 }
118 }
119 spin_unlock(&jffs2_compressor_list_lock);
120 if (ret == JFFS2_COMPR_NONE)
121 kfree(output_buf);
122 else
123 *cpage_out = output_buf;
124
125 return ret;
126}
127
56/* jffs2_compress: 128/* jffs2_compress:
57 * @data_in: Pointer to uncompressed data 129 * @data_in: Pointer to uncompressed data
58 * @cpage_out: Pointer to returned pointer to buffer for compressed data 130 * @cpage_out: Pointer to returned pointer to buffer for compressed data
@@ -76,47 +148,23 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
76 uint32_t *datalen, uint32_t *cdatalen) 148 uint32_t *datalen, uint32_t *cdatalen)
77{ 149{
78 int ret = JFFS2_COMPR_NONE; 150 int ret = JFFS2_COMPR_NONE;
79 int compr_ret; 151 int mode, compr_ret;
80 struct jffs2_compressor *this, *best=NULL; 152 struct jffs2_compressor *this, *best=NULL;
81 unsigned char *output_buf = NULL, *tmp_buf; 153 unsigned char *output_buf = NULL, *tmp_buf;
82 uint32_t orig_slen, orig_dlen; 154 uint32_t orig_slen, orig_dlen;
83 uint32_t best_slen=0, best_dlen=0; 155 uint32_t best_slen=0, best_dlen=0;
84 156
85 switch (jffs2_compression_mode) { 157 if (c->mount_opts.override_compr)
158 mode = c->mount_opts.compr;
159 else
160 mode = jffs2_compression_mode;
161
162 switch (mode) {
86 case JFFS2_COMPR_MODE_NONE: 163 case JFFS2_COMPR_MODE_NONE:
87 break; 164 break;
88 case JFFS2_COMPR_MODE_PRIORITY: 165 case JFFS2_COMPR_MODE_PRIORITY:
89 output_buf = kmalloc(*cdatalen,GFP_KERNEL); 166 ret = jffs2_selected_compress(0, data_in, cpage_out, datalen,
90 if (!output_buf) { 167 cdatalen);
91 printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n");
92 goto out;
93 }
94 orig_slen = *datalen;
95 orig_dlen = *cdatalen;
96 spin_lock(&jffs2_compressor_list_lock);
97 list_for_each_entry(this, &jffs2_compressor_list, list) {
98 /* Skip decompress-only backwards-compatibility and disabled modules */
99 if ((!this->compress)||(this->disabled))
100 continue;
101
102 this->usecount++;
103 spin_unlock(&jffs2_compressor_list_lock);
104 *datalen = orig_slen;
105 *cdatalen = orig_dlen;
106 compr_ret = this->compress(data_in, output_buf, datalen, cdatalen);
107 spin_lock(&jffs2_compressor_list_lock);
108 this->usecount--;
109 if (!compr_ret) {
110 ret = this->compr;
111 this->stat_compr_blocks++;
112 this->stat_compr_orig_size += *datalen;
113 this->stat_compr_new_size += *cdatalen;
114 break;
115 }
116 }
117 spin_unlock(&jffs2_compressor_list_lock);
118 if (ret == JFFS2_COMPR_NONE)
119 kfree(output_buf);
120 break; 168 break;
121 case JFFS2_COMPR_MODE_SIZE: 169 case JFFS2_COMPR_MODE_SIZE:
122 case JFFS2_COMPR_MODE_FAVOURLZO: 170 case JFFS2_COMPR_MODE_FAVOURLZO:
@@ -174,22 +222,28 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
174 best->stat_compr_orig_size += best_slen; 222 best->stat_compr_orig_size += best_slen;
175 best->stat_compr_new_size += best_dlen; 223 best->stat_compr_new_size += best_dlen;
176 ret = best->compr; 224 ret = best->compr;
225 *cpage_out = output_buf;
177 } 226 }
178 spin_unlock(&jffs2_compressor_list_lock); 227 spin_unlock(&jffs2_compressor_list_lock);
179 break; 228 break;
229 case JFFS2_COMPR_MODE_FORCELZO:
230 ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in,
231 cpage_out, datalen, cdatalen);
232 break;
233 case JFFS2_COMPR_MODE_FORCEZLIB:
234 ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in,
235 cpage_out, datalen, cdatalen);
236 break;
180 default: 237 default:
181 printk(KERN_ERR "JFFS2: unknown compression mode.\n"); 238 printk(KERN_ERR "JFFS2: unknown compression mode.\n");
182 } 239 }
183 out: 240
184 if (ret == JFFS2_COMPR_NONE) { 241 if (ret == JFFS2_COMPR_NONE) {
185 *cpage_out = data_in; 242 *cpage_out = data_in;
186 *datalen = *cdatalen; 243 *datalen = *cdatalen;
187 none_stat_compr_blocks++; 244 none_stat_compr_blocks++;
188 none_stat_compr_size += *datalen; 245 none_stat_compr_size += *datalen;
189 } 246 }
190 else {
191 *cpage_out = output_buf;
192 }
193 return ret; 247 return ret;
194} 248}
195 249
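Taken together, the compr.c hunks factor the old PRIORITY-mode loop into jffs2_selected_compress() and reuse it for the two new forced modes, while a mount-time override (the mount_opts fields added later in this series) takes precedence over the compile-time jffs2_compression_mode. A minimal sketch of the resulting mode selection, using only names that appear in the surrounding diff and trimming the SIZE/FAVOURLZO paths:

/*
 * Sketch only -- not the literal kernel code.  jffs2_sb_info,
 * jffs2_selected_compress() and the JFFS2_COMPR_* constants are all
 * defined elsewhere in this patch series.
 */
static int jffs2_pick_compressor(struct jffs2_sb_info *c,
				 unsigned char *data_in,
				 unsigned char **cpage_out,
				 uint32_t *datalen, uint32_t *cdatalen)
{
	int mode = c->mount_opts.override_compr ? c->mount_opts.compr
						: jffs2_compression_mode;

	switch (mode) {
	case JFFS2_COMPR_MODE_PRIORITY:
		/* 0 means "first enabled compressor wins" */
		return jffs2_selected_compress(0, data_in, cpage_out,
					       datalen, cdatalen);
	case JFFS2_COMPR_MODE_FORCELZO:
		return jffs2_selected_compress(JFFS2_COMPR_LZO, data_in,
					       cpage_out, datalen, cdatalen);
	case JFFS2_COMPR_MODE_FORCEZLIB:
		return jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in,
					       cpage_out, datalen, cdatalen);
	default:
		return JFFS2_COMPR_NONE;	/* caller falls back to raw data */
	}
}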
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 13bb7597ab39..5e91d578f4ed 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -40,6 +40,8 @@
40#define JFFS2_COMPR_MODE_PRIORITY 1 40#define JFFS2_COMPR_MODE_PRIORITY 1
41#define JFFS2_COMPR_MODE_SIZE 2 41#define JFFS2_COMPR_MODE_SIZE 2
42#define JFFS2_COMPR_MODE_FAVOURLZO 3 42#define JFFS2_COMPR_MODE_FAVOURLZO 3
43#define JFFS2_COMPR_MODE_FORCELZO 4
44#define JFFS2_COMPR_MODE_FORCEZLIB 5
43 45
44#define FAVOUR_LZO_PERCENT 80 46#define FAVOUR_LZO_PERCENT 80
45 47
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 7286e44ac665..4b8afe39a87f 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -379,7 +379,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
379 jffs2_do_setattr(inode, &iattr); 379 jffs2_do_setattr(inode, &iattr);
380} 380}
381 381
382int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) 382int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data)
383{ 383{
384 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 384 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
385 385
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 0bc6a6c80a56..55a0c1dceadf 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -29,6 +29,11 @@
29 29
30struct jffs2_inodirty; 30struct jffs2_inodirty;
31 31
32struct jffs2_mount_opts {
33 bool override_compr;
34 unsigned int compr;
35};
36
32/* A struct for the overall file system control. Pointers to 37/* A struct for the overall file system control. Pointers to
33 jffs2_sb_info structs are named `c' in the source code. 38 jffs2_sb_info structs are named `c' in the source code.
34 Nee jffs_control 39 Nee jffs_control
@@ -126,6 +131,7 @@ struct jffs2_sb_info {
126#endif 131#endif
127 132
128 struct jffs2_summary *summary; /* Summary information */ 133 struct jffs2_summary *summary; /* Summary information */
134 struct jffs2_mount_opts mount_opts;
129 135
130#ifdef CONFIG_JFFS2_FS_XATTR 136#ifdef CONFIG_JFFS2_FS_XATTR
131#define XATTRINDEX_HASHSIZE (57) 137#define XATTRINDEX_HASHSIZE (57)
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 6c1755c59c0f..ab65ee3ec858 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -176,7 +176,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags);
176struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, 176struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode,
177 struct jffs2_raw_inode *ri); 177 struct jffs2_raw_inode *ri);
178int jffs2_statfs (struct dentry *, struct kstatfs *); 178int jffs2_statfs (struct dentry *, struct kstatfs *);
179int jffs2_remount_fs (struct super_block *, int *, char *); 179int jffs2_do_remount_fs(struct super_block *, int *, char *);
180int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); 180int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
181void jffs2_gc_release_inode(struct jffs2_sb_info *c, 181void jffs2_gc_release_inode(struct jffs2_sb_info *c,
182 struct jffs2_inode_info *f); 182 struct jffs2_inode_info *f);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 8d8cd3419d02..28107ca136e4 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -275,9 +275,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
275 else 275 else
276 c->mtd->unpoint(c->mtd, 0, c->mtd->size); 276 c->mtd->unpoint(c->mtd, 0, c->mtd->size);
277#endif 277#endif
278 if (s) 278 kfree(s);
279 kfree(s);
280
281 return ret; 279 return ret;
282} 280}
283 281
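The scan.c hunk simply drops a redundant NULL check: kfree() is defined to do nothing when handed a NULL pointer, so guarding it with "if (s)" adds no safety. Illustration only:

#include <linux/slab.h>

static void free_scan_buffer(void *s)
{
	/* kfree(NULL) is a documented no-op, so no NULL check is needed. */
	kfree(s);
}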
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 853b8e300084..e7e974454115 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -17,11 +17,13 @@
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/err.h> 18#include <linux/err.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/parser.h>
20#include <linux/jffs2.h> 21#include <linux/jffs2.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include <linux/mtd/super.h> 23#include <linux/mtd/super.h>
23#include <linux/ctype.h> 24#include <linux/ctype.h>
24#include <linux/namei.h> 25#include <linux/namei.h>
26#include <linux/seq_file.h>
25#include <linux/exportfs.h> 27#include <linux/exportfs.h>
26#include "compr.h" 28#include "compr.h"
27#include "nodelist.h" 29#include "nodelist.h"
@@ -75,6 +77,37 @@ static void jffs2_write_super(struct super_block *sb)
75 unlock_super(sb); 77 unlock_super(sb);
76} 78}
77 79
80static const char *jffs2_compr_name(unsigned int compr)
81{
82 switch (compr) {
83 case JFFS2_COMPR_MODE_NONE:
84 return "none";
85#ifdef CONFIG_JFFS2_LZO
86 case JFFS2_COMPR_MODE_FORCELZO:
87 return "lzo";
88#endif
89#ifdef CONFIG_JFFS2_ZLIB
90 case JFFS2_COMPR_MODE_FORCEZLIB:
91 return "zlib";
92#endif
93 default:
94 /* should never happen; programmer error */
95 WARN_ON(1);
96 return "";
97 }
98}
99
100static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt)
101{
102 struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb);
103 struct jffs2_mount_opts *opts = &c->mount_opts;
104
105 if (opts->override_compr)
106 seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr));
107
108 return 0;
109}
110
78static int jffs2_sync_fs(struct super_block *sb, int wait) 111static int jffs2_sync_fs(struct super_block *sb, int wait)
79{ 112{
80 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 113 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
@@ -133,6 +166,85 @@ static const struct export_operations jffs2_export_ops = {
133 .fh_to_parent = jffs2_fh_to_parent, 166 .fh_to_parent = jffs2_fh_to_parent,
134}; 167};
135 168
169/*
170 * JFFS2 mount options.
171 *
172 * Opt_override_compr: override default compressor
173 * Opt_err: just end of array marker
174 */
175enum {
176 Opt_override_compr,
177 Opt_err,
178};
179
180static const match_table_t tokens = {
181 {Opt_override_compr, "compr=%s"},
182 {Opt_err, NULL},
183};
184
185static int jffs2_parse_options(struct jffs2_sb_info *c, char *data)
186{
187 substring_t args[MAX_OPT_ARGS];
188 char *p, *name;
189
190 if (!data)
191 return 0;
192
193 while ((p = strsep(&data, ","))) {
194 int token;
195
196 if (!*p)
197 continue;
198
199 token = match_token(p, tokens, args);
200 switch (token) {
201 case Opt_override_compr:
202 name = match_strdup(&args[0]);
203
204 if (!name)
205 return -ENOMEM;
206 if (!strcmp(name, "none"))
207 c->mount_opts.compr = JFFS2_COMPR_MODE_NONE;
208#ifdef CONFIG_JFFS2_LZO
209 else if (!strcmp(name, "lzo"))
210 c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO;
211#endif
212#ifdef CONFIG_JFFS2_ZLIB
213 else if (!strcmp(name, "zlib"))
214 c->mount_opts.compr =
215 JFFS2_COMPR_MODE_FORCEZLIB;
216#endif
217 else {
218 printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"",
219 name);
220 kfree(name);
221 return -EINVAL;
222 }
223 kfree(name);
224 c->mount_opts.override_compr = true;
225 break;
226 default:
227 printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n",
228 p);
229 return -EINVAL;
230 }
231 }
232
233 return 0;
234}
235
236static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
237{
238 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
239 int err;
240
241 err = jffs2_parse_options(c, data);
242 if (err)
243 return -EINVAL;
244
245 return jffs2_do_remount_fs(sb, flags, data);
246}
247
136static const struct super_operations jffs2_super_operations = 248static const struct super_operations jffs2_super_operations =
137{ 249{
138 .alloc_inode = jffs2_alloc_inode, 250 .alloc_inode = jffs2_alloc_inode,
@@ -143,6 +255,7 @@ static const struct super_operations jffs2_super_operations =
143 .remount_fs = jffs2_remount_fs, 255 .remount_fs = jffs2_remount_fs,
144 .evict_inode = jffs2_evict_inode, 256 .evict_inode = jffs2_evict_inode,
145 .dirty_inode = jffs2_dirty_inode, 257 .dirty_inode = jffs2_dirty_inode,
258 .show_options = jffs2_show_options,
146 .sync_fs = jffs2_sync_fs, 259 .sync_fs = jffs2_sync_fs,
147}; 260};
148 261
@@ -166,6 +279,12 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
166 c->os_priv = sb; 279 c->os_priv = sb;
167 sb->s_fs_info = c; 280 sb->s_fs_info = c;
168 281
282 ret = jffs2_parse_options(c, data);
283 if (ret) {
284 kfree(c);
285 return -EINVAL;
286 }
287
169 /* Initialize JFFS2 superblock locks, the further initialization will 288 /* Initialize JFFS2 superblock locks, the further initialization will
170 * be done later */ 289 * be done later */
171 mutex_init(&c->alloc_sem); 290 mutex_init(&c->alloc_sem);
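The super.c changes wire the new "compr=" mount option through the generic <linux/parser.h> machinery and expose it again via the new .show_options hook. The jffs2 parser itself is shown in full above; the following stripped-down example (hypothetical names, not jffs2 code) only illustrates the match_table_t / match_token() / match_strdup() pattern it relies on:

#include <linux/parser.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>

enum { Opt_compr, Opt_err };

static const match_table_t example_tokens = {
	{Opt_compr, "compr=%s"},
	{Opt_err, NULL},
};

static int example_parse(char *data)
{
	substring_t args[MAX_OPT_ARGS];
	char *p;

	while ((p = strsep(&data, ","))) {
		if (!*p)
			continue;
		switch (match_token(p, example_tokens, args)) {
		case Opt_compr: {
			char *name = match_strdup(&args[0]);

			if (!name)
				return -ENOMEM;
			/* map "none"/"lzo"/"zlib" to a compression mode here */
			kfree(name);
			break;
		}
		default:
			return -EINVAL;
		}
	}
	return 0;
}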
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4515bea0268f..b09e51d2f81f 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -578,8 +578,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
578 if (!jffs2_is_writebuffered(c)) 578 if (!jffs2_is_writebuffered(c))
579 return 0; 579 return 0;
580 580
581 if (mutex_trylock(&c->alloc_sem)) { 581 if (!mutex_is_locked(&c->alloc_sem)) {
582 mutex_unlock(&c->alloc_sem);
583 printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); 582 printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n");
584 BUG(); 583 BUG();
585 } 584 }
@@ -1026,7 +1025,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
1026 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1025 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1027 struct mtd_oob_ops ops; 1026 struct mtd_oob_ops ops;
1028 1027
1029 ops.mode = MTD_OOB_AUTO; 1028 ops.mode = MTD_OPS_AUTO_OOB;
1030 ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; 1029 ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
1031 ops.oobbuf = c->oobbuf; 1030 ops.oobbuf = c->oobbuf;
1032 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1031 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
@@ -1069,7 +1068,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
1069 struct mtd_oob_ops ops; 1068 struct mtd_oob_ops ops;
1070 int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1069 int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1071 1070
1072 ops.mode = MTD_OOB_AUTO; 1071 ops.mode = MTD_OPS_AUTO_OOB;
1073 ops.ooblen = cmlen; 1072 ops.ooblen = cmlen;
1074 ops.oobbuf = c->oobbuf; 1073 ops.oobbuf = c->oobbuf;
1075 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1074 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
@@ -1095,7 +1094,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
1095 struct mtd_oob_ops ops; 1094 struct mtd_oob_ops ops;
1096 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1095 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1097 1096
1098 ops.mode = MTD_OOB_AUTO; 1097 ops.mode = MTD_OPS_AUTO_OOB;
1099 ops.ooblen = cmlen; 1098 ops.ooblen = cmlen;
1100 ops.oobbuf = (uint8_t *)&oob_cleanmarker; 1099 ops.oobbuf = (uint8_t *)&oob_cleanmarker;
1101 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1100 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
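Two independent cleanups in wbuf.c: the MTD_OOB_AUTO constant follows the MTD rename to MTD_OPS_AUTO_OOB, and the "caller must hold alloc_sem" assertion stops acquiring the very lock it is checking (mutex_trylock() followed by mutex_unlock()) and just inspects its state. A sketch of the resulting assertion idiom; note that mutex_is_locked() only says somebody holds the mutex, not that the current task does:

#include <linux/mutex.h>
#include <linux/kernel.h>
#include <linux/bug.h>

static void assert_mutex_held(struct mutex *lock)
{
	/* Cheap sanity check: the lock must already be taken by a caller. */
	if (!mutex_is_locked(lock)) {
		printk(KERN_CRIT "called with lock not held!\n");
		BUG();
	}
}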
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 583636f745e5..cc5f811ed383 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -67,6 +67,7 @@
67#include <linux/buffer_head.h> /* for sync_blockdev() */ 67#include <linux/buffer_head.h> /* for sync_blockdev() */
68#include <linux/bio.h> 68#include <linux/bio.h>
69#include <linux/freezer.h> 69#include <linux/freezer.h>
70#include <linux/export.h>
70#include <linux/delay.h> 71#include <linux/delay.h>
71#include <linux/mutex.h> 72#include <linux/mutex.h>
72#include <linux/seq_file.h> 73#include <linux/seq_file.h>
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index f2697e4df109..e795c234ea33 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -13,6 +13,7 @@
13#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/module.h>
16#include <linux/mtd/mtd.h> 17#include <linux/mtd/mtd.h>
17#include <linux/statfs.h> 18#include <linux/statfs.h>
18#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 918ad647afea..726e59a9e50f 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
488 struct xdr_stream *xdr, 488 struct xdr_stream *xdr,
489 struct cb_recallanyargs *args) 489 struct cb_recallanyargs *args)
490{ 490{
491 __be32 *p; 491 uint32_t bitmap[2];
492 __be32 *p, status;
492 493
493 args->craa_addr = svc_addr(rqstp); 494 args->craa_addr = svc_addr(rqstp);
494 p = read_buf(xdr, 4); 495 p = read_buf(xdr, 4);
495 if (unlikely(p == NULL)) 496 if (unlikely(p == NULL))
496 return htonl(NFS4ERR_BADXDR); 497 return htonl(NFS4ERR_BADXDR);
497 args->craa_objs_to_keep = ntohl(*p++); 498 args->craa_objs_to_keep = ntohl(*p++);
498 p = read_buf(xdr, 4); 499 status = decode_bitmap(xdr, bitmap);
499 if (unlikely(p == NULL)) 500 if (unlikely(status))
500 return htonl(NFS4ERR_BADXDR); 501 return status;
501 args->craa_type_mask = ntohl(*p); 502 args->craa_type_mask = bitmap[0];
502 503
503 return 0; 504 return 0;
504} 505}
@@ -986,4 +987,5 @@ struct svc_version nfs4_callback_version4 = {
986 .vs_proc = nfs4_callback_procedures1, 987 .vs_proc = nfs4_callback_procedures1,
987 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, 988 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
988 .vs_dispatch = NULL, 989 .vs_dispatch = NULL,
990 .vs_hidden = 1,
989}; 991};
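The CB_RECALL_ANY fix reflects that the type mask is an NFSv4 bitmap4 on the wire -- a counted array of 32-bit mask words -- rather than a single word, so the decoder now goes through decode_bitmap() (and the callback program is additionally marked vs_hidden). A hedged sketch of what such a bitmap decoder looks like, written against the generic xdr_inline_decode() helper rather than this file's read_buf() wrapper, and assuming only the first two mask words matter:

#include <linux/sunrpc/xdr.h>
#include <linux/nfs4.h>

static __be32 example_decode_bitmap(struct xdr_stream *xdr, uint32_t bitmap[2])
{
	__be32 *p;
	unsigned int len, i;

	p = xdr_inline_decode(xdr, 4);
	if (unlikely(p == NULL))
		return htonl(NFS4ERR_BADXDR);
	len = ntohl(*p);			/* number of mask words */

	p = xdr_inline_decode(xdr, len * 4);
	if (unlikely(p == NULL))
		return htonl(NFS4ERR_BADXDR);

	bitmap[0] = bitmap[1] = 0;
	for (i = 0; i < len && i < 2; i++)
		bitmap[i] = ntohl(p[i]);
	return 0;
}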
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 91c01f0a4c3b..0a1f8312b4dc 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -137,11 +137,9 @@ nfs_file_open(struct inode *inode, struct file *filp)
137static int 137static int
138nfs_file_release(struct inode *inode, struct file *filp) 138nfs_file_release(struct inode *inode, struct file *filp)
139{ 139{
140 struct dentry *dentry = filp->f_path.dentry;
141
142 dprintk("NFS: release(%s/%s)\n", 140 dprintk("NFS: release(%s/%s)\n",
143 dentry->d_parent->d_name.name, 141 filp->f_path.dentry->d_parent->d_name.name,
144 dentry->d_name.name); 142 filp->f_path.dentry->d_name.name);
145 143
146 nfs_inc_stats(inode, NFSIOS_VFSRELEASE); 144 nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
147 return nfs_release(inode, filp); 145 return nfs_release(inode, filp);
@@ -228,14 +226,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
228 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 226 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
229 struct inode * inode = dentry->d_inode; 227 struct inode * inode = dentry->d_inode;
230 ssize_t result; 228 ssize_t result;
231 size_t count = iov_length(iov, nr_segs);
232 229
233 if (iocb->ki_filp->f_flags & O_DIRECT) 230 if (iocb->ki_filp->f_flags & O_DIRECT)
234 return nfs_file_direct_read(iocb, iov, nr_segs, pos); 231 return nfs_file_direct_read(iocb, iov, nr_segs, pos);
235 232
236 dprintk("NFS: read(%s/%s, %lu@%lu)\n", 233 dprintk("NFS: read(%s/%s, %lu@%lu)\n",
237 dentry->d_parent->d_name.name, dentry->d_name.name, 234 dentry->d_parent->d_name.name, dentry->d_name.name,
238 (unsigned long) count, (unsigned long) pos); 235 (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
239 236
240 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 237 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
241 if (!result) { 238 if (!result) {
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 09119418402f..a62d36b9a99e 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -31,6 +31,7 @@
31 31
32#include <linux/nfs_fs.h> 32#include <linux/nfs_fs.h>
33#include <linux/nfs_page.h> 33#include <linux/nfs_page.h>
34#include <linux/module.h>
34 35
35#include "internal.h" 36#include "internal.h"
36#include "nfs4filelayout.h" 37#include "nfs4filelayout.h"
@@ -449,9 +450,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
449 450
450 fl->dsaddr = dsaddr; 451 fl->dsaddr = dsaddr;
451 452
452 if (fl->first_stripe_index < 0 || 453 if (fl->first_stripe_index >= dsaddr->stripe_count) {
453 fl->first_stripe_index >= dsaddr->stripe_count) { 454 dprintk("%s Bad first_stripe_index %u\n",
454 dprintk("%s Bad first_stripe_index %d\n",
455 __func__, fl->first_stripe_index); 455 __func__, fl->first_stripe_index);
456 goto out_put; 456 goto out_put;
457 } 457 }
@@ -552,7 +552,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
552 552
553 /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. 553 /* Note that a zero value for num_fh is legal for STRIPE_SPARSE.
554 * Futher checking is done in filelayout_check_layout */ 554 * Futher checking is done in filelayout_check_layout */
555 if (fl->num_fh < 0 || fl->num_fh > 555 if (fl->num_fh >
556 max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) 556 max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT))
557 goto out_err; 557 goto out_err;
558 558
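Both removed checks in nfs4filelayout.c compared unsigned fields (first_stripe_index, num_fh) against zero; for an unsigned type a "< 0" test can never be true, so it is dead code that gcc warns about, and the dprintk format is switched from %d to %u to match. A trivial illustration:

#include <linux/errno.h>

static int validate_stripe_index(unsigned int first_stripe_index,
				 unsigned int stripe_count)
{
	/* "first_stripe_index < 0" would always be false for an unsigned. */
	if (first_stripe_index >= stripe_count)
		return -EINVAL;
	return 0;
}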
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d2ae413c986a..b60fddf606f7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5950,6 +5950,7 @@ static void nfs4_layoutcommit_release(void *calldata)
5950{ 5950{
5951 struct nfs4_layoutcommit_data *data = calldata; 5951 struct nfs4_layoutcommit_data *data = calldata;
5952 struct pnfs_layout_segment *lseg, *tmp; 5952 struct pnfs_layout_segment *lseg, *tmp;
5953 unsigned long *bitlock = &NFS_I(data->args.inode)->flags;
5953 5954
5954 pnfs_cleanup_layoutcommit(data); 5955 pnfs_cleanup_layoutcommit(data);
5955 /* Matched by references in pnfs_set_layoutcommit */ 5956 /* Matched by references in pnfs_set_layoutcommit */
@@ -5959,6 +5960,11 @@ static void nfs4_layoutcommit_release(void *calldata)
5959 &lseg->pls_flags)) 5960 &lseg->pls_flags))
5960 put_lseg(lseg); 5961 put_lseg(lseg);
5961 } 5962 }
5963
5964 clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
5965 smp_mb__after_clear_bit();
5966 wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
5967
5962 put_rpccred(data->cred); 5968 put_rpccred(data->cred);
5963 kfree(data); 5969 kfree(data);
5964} 5970}
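The nfs4proc.c hunk treats the inode flags word as a bit-lock: once the layoutcommit is released, NFS_INO_LAYOUTCOMMITTING is cleared with release semantics, a barrier orders the clear against the wakeup, and any task sleeping in wait_on_bit() on that flag is woken. A condensed sketch of the release side added here (the waiter side is assumed to live elsewhere in fs/nfs):

#include <linux/bitops.h>
#include <linux/wait.h>

static void release_flag_bit(unsigned long *flags, int bit)
{
	clear_bit_unlock(bit, flags);	/* clear with release semantics */
	smp_mb__after_clear_bit();	/* order the clear before the wakeup */
	wake_up_bit(flags, bit);	/* wake wait_on_bit() sleepers */
}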
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1dce12f41a4f..e6161b213ed1 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6602,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp,
6602 if (status) 6602 if (status)
6603 goto out; 6603 goto out;
6604 status = decode_secinfo(xdr, res); 6604 status = decode_secinfo(xdr, res);
6605 if (status)
6606 goto out;
6607out: 6605out:
6608 return status; 6606 return status;
6609} 6607}
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index d0cda12fddc3..c807ab93140e 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -38,21 +38,15 @@
38 */ 38 */
39 39
40#include <linux/module.h> 40#include <linux/module.h>
41#include <scsi/osd_initiator.h> 41#include <scsi/osd_ore.h>
42 42
43#include "objlayout.h" 43#include "objlayout.h"
44 44
45#define NFSDBG_FACILITY NFSDBG_PNFS_LD 45#define NFSDBG_FACILITY NFSDBG_PNFS_LD
46 46
47#define _LLU(x) ((unsigned long long)x)
48
49enum { BIO_MAX_PAGES_KMALLOC =
50 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
51};
52
53struct objio_dev_ent { 47struct objio_dev_ent {
54 struct nfs4_deviceid_node id_node; 48 struct nfs4_deviceid_node id_node;
55 struct osd_dev *od; 49 struct ore_dev od;
56}; 50};
57 51
58static void 52static void
@@ -60,8 +54,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60{ 54{
61 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); 55 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
62 56
63 dprintk("%s: free od=%p\n", __func__, de->od); 57 dprintk("%s: free od=%p\n", __func__, de->od.od);
64 osduld_put_device(de->od); 58 osduld_put_device(de->od.od);
65 kfree(de); 59 kfree(de);
66} 60}
67 61
@@ -98,12 +92,12 @@ _dev_list_add(const struct nfs_server *nfss,
98 nfss->pnfs_curr_ld, 92 nfss->pnfs_curr_ld,
99 nfss->nfs_client, 93 nfss->nfs_client,
100 d_id); 94 d_id);
101 de->od = od; 95 de->od.od = od;
102 96
103 d = nfs4_insert_deviceid_node(&de->id_node); 97 d = nfs4_insert_deviceid_node(&de->id_node);
104 n = container_of(d, struct objio_dev_ent, id_node); 98 n = container_of(d, struct objio_dev_ent, id_node);
105 if (n != de) { 99 if (n != de) {
106 dprintk("%s: Race with other n->od=%p\n", __func__, n->od); 100 dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
107 objio_free_deviceid_node(&de->id_node); 101 objio_free_deviceid_node(&de->id_node);
108 de = n; 102 de = n;
109 } 103 }
@@ -111,28 +105,11 @@ _dev_list_add(const struct nfs_server *nfss,
111 return de; 105 return de;
112} 106}
113 107
114struct caps_buffers {
115 u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
116 u8 creds[OSD_CAP_LEN];
117};
118
119struct objio_segment { 108struct objio_segment {
120 struct pnfs_layout_segment lseg; 109 struct pnfs_layout_segment lseg;
121 110
122 struct pnfs_osd_object_cred *comps; 111 struct ore_layout layout;
123 112 struct ore_components oc;
124 unsigned mirrors_p1;
125 unsigned stripe_unit;
126 unsigned group_width; /* Data stripe_units without integrity comps */
127 u64 group_depth;
128 unsigned group_count;
129
130 unsigned max_io_size;
131
132 unsigned comps_index;
133 unsigned num_comps;
134 /* variable length */
135 struct objio_dev_ent *ods[];
136}; 113};
137 114
138static inline struct objio_segment * 115static inline struct objio_segment *
@@ -141,59 +118,44 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
141 return container_of(lseg, struct objio_segment, lseg); 118 return container_of(lseg, struct objio_segment, lseg);
142} 119}
143 120
144struct objio_state;
145typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
146
147struct objio_state { 121struct objio_state {
148 /* Generic layer */ 122 /* Generic layer */
149 struct objlayout_io_state ol_state; 123 struct objlayout_io_res oir;
150 124
151 struct objio_segment *layout; 125 bool sync;
152 126 /*FIXME: Support for extra_bytes at ore_get_rw_state() */
153 struct kref kref; 127 struct ore_io_state *ios;
154 objio_done_fn done;
155 void *private;
156
157 unsigned long length;
158 unsigned numdevs; /* Actually used devs in this IO */
159 /* A per-device variable array of size numdevs */
160 struct _objio_per_comp {
161 struct bio *bio;
162 struct osd_request *or;
163 unsigned long length;
164 u64 offset;
165 unsigned dev;
166 } per_dev[];
167}; 128};
168 129
169/* Send and wait for a get_device_info of devices in the layout, 130/* Send and wait for a get_device_info of devices in the layout,
170 then look them up with the osd_initiator library */ 131 then look them up with the osd_initiator library */
171static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, 132static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
172 struct objio_segment *objio_seg, unsigned comp, 133 struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
173 gfp_t gfp_flags) 134 gfp_t gfp_flags)
174{ 135{
175 struct pnfs_osd_deviceaddr *deviceaddr; 136 struct pnfs_osd_deviceaddr *deviceaddr;
176 struct nfs4_deviceid *d_id;
177 struct objio_dev_ent *ode; 137 struct objio_dev_ent *ode;
178 struct osd_dev *od; 138 struct osd_dev *od;
179 struct osd_dev_info odi; 139 struct osd_dev_info odi;
180 int err; 140 int err;
181 141
182 d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
183
184 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); 142 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
185 if (ode) 143 if (ode) {
186 return ode; 144 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
145 return 0;
146 }
187 147
188 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); 148 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
189 if (unlikely(err)) { 149 if (unlikely(err)) {
190 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", 150 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
191 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); 151 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
192 return ERR_PTR(err); 152 return err;
193 } 153 }
194 154
195 odi.systemid_len = deviceaddr->oda_systemid.len; 155 odi.systemid_len = deviceaddr->oda_systemid.len;
196 if (odi.systemid_len > sizeof(odi.systemid)) { 156 if (odi.systemid_len > sizeof(odi.systemid)) {
157 dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
158 __func__, sizeof(odi.systemid));
197 err = -EINVAL; 159 err = -EINVAL;
198 goto out; 160 goto out;
199 } else if (odi.systemid_len) 161 } else if (odi.systemid_len)
@@ -218,96 +180,53 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
218 180
219 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, 181 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
220 gfp_flags); 182 gfp_flags);
221 183 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
184 dprintk("Adding new dev_id(%llx:%llx)\n",
185 _DEVID_LO(d_id), _DEVID_HI(d_id));
222out: 186out:
223 dprintk("%s: return=%d\n", __func__, err);
224 objlayout_put_deviceinfo(deviceaddr); 187 objlayout_put_deviceinfo(deviceaddr);
225 return err ? ERR_PTR(err) : ode; 188 return err;
226} 189}
227 190
228static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, 191static void copy_single_comp(struct ore_components *oc, unsigned c,
229 struct objio_segment *objio_seg, 192 struct pnfs_osd_object_cred *src_comp)
230 gfp_t gfp_flags)
231{ 193{
232 unsigned i; 194 struct ore_comp *ocomp = &oc->comps[c];
233 int err;
234 195
235 /* lookup all devices */ 196 WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
236 for (i = 0; i < objio_seg->num_comps; i++) { 197 WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
237 struct objio_dev_ent *ode;
238 198
239 ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); 199 ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
240 if (unlikely(IS_ERR(ode))) { 200 ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
241 err = PTR_ERR(ode);
242 goto out;
243 }
244 objio_seg->ods[i] = ode;
245 }
246 err = 0;
247 201
248out: 202 memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
249 dprintk("%s: return=%d\n", __func__, err);
250 return err;
251} 203}
252 204
253static int _verify_data_map(struct pnfs_osd_layout *layout) 205int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
206 struct objio_segment **pseg)
254{ 207{
255 struct pnfs_osd_data_map *data_map = &layout->olo_map; 208 struct __alloc_objio_segment {
256 u64 stripe_length; 209 struct objio_segment olseg;
257 u32 group_width; 210 struct ore_dev *ods[numdevs];
258 211 struct ore_comp comps[numdevs];
259/* FIXME: Only raid0 for now. if not go through MDS */ 212 } *aolseg;
260 if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
261 printk(KERN_ERR "Only RAID_0 for now\n");
262 return -ENOTSUPP;
263 }
264 if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
265 printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
266 data_map->odm_num_comps, data_map->odm_mirror_cnt);
267 return -EINVAL;
268 }
269 213
270 if (data_map->odm_group_width) 214 aolseg = kzalloc(sizeof(*aolseg), gfp_flags);
271 group_width = data_map->odm_group_width; 215 if (unlikely(!aolseg)) {
272 else 216 dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__,
273 group_width = data_map->odm_num_comps / 217 numdevs, sizeof(*aolseg));
274 (data_map->odm_mirror_cnt + 1); 218 return -ENOMEM;
275
276 stripe_length = (u64)data_map->odm_stripe_unit * group_width;
277 if (stripe_length >= (1ULL << 32)) {
278 printk(KERN_ERR "Total Stripe length(0x%llx)"
279 " >= 32bit is not supported\n", _LLU(stripe_length));
280 return -ENOTSUPP;
281 } 219 }
282 220
283 if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { 221 aolseg->olseg.oc.numdevs = numdevs;
284 printk(KERN_ERR "Stripe Unit(0x%llx)" 222 aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS;
285 " must be Multples of PAGE_SIZE(0x%lx)\n", 223 aolseg->olseg.oc.comps = aolseg->comps;
286 _LLU(data_map->odm_stripe_unit), PAGE_SIZE); 224 aolseg->olseg.oc.ods = aolseg->ods;
287 return -ENOTSUPP;
288 }
289 225
226 *pseg = &aolseg->olseg;
290 return 0; 227 return 0;
291} 228}
292 229
293static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
294 struct pnfs_osd_object_cred *src_comp,
295 struct caps_buffers *caps_p)
296{
297 WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
298 WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
299
300 *cur_comp = *src_comp;
301
302 memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
303 sizeof(caps_p->caps_key));
304 cur_comp->oc_cap_key.cred = caps_p->caps_key;
305
306 memcpy(caps_p->creds, src_comp->oc_cap.cred,
307 sizeof(caps_p->creds));
308 cur_comp->oc_cap.cred = caps_p->creds;
309}
310
311int objio_alloc_lseg(struct pnfs_layout_segment **outp, 230int objio_alloc_lseg(struct pnfs_layout_segment **outp,
312 struct pnfs_layout_hdr *pnfslay, 231 struct pnfs_layout_hdr *pnfslay,
313 struct pnfs_layout_range *range, 232 struct pnfs_layout_range *range,
@@ -317,59 +236,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
317 struct objio_segment *objio_seg; 236 struct objio_segment *objio_seg;
318 struct pnfs_osd_xdr_decode_layout_iter iter; 237 struct pnfs_osd_xdr_decode_layout_iter iter;
319 struct pnfs_osd_layout layout; 238 struct pnfs_osd_layout layout;
320 struct pnfs_osd_object_cred *cur_comp, src_comp; 239 struct pnfs_osd_object_cred src_comp;
321 struct caps_buffers *caps_p; 240 unsigned cur_comp;
322 int err; 241 int err;
323 242
324 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); 243 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
325 if (unlikely(err)) 244 if (unlikely(err))
326 return err; 245 return err;
327 246
328 err = _verify_data_map(&layout); 247 err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
329 if (unlikely(err)) 248 if (unlikely(err))
330 return err; 249 return err;
331 250
332 objio_seg = kzalloc(sizeof(*objio_seg) + 251 objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
333 sizeof(objio_seg->ods[0]) * layout.olo_num_comps + 252 objio_seg->layout.group_width = layout.olo_map.odm_group_width;
334 sizeof(*objio_seg->comps) * layout.olo_num_comps + 253 objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
335 sizeof(struct caps_buffers) * layout.olo_num_comps, 254 objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
336 gfp_flags); 255 objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
337 if (!objio_seg)
338 return -ENOMEM;
339 256
340 objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); 257 err = ore_verify_layout(layout.olo_map.odm_num_comps,
341 cur_comp = objio_seg->comps; 258 &objio_seg->layout);
342 caps_p = (void *)(cur_comp + layout.olo_num_comps);
343 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
344 copy_single_comp(cur_comp++, &src_comp, caps_p++);
345 if (unlikely(err)) 259 if (unlikely(err))
346 goto err; 260 goto err;
347 261
348 objio_seg->num_comps = layout.olo_num_comps; 262 objio_seg->oc.first_dev = layout.olo_comps_index;
349 objio_seg->comps_index = layout.olo_comps_index; 263 cur_comp = 0;
350 err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); 264 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
351 if (err) 265 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
352 goto err; 266 err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
353 267 &src_comp.oc_object_id.oid_device_id,
354 objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; 268 gfp_flags);
355 objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; 269 if (err)
356 if (layout.olo_map.odm_group_width) { 270 goto err;
357 objio_seg->group_width = layout.olo_map.odm_group_width; 271 ++cur_comp;
358 objio_seg->group_depth = layout.olo_map.odm_group_depth;
359 objio_seg->group_count = layout.olo_map.odm_num_comps /
360 objio_seg->mirrors_p1 /
361 objio_seg->group_width;
362 } else {
363 objio_seg->group_width = layout.olo_map.odm_num_comps /
364 objio_seg->mirrors_p1;
365 objio_seg->group_depth = -1;
366 objio_seg->group_count = 1;
367 } 272 }
368 273 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
369 /* Cache this calculation it will hit for every page */ 274 if (unlikely(err))
370 objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - 275 goto err;
371 objio_seg->stripe_unit) *
372 objio_seg->group_width;
373 276
374 *outp = &objio_seg->lseg; 277 *outp = &objio_seg->lseg;
375 return 0; 278 return 0;
@@ -386,43 +289,63 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
386 int i; 289 int i;
387 struct objio_segment *objio_seg = OBJIO_LSEG(lseg); 290 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
388 291
389 for (i = 0; i < objio_seg->num_comps; i++) { 292 for (i = 0; i < objio_seg->oc.numdevs; i++) {
390 if (!objio_seg->ods[i]) 293 struct ore_dev *od = objio_seg->oc.ods[i];
294 struct objio_dev_ent *ode;
295
296 if (!od)
391 break; 297 break;
392 nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); 298 ode = container_of(od, typeof(*ode), od);
299 nfs4_put_deviceid_node(&ode->id_node);
393 } 300 }
394 kfree(objio_seg); 301 kfree(objio_seg);
395} 302}
396 303
397int objio_alloc_io_state(struct pnfs_layout_segment *lseg, 304static int
398 struct objlayout_io_state **outp, 305objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
399 gfp_t gfp_flags) 306 struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
307 loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
308 struct objio_state **outp)
400{ 309{
401 struct objio_segment *objio_seg = OBJIO_LSEG(lseg); 310 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
402 struct objio_state *ios; 311 struct ore_io_state *ios;
403 const unsigned first_size = sizeof(*ios) + 312 int ret;
404 objio_seg->num_comps * sizeof(ios->per_dev[0]); 313 struct __alloc_objio_state {
405 const unsigned sec_size = objio_seg->num_comps * 314 struct objio_state objios;
406 sizeof(ios->ol_state.ioerrs[0]); 315 struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
407 316 } *aos;
408 ios = kzalloc(first_size + sec_size, gfp_flags); 317
409 if (unlikely(!ios)) 318 aos = kzalloc(sizeof(*aos), gfp_flags);
319 if (unlikely(!aos))
410 return -ENOMEM; 320 return -ENOMEM;
411 321
412 ios->layout = objio_seg; 322 objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
413 ios->ol_state.ioerrs = ((void *)ios) + first_size; 323 aos->ioerrs, rpcdata, pnfs_layout_type);
414 ios->ol_state.num_comps = objio_seg->num_comps;
415 324
416 *outp = &ios->ol_state; 325 ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
326 offset, count, &ios);
327 if (unlikely(ret)) {
328 kfree(aos);
329 return ret;
330 }
331
332 ios->pages = pages;
333 ios->pgbase = pgbase;
334 ios->private = aos;
335 BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
336
337 aos->objios.sync = 0;
338 aos->objios.ios = ios;
339 *outp = &aos->objios;
417 return 0; 340 return 0;
418} 341}
419 342
420void objio_free_io_state(struct objlayout_io_state *ol_state) 343void objio_free_result(struct objlayout_io_res *oir)
421{ 344{
422 struct objio_state *ios = container_of(ol_state, struct objio_state, 345 struct objio_state *objios = container_of(oir, struct objio_state, oir);
423 ol_state);
424 346
425 kfree(ios); 347 ore_put_io_state(objios->ios);
348 kfree(objios);
426} 349}
427 350
428enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) 351enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
@@ -455,539 +378,152 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
455 } 378 }
456} 379}
457 380
458static void _clear_bio(struct bio *bio) 381static void __on_dev_error(struct ore_io_state *ios,
382 struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
383 u64 dev_offset, u64 dev_len)
459{ 384{
460 struct bio_vec *bv; 385 struct objio_state *objios = ios->private;
461 unsigned i; 386 struct pnfs_osd_objid pooid;
462 387 struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
463 __bio_for_each_segment(bv, bio, i, 0) { 388 /* FIXME: what to do with more-then-one-group layouts. We need to
464 unsigned this_count = bv->bv_len; 389 * translate from ore_io_state index to oc->comps index
465 390 */
466 if (likely(PAGE_SIZE == this_count)) 391 unsigned comp = dev_index;
467 clear_highpage(bv->bv_page);
468 else
469 zero_user(bv->bv_page, bv->bv_offset, this_count);
470 }
471}
472
473static int _io_check(struct objio_state *ios, bool is_write)
474{
475 enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
476 int lin_ret = 0;
477 int i;
478
479 for (i = 0; i < ios->numdevs; i++) {
480 struct osd_sense_info osi;
481 struct osd_request *or = ios->per_dev[i].or;
482 int ret;
483
484 if (!or)
485 continue;
486 392
487 ret = osd_req_decode_sense(or, &osi); 393 pooid.oid_device_id = ode->id_node.deviceid;
488 if (likely(!ret)) 394 pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
489 continue; 395 pooid.oid_object_id = ios->oc->comps[comp].obj.id;
490 396
491 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 397 objlayout_io_set_result(&objios->oir, comp,
492 /* start read offset passed endof file */ 398 &pooid, osd_pri_2_pnfs_err(oep),
493 BUG_ON(is_write); 399 dev_offset, dev_len, !ios->reading);
494 _clear_bio(ios->per_dev[i].bio);
495 dprintk("%s: start read offset passed end of file "
496 "offset=0x%llx, length=0x%lx\n", __func__,
497 _LLU(ios->per_dev[i].offset),
498 ios->per_dev[i].length);
499
500 continue; /* we recovered */
501 }
502 objlayout_io_set_result(&ios->ol_state, i,
503 &ios->layout->comps[i].oc_object_id,
504 osd_pri_2_pnfs_err(osi.osd_err_pri),
505 ios->per_dev[i].offset,
506 ios->per_dev[i].length,
507 is_write);
508
509 if (osi.osd_err_pri >= oep) {
510 oep = osi.osd_err_pri;
511 lin_ret = ret;
512 }
513 }
514
515 return lin_ret;
516}
517
518/*
519 * Common IO state helpers.
520 */
521static void _io_free(struct objio_state *ios)
522{
523 unsigned i;
524
525 for (i = 0; i < ios->numdevs; i++) {
526 struct _objio_per_comp *per_dev = &ios->per_dev[i];
527
528 if (per_dev->or) {
529 osd_end_request(per_dev->or);
530 per_dev->or = NULL;
531 }
532
533 if (per_dev->bio) {
534 bio_put(per_dev->bio);
535 per_dev->bio = NULL;
536 }
537 }
538}
539
540struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
541{
542 unsigned min_dev = ios->layout->comps_index;
543 unsigned max_dev = min_dev + ios->layout->num_comps;
544
545 BUG_ON(dev < min_dev || max_dev <= dev);
546 return ios->layout->ods[dev - min_dev]->od;
547}
548
549struct _striping_info {
550 u64 obj_offset;
551 u64 group_length;
552 unsigned dev;
553 unsigned unit_off;
554};
555
556static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
557 struct _striping_info *si)
558{
559 u32 stripe_unit = ios->layout->stripe_unit;
560 u32 group_width = ios->layout->group_width;
561 u64 group_depth = ios->layout->group_depth;
562 u32 U = stripe_unit * group_width;
563
564 u64 T = U * group_depth;
565 u64 S = T * ios->layout->group_count;
566 u64 M = div64_u64(file_offset, S);
567
568 /*
569 G = (L - (M * S)) / T
570 H = (L - (M * S)) % T
571 */
572 u64 LmodU = file_offset - M * S;
573 u32 G = div64_u64(LmodU, T);
574 u64 H = LmodU - G * T;
575
576 u32 N = div_u64(H, U);
577
578 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
579 si->obj_offset = si->unit_off + (N * stripe_unit) +
580 (M * group_depth * stripe_unit);
581
582 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
583 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
584 si->dev *= ios->layout->mirrors_p1;
585
586 si->group_length = T - H;
587}
588
589static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
590 unsigned pgbase, struct _objio_per_comp *per_dev, int len,
591 gfp_t gfp_flags)
592{
593 unsigned pg = *cur_pg;
594 int cur_len = len;
595 struct request_queue *q =
596 osd_request_queue(_io_od(ios, per_dev->dev));
597
598 if (per_dev->bio == NULL) {
599 unsigned pages_in_stripe = ios->layout->group_width *
600 (ios->layout->stripe_unit / PAGE_SIZE);
601 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
602 ios->layout->group_width;
603
604 if (BIO_MAX_PAGES_KMALLOC < bio_size)
605 bio_size = BIO_MAX_PAGES_KMALLOC;
606
607 per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
608 if (unlikely(!per_dev->bio)) {
609 dprintk("Faild to allocate BIO size=%u\n", bio_size);
610 return -ENOMEM;
611 }
612 }
613
614 while (cur_len > 0) {
615 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
616 unsigned added_len;
617
618 BUG_ON(ios->ol_state.nr_pages <= pg);
619 cur_len -= pglen;
620
621 added_len = bio_add_pc_page(q, per_dev->bio,
622 ios->ol_state.pages[pg], pglen, pgbase);
623 if (unlikely(pglen != added_len))
624 return -ENOMEM;
625 pgbase = 0;
626 ++pg;
627 }
628 BUG_ON(cur_len);
629
630 per_dev->length += len;
631 *cur_pg = pg;
632 return 0;
633}
634
635static int _prepare_one_group(struct objio_state *ios, u64 length,
636 struct _striping_info *si, unsigned *last_pg,
637 gfp_t gfp_flags)
638{
639 unsigned stripe_unit = ios->layout->stripe_unit;
640 unsigned mirrors_p1 = ios->layout->mirrors_p1;
641 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
642 unsigned dev = si->dev;
643 unsigned first_dev = dev - (dev % devs_in_group);
644 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
645 unsigned cur_pg = *last_pg;
646 int ret = 0;
647
648 while (length) {
649 struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
650 unsigned cur_len, page_off = 0;
651
652 if (!per_dev->length) {
653 per_dev->dev = dev;
654 if (dev < si->dev) {
655 per_dev->offset = si->obj_offset + stripe_unit -
656 si->unit_off;
657 cur_len = stripe_unit;
658 } else if (dev == si->dev) {
659 per_dev->offset = si->obj_offset;
660 cur_len = stripe_unit - si->unit_off;
661 page_off = si->unit_off & ~PAGE_MASK;
662 BUG_ON(page_off &&
663 (page_off != ios->ol_state.pgbase));
664 } else { /* dev > si->dev */
665 per_dev->offset = si->obj_offset - si->unit_off;
666 cur_len = stripe_unit;
667 }
668
669 if (max_comp < dev - first_dev)
670 max_comp = dev - first_dev;
671 } else {
672 cur_len = stripe_unit;
673 }
674 if (cur_len >= length)
675 cur_len = length;
676
677 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
678 cur_len, gfp_flags);
679 if (unlikely(ret))
680 goto out;
681
682 dev += mirrors_p1;
683 dev = (dev % devs_in_group) + first_dev;
684
685 length -= cur_len;
686 ios->length += cur_len;
687 }
688out:
689 ios->numdevs = max_comp + mirrors_p1;
690 *last_pg = cur_pg;
691 return ret;
692}
693
694static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
695{
696 u64 length = ios->ol_state.count;
697 u64 offset = ios->ol_state.offset;
698 struct _striping_info si;
699 unsigned last_pg = 0;
700 int ret = 0;
701
702 while (length) {
703 _calc_stripe_info(ios, offset, &si);
704
705 if (length < si.group_length)
706 si.group_length = length;
707
708 ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
709 if (unlikely(ret))
710 goto out;
711
712 offset += si.group_length;
713 length -= si.group_length;
714 }
715
716out:
717 if (!ios->length)
718 return ret;
719
720 return 0;
721}
722
723static ssize_t _sync_done(struct objio_state *ios)
724{
725 struct completion *waiting = ios->private;
726
727 complete(waiting);
728 return 0;
729}
730
731static void _last_io(struct kref *kref)
732{
733 struct objio_state *ios = container_of(kref, struct objio_state, kref);
734
735 ios->done(ios);
736}
737
738static void _done_io(struct osd_request *or, void *p)
739{
740 struct objio_state *ios = p;
741
742 kref_put(&ios->kref, _last_io);
743}
744
745static ssize_t _io_exec(struct objio_state *ios)
746{
747 DECLARE_COMPLETION_ONSTACK(wait);
748 ssize_t status = 0; /* sync status */
749 unsigned i;
750 objio_done_fn saved_done_fn = ios->done;
751 bool sync = ios->ol_state.sync;
752
753 if (sync) {
754 ios->done = _sync_done;
755 ios->private = &wait;
756 }
757
758 kref_init(&ios->kref);
759
760 for (i = 0; i < ios->numdevs; i++) {
761 struct osd_request *or = ios->per_dev[i].or;
762
763 if (!or)
764 continue;
765
766 kref_get(&ios->kref);
767 osd_execute_request_async(or, _done_io, ios);
768 }
769
770 kref_put(&ios->kref, _last_io);
771
772 if (sync) {
773 wait_for_completion(&wait);
774 status = saved_done_fn(ios);
775 }
776
777 return status;
778} 400}
779 401
780/* 402/*
781 * read 403 * read
782 */ 404 */
783static ssize_t _read_done(struct objio_state *ios) 405static void _read_done(struct ore_io_state *ios, void *private)
784{ 406{
407 struct objio_state *objios = private;
785 ssize_t status; 408 ssize_t status;
786 int ret = _io_check(ios, false); 409 int ret = ore_check_io(ios, &__on_dev_error);
787 410
788 _io_free(ios); 411 /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
789 412
790 if (likely(!ret)) 413 if (likely(!ret))
791 status = ios->length; 414 status = ios->length;
792 else 415 else
793 status = ret; 416 status = ret;
794 417
795 objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); 418 objlayout_read_done(&objios->oir, status, objios->sync);
796 return status;
797} 419}
798 420
799static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) 421int objio_read_pagelist(struct nfs_read_data *rdata)
800{ 422{
801 struct osd_request *or = NULL; 423 struct objio_state *objios;
802 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
803 unsigned dev = per_dev->dev;
804 struct pnfs_osd_object_cred *cred =
805 &ios->layout->comps[cur_comp];
806 struct osd_obj_id obj = {
807 .partition = cred->oc_object_id.oid_partition_id,
808 .id = cred->oc_object_id.oid_object_id,
809 };
810 int ret; 424 int ret;
811 425
812 or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); 426 ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
813 if (unlikely(!or)) { 427 rdata->lseg, rdata->args.pages, rdata->args.pgbase,
814 ret = -ENOMEM; 428 rdata->args.offset, rdata->args.count, rdata,
815 goto err; 429 GFP_KERNEL, &objios);
816 }
817 per_dev->or = or;
818
819 osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
820
821 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
822 if (ret) {
823 dprintk("%s: Faild to osd_finalize_request() => %d\n",
824 __func__, ret);
825 goto err;
826 }
827
828 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
829 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
830 per_dev->length);
831
832err:
833 return ret;
834}
835
836static ssize_t _read_exec(struct objio_state *ios)
837{
838 unsigned i;
839 int ret;
840
841 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
842 if (!ios->per_dev[i].length)
843 continue;
844 ret = _read_mirrors(ios, i);
845 if (unlikely(ret))
846 goto err;
847 }
848
849 ios->done = _read_done;
850 return _io_exec(ios); /* In sync mode exec returns the io status */
851
852err:
853 _io_free(ios);
854 return ret;
855}
856
857ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
858{
859 struct objio_state *ios = container_of(ol_state, struct objio_state,
860 ol_state);
861 int ret;
862
863 ret = _io_rw_pagelist(ios, GFP_KERNEL);
864 if (unlikely(ret)) 430 if (unlikely(ret))
865 return ret; 431 return ret;
866 432
867 return _read_exec(ios); 433 objios->ios->done = _read_done;
434 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
435 rdata->args.offset, rdata->args.count);
436 return ore_read(objios->ios);
868} 437}
869 438
870/* 439/*
871 * write 440 * write
872 */ 441 */
873static ssize_t _write_done(struct objio_state *ios) 442static void _write_done(struct ore_io_state *ios, void *private)
874{ 443{
444 struct objio_state *objios = private;
875 ssize_t status; 445 ssize_t status;
876 int ret = _io_check(ios, true); 446 int ret = ore_check_io(ios, &__on_dev_error);
877 447
878 _io_free(ios); 448 /* FIXME: _io_free(ios) can we dealocate the libosd resources; */
879 449
880 if (likely(!ret)) { 450 if (likely(!ret)) {
881 /* FIXME: should be based on the OSD's persistence model 451 /* FIXME: should be based on the OSD's persistence model
882 * See OSD2r05 Section 4.13 Data persistence model */ 452 * See OSD2r05 Section 4.13 Data persistence model */
883 ios->ol_state.committed = NFS_FILE_SYNC; 453 objios->oir.committed = NFS_FILE_SYNC;
884 status = ios->length; 454 status = ios->length;
885 } else { 455 } else {
886 status = ret; 456 status = ret;
887 } 457 }
888 458
889 objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); 459 objlayout_write_done(&objios->oir, status, objios->sync);
890 return status;
891} 460}
892 461
893static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) 462static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
894{ 463{
895 struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; 464 struct objio_state *objios = priv;
896 unsigned dev = ios->per_dev[cur_comp].dev; 465 struct nfs_write_data *wdata = objios->oir.rpcdata;
897 unsigned last_comp = cur_comp + ios->layout->mirrors_p1; 466 pgoff_t index = offset / PAGE_SIZE;
898 int ret; 467 struct page *page = find_get_page(wdata->inode->i_mapping, index);
899
900 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
901 struct osd_request *or = NULL;
902 struct pnfs_osd_object_cred *cred =
903 &ios->layout->comps[cur_comp];
904 struct osd_obj_id obj = {
905 .partition = cred->oc_object_id.oid_partition_id,
906 .id = cred->oc_object_id.oid_object_id,
907 };
908 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
909 struct bio *bio;
910
911 or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
912 if (unlikely(!or)) {
913 ret = -ENOMEM;
914 goto err;
915 }
916 per_dev->or = or;
917
918 if (per_dev != master_dev) {
919 bio = bio_kmalloc(GFP_NOFS,
920 master_dev->bio->bi_max_vecs);
921 if (unlikely(!bio)) {
922 dprintk("Faild to allocate BIO size=%u\n",
923 master_dev->bio->bi_max_vecs);
924 ret = -ENOMEM;
925 goto err;
926 }
927
928 __bio_clone(bio, master_dev->bio);
929 bio->bi_bdev = NULL;
930 bio->bi_next = NULL;
931 per_dev->bio = bio;
932 per_dev->dev = dev;
933 per_dev->length = master_dev->length;
934 per_dev->offset = master_dev->offset;
935 } else {
936 bio = master_dev->bio;
937 bio->bi_rw |= REQ_WRITE;
938 }
939
940 osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
941 468
942 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); 469 if (!page) {
943 if (ret) { 470 page = find_or_create_page(wdata->inode->i_mapping,
944 dprintk("%s: Faild to osd_finalize_request() => %d\n", 471 index, GFP_NOFS);
945 __func__, ret); 472 if (unlikely(!page)) {
946 goto err; 473 dprintk("%s: grab_cache_page Failed index=0x%lx\n",
474 __func__, index);
475 return NULL;
947 } 476 }
948 477 unlock_page(page);
949 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
950 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
951 per_dev->length);
952 } 478 }
479 if (PageDirty(page) || PageWriteback(page))
480 *uptodate = true;
481 else
482 *uptodate = PageUptodate(page);
483 dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
484 return page;
485}
953 486
954err: 487static void __r4w_put_page(void *priv, struct page *page)
955 return ret; 488{
489 dprintk("%s: index=0x%lx\n", __func__, page->index);
490 page_cache_release(page);
491 return;
956} 492}
957 493
958static ssize_t _write_exec(struct objio_state *ios) 494static const struct _ore_r4w_op _r4w_op = {
495 .get_page = &__r4w_get_page,
496 .put_page = &__r4w_put_page,
497};
498
499int objio_write_pagelist(struct nfs_write_data *wdata, int how)
959{ 500{
960 unsigned i; 501 struct objio_state *objios;
961 int ret; 502 int ret;
962 503
963 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 504 ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false,
964 if (!ios->per_dev[i].length) 505 wdata->lseg, wdata->args.pages, wdata->args.pgbase,
965 continue; 506 wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
966 ret = _write_mirrors(ios, i); 507 &objios);
967 if (unlikely(ret)) 508 if (unlikely(ret))
968 goto err; 509 return ret;
969 }
970
971 ios->done = _write_done;
972 return _io_exec(ios); /* In sync mode exec returns the io->status */
973 510
974err: 511 objios->sync = 0 != (how & FLUSH_SYNC);
975 _io_free(ios); 512 objios->ios->r4w = &_r4w_op;
976 return ret;
977}
978 513
979ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) 514 if (!objios->sync)
980{ 515 objios->ios->done = _write_done;
981 struct objio_state *ios = container_of(ol_state, struct objio_state,
982 ol_state);
983 int ret;
984 516
985 /* TODO: ios->stable = stable; */ 517 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
986 ret = _io_rw_pagelist(ios, GFP_NOFS); 518 wdata->args.offset, wdata->args.count);
519 ret = ore_write(objios->ios);
987 if (unlikely(ret)) 520 if (unlikely(ret))
988 return ret; 521 return ret;
989 522
990 return _write_exec(ios); 523 if (objios->sync)
524 _write_done(objios->ios, objios);
525
526 return 0;
991} 527}
992 528
993static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, 529static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
@@ -997,7 +533,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
997 return false; 533 return false;
998 534
999 return pgio->pg_count + req->wb_bytes <= 535 return pgio->pg_count + req->wb_bytes <=
1000 OBJIO_LSEG(pgio->pg_lseg)->max_io_size; 536 OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
1001} 537}
1002 538
1003static const struct nfs_pageio_ops objio_pg_read_ops = { 539static const struct nfs_pageio_ops objio_pg_read_ops = {
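The bulk of objio_osd.c is rewritten to sit on top of the shared ORE (the object RAID engine used by exofs): striping math, BIO building, and mirror handling move into the library, leaving the layout driver to build an ore_io_state, attach the pages and a completion callback, and submit with ore_read()/ore_write(); device errors are translated in the callback via ore_check_io() and __on_dev_error(). A condensed sketch of the post-patch read path, using only names from the diff and with error handling trimmed:

/* Condensed from the diff above; not a drop-in replacement. */
static void sketch_read_done(struct ore_io_state *ios, void *private)
{
	struct objio_state *objios = private;
	int ret = ore_check_io(ios, &__on_dev_error);
	ssize_t status;

	/* report either the error or the number of bytes transferred */
	if (likely(!ret))
		status = ios->length;
	else
		status = ret;

	objlayout_read_done(&objios->oir, status, objios->sync);
}

static int sketch_read_pagelist(struct nfs_read_data *rdata)
{
	struct objio_state *objios;
	int ret;

	ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
				   rdata->lseg, rdata->args.pages,
				   rdata->args.pgbase, rdata->args.offset,
				   rdata->args.count, rdata, GFP_KERNEL,
				   &objios);
	if (unlikely(ret))
		return ret;

	objios->ios->done = sketch_read_done;	/* ORE completion callback */
	return ore_read(objios->ios);		/* hand the I/O to the ORE */
}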
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 1d06f8e2adea..72074e3a04f9 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -156,77 +156,39 @@ last_byte_offset(u64 start, u64 len)
156 return end > start ? end - 1 : NFS4_MAX_UINT64; 156 return end > start ? end - 1 : NFS4_MAX_UINT64;
157} 157}
158 158
159static struct objlayout_io_state * 159void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
160objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, 160 struct page ***p_pages, unsigned *p_pgbase,
161 struct page **pages, 161 u64 offset, unsigned long count)
162 unsigned pgbase,
163 loff_t offset,
164 size_t count,
165 struct pnfs_layout_segment *lseg,
166 void *rpcdata,
167 gfp_t gfp_flags)
168{ 162{
169 struct objlayout_io_state *state;
170 u64 lseg_end_offset; 163 u64 lseg_end_offset;
171 164
172 dprintk("%s: allocating io_state\n", __func__);
173 if (objio_alloc_io_state(lseg, &state, gfp_flags))
174 return NULL;
175
176 BUG_ON(offset < lseg->pls_range.offset); 165 BUG_ON(offset < lseg->pls_range.offset);
177 lseg_end_offset = end_offset(lseg->pls_range.offset, 166 lseg_end_offset = end_offset(lseg->pls_range.offset,
178 lseg->pls_range.length); 167 lseg->pls_range.length);
179 BUG_ON(offset >= lseg_end_offset); 168 BUG_ON(offset >= lseg_end_offset);
180 if (offset + count > lseg_end_offset) { 169 WARN_ON(offset + count > lseg_end_offset);
181 count = lseg->pls_range.length -
182 (offset - lseg->pls_range.offset);
183 dprintk("%s: truncated count %Zd\n", __func__, count);
184 }
185 170
186 if (pgbase > PAGE_SIZE) { 171 if (*p_pgbase > PAGE_SIZE) {
187 pages += pgbase >> PAGE_SHIFT; 172 dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase);
188 pgbase &= ~PAGE_MASK; 173 *p_pages += *p_pgbase >> PAGE_SHIFT;
174 *p_pgbase &= ~PAGE_MASK;
189 } 175 }
190
191 INIT_LIST_HEAD(&state->err_list);
192 state->lseg = lseg;
193 state->rpcdata = rpcdata;
194 state->pages = pages;
195 state->pgbase = pgbase;
196 state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
197 state->offset = offset;
198 state->count = count;
199 state->sync = 0;
200
201 return state;
202}
203
204static void
205objlayout_free_io_state(struct objlayout_io_state *state)
206{
207 dprintk("%s: freeing io_state\n", __func__);
208 if (unlikely(!state))
209 return;
210
211 objio_free_io_state(state);
212} 176}
213 177
214/* 178/*
215 * I/O done common code 179 * I/O done common code
216 */ 180 */
217static void 181static void
218objlayout_iodone(struct objlayout_io_state *state) 182objlayout_iodone(struct objlayout_io_res *oir)
219{ 183{
220 dprintk("%s: state %p status\n", __func__, state); 184 if (likely(oir->status >= 0)) {
221 185 objio_free_result(oir);
222 if (likely(state->status >= 0)) {
223 objlayout_free_io_state(state);
224 } else { 186 } else {
225 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); 187 struct objlayout *objlay = oir->objlay;
226 188
227 spin_lock(&objlay->lock); 189 spin_lock(&objlay->lock);
228 objlay->delta_space_valid = OBJ_DSU_INVALID; 190 objlay->delta_space_valid = OBJ_DSU_INVALID;
229 list_add(&objlay->err_list, &state->err_list); 191 list_add(&objlay->err_list, &oir->err_list);
230 spin_unlock(&objlay->lock); 192 spin_unlock(&objlay->lock);
231 } 193 }
232} 194}
@@ -238,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state)
238 * the error for later reporting at layout-return. 200 * the error for later reporting at layout-return.
239 */ 201 */
240void 202void
241objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, 203objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
242 struct pnfs_osd_objid *pooid, int osd_error, 204 struct pnfs_osd_objid *pooid, int osd_error,
243 u64 offset, u64 length, bool is_write) 205 u64 offset, u64 length, bool is_write)
244{ 206{
245 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; 207 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index];
246 208
247 BUG_ON(index >= state->num_comps); 209 BUG_ON(index >= oir->num_comps);
248 if (osd_error) { 210 if (osd_error) {
249 ioerr->oer_component = *pooid; 211 ioerr->oer_component = *pooid;
250 ioerr->oer_comp_offset = offset; 212 ioerr->oer_comp_offset = offset;
@@ -285,21 +247,18 @@ static void _rpc_read_complete(struct work_struct *work)
285} 247}
286 248
287void 249void
288objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) 250objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
289{ 251{
290 int eof = state->eof; 252 struct nfs_read_data *rdata = oir->rpcdata;
291 struct nfs_read_data *rdata;
292 253
293 state->status = status; 254 oir->status = rdata->task.tk_status = status;
294 dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); 255 if (status >= 0)
295 rdata = state->rpcdata;
296 rdata->task.tk_status = status;
297 if (status >= 0) {
298 rdata->res.count = status; 256 rdata->res.count = status;
299 rdata->res.eof = eof; 257 objlayout_iodone(oir);
300 } 258 /* must not use oir after this point */
301 objlayout_iodone(state); 259
302 /* must not use state after this point */ 260 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
261 status, rdata->res.eof, sync);
303 262
304 if (sync) 263 if (sync)
305 pnfs_ld_read_done(rdata); 264 pnfs_ld_read_done(rdata);
@@ -317,40 +276,36 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
317{ 276{
318 loff_t offset = rdata->args.offset; 277 loff_t offset = rdata->args.offset;
319 size_t count = rdata->args.count; 278 size_t count = rdata->args.count;
320 struct objlayout_io_state *state; 279 int err;
321 ssize_t status = 0;
322 loff_t eof; 280 loff_t eof;
323 281
324 dprintk("%s: Begin inode %p offset %llu count %d\n",
325 __func__, rdata->inode, offset, (int)count);
326
327 eof = i_size_read(rdata->inode); 282 eof = i_size_read(rdata->inode);
328 if (unlikely(offset + count > eof)) { 283 if (unlikely(offset + count > eof)) {
329 if (offset >= eof) { 284 if (offset >= eof) {
330 status = 0; 285 err = 0;
331 rdata->res.count = 0; 286 rdata->res.count = 0;
332 rdata->res.eof = 1; 287 rdata->res.eof = 1;
288 /*FIXME: do we need to call pnfs_ld_read_done() */
333 goto out; 289 goto out;
334 } 290 }
335 count = eof - offset; 291 count = eof - offset;
336 } 292 }
337 293
338 state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, 294 rdata->res.eof = (offset + count) >= eof;
339 rdata->args.pages, rdata->args.pgbase, 295 _fix_verify_io_params(rdata->lseg, &rdata->args.pages,
340 offset, count, 296 &rdata->args.pgbase,
341 rdata->lseg, rdata, 297 rdata->args.offset, rdata->args.count);
342 GFP_KERNEL);
343 if (unlikely(!state)) {
344 status = -ENOMEM;
345 goto out;
346 }
347 298
348 state->eof = state->offset + state->count >= eof; 299 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
300 __func__, rdata->inode->i_ino, offset, count, rdata->res.eof);
349 301
350 status = objio_read_pagelist(state); 302 err = objio_read_pagelist(rdata);
351 out: 303 out:
352 dprintk("%s: Return status %Zd\n", __func__, status); 304 if (unlikely(err)) {
353 rdata->pnfs_error = status; 305 rdata->pnfs_error = err;
306 dprintk("%s: Returned Error %d\n", __func__, err);
307 return PNFS_NOT_ATTEMPTED;
308 }
354 return PNFS_ATTEMPTED; 309 return PNFS_ATTEMPTED;
355} 310}
356 311
@@ -371,26 +326,20 @@ static void _rpc_write_complete(struct work_struct *work)
371} 326}
372 327
373void 328void
374objlayout_write_done(struct objlayout_io_state *state, ssize_t status, 329objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
375 bool sync)
376{ 330{
377 struct nfs_write_data *wdata; 331 struct nfs_write_data *wdata = oir->rpcdata;
378 332
379 dprintk("%s: Begin\n", __func__); 333 oir->status = wdata->task.tk_status = status;
380 wdata = state->rpcdata;
381 state->status = status;
382 wdata->task.tk_status = status;
383 if (status >= 0) { 334 if (status >= 0) {
384 wdata->res.count = status; 335 wdata->res.count = status;
385 wdata->verf.committed = state->committed; 336 wdata->verf.committed = oir->committed;
386 dprintk("%s: Return status %d committed %d\n", 337 }
387 __func__, wdata->task.tk_status, 338 objlayout_iodone(oir);
388 wdata->verf.committed); 339 /* must not use oir after this point */
389 } else 340
390 dprintk("%s: Return status %d\n", 341 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
391 __func__, wdata->task.tk_status); 342 status, wdata->verf.committed, sync);
392 objlayout_iodone(state);
393 /* must not use state after this point */
394 343
395 if (sync) 344 if (sync)
396 pnfs_ld_write_done(wdata); 345 pnfs_ld_write_done(wdata);
@@ -407,30 +356,18 @@ enum pnfs_try_status
407objlayout_write_pagelist(struct nfs_write_data *wdata, 356objlayout_write_pagelist(struct nfs_write_data *wdata,
408 int how) 357 int how)
409{ 358{
410 struct objlayout_io_state *state; 359 int err;
411 ssize_t status;
412
413 dprintk("%s: Begin inode %p offset %llu count %u\n",
414 __func__, wdata->inode, wdata->args.offset, wdata->args.count);
415
416 state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
417 wdata->args.pages,
418 wdata->args.pgbase,
419 wdata->args.offset,
420 wdata->args.count,
421 wdata->lseg, wdata,
422 GFP_NOFS);
423 if (unlikely(!state)) {
424 status = -ENOMEM;
425 goto out;
426 }
427 360
428 state->sync = how & FLUSH_SYNC; 361 _fix_verify_io_params(wdata->lseg, &wdata->args.pages,
362 &wdata->args.pgbase,
363 wdata->args.offset, wdata->args.count);
429 364
430 status = objio_write_pagelist(state, how & FLUSH_STABLE); 365 err = objio_write_pagelist(wdata, how);
431 out: 366 if (unlikely(err)) {
432 dprintk("%s: Return status %Zd\n", __func__, status); 367 wdata->pnfs_error = err;
433 wdata->pnfs_error = status; 368 dprintk("%s: Returned Error %d\n", __func__, err);
369 return PNFS_NOT_ATTEMPTED;
370 }
434 return PNFS_ATTEMPTED; 371 return PNFS_ATTEMPTED;
435} 372}
436 373
@@ -537,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err,
537static void 474static void
538encode_accumulated_error(struct objlayout *objlay, __be32 *p) 475encode_accumulated_error(struct objlayout *objlay, __be32 *p)
539{ 476{
540 struct objlayout_io_state *state, *tmp; 477 struct objlayout_io_res *oir, *tmp;
541 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; 478 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
542 479
543 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { 480 list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
544 unsigned i; 481 unsigned i;
545 482
546 for (i = 0; i < state->num_comps; i++) { 483 for (i = 0; i < oir->num_comps; i++) {
547 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; 484 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
548 485
549 if (!ioerr->oer_errno) 486 if (!ioerr->oer_errno)
550 continue; 487 continue;
@@ -563,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
563 500
564 merge_ioerr(&accumulated_err, ioerr); 501 merge_ioerr(&accumulated_err, ioerr);
565 } 502 }
566 list_del(&state->err_list); 503 list_del(&oir->err_list);
567 objlayout_free_io_state(state); 504 objio_free_result(oir);
568 } 505 }
569 506
570 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); 507 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
@@ -576,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
576 const struct nfs4_layoutreturn_args *args) 513 const struct nfs4_layoutreturn_args *args)
577{ 514{
578 struct objlayout *objlay = OBJLAYOUT(pnfslay); 515 struct objlayout *objlay = OBJLAYOUT(pnfslay);
579 struct objlayout_io_state *state, *tmp; 516 struct objlayout_io_res *oir, *tmp;
580 __be32 *start; 517 __be32 *start;
581 518
582 dprintk("%s: Begin\n", __func__); 519 dprintk("%s: Begin\n", __func__);
@@ -585,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
585 522
586 spin_lock(&objlay->lock); 523 spin_lock(&objlay->lock);
587 524
588 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { 525 list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
589 __be32 *last_xdr = NULL, *p; 526 __be32 *last_xdr = NULL, *p;
590 unsigned i; 527 unsigned i;
591 int res = 0; 528 int res = 0;
592 529
593 for (i = 0; i < state->num_comps; i++) { 530 for (i = 0; i < oir->num_comps; i++) {
594 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; 531 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
595 532
596 if (!ioerr->oer_errno) 533 if (!ioerr->oer_errno)
597 continue; 534 continue;
@@ -615,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
615 } 552 }
616 553
617 last_xdr = p; 554 last_xdr = p;
618 pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); 555 pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]);
619 } 556 }
620 557
621 /* TODO: use xdr_write_pages */ 558 /* TODO: use xdr_write_pages */
@@ -631,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
631 encode_accumulated_error(objlay, last_xdr); 568 encode_accumulated_error(objlay, last_xdr);
632 goto loop_done; 569 goto loop_done;
633 } 570 }
634 list_del(&state->err_list); 571 list_del(&oir->err_list);
635 objlayout_free_io_state(state); 572 objio_free_result(oir);
636 } 573 }
637loop_done: 574loop_done:
638 spin_unlock(&objlay->lock); 575 spin_unlock(&objlay->lock);
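Taken together, the objlayout_iodone() and layoutreturn/encode_accumulated_error() changes above implement a park-and-drain scheme: a failed result is queued on objlay->err_list under objlay->lock and released only once its component errors have been encoded. A condensed sketch of the drain side, reusing the names from these hunks (XDR encoding elided):

#include <linux/list.h>
#include <linux/spinlock.h>

/* Sketch: walk and empty the per-layout error list at layoutreturn time.
 * Each parked result is unlinked and handed back to the raid engine once
 * its ioerrs[] entries have been dealt with. */
static void drain_err_list(struct objlayout *objlay)
{
	struct objlayout_io_res *oir, *tmp;

	spin_lock(&objlay->lock);
	list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
		/* encode oir->ioerrs[0 .. oir->num_comps) here */
		list_del(&oir->err_list);
		objio_free_result(oir);
	}
	spin_unlock(&objlay->lock);
}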
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index a8244c8e042d..8ec34727ed21 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -74,19 +74,11 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo)
74 * per-I/O operation state 74 * per-I/O operation state
75 * embedded in objects provider io_state data structure 75 * embedded in objects provider io_state data structure
76 */ 76 */
77struct objlayout_io_state { 77struct objlayout_io_res {
78 struct pnfs_layout_segment *lseg; 78 struct objlayout *objlay;
79
80 struct page **pages;
81 unsigned pgbase;
82 unsigned nr_pages;
83 unsigned long count;
84 loff_t offset;
85 bool sync;
86 79
87 void *rpcdata; 80 void *rpcdata;
88 int status; /* res */ 81 int status; /* res */
89 int eof; /* res */
90 int committed; /* res */ 82 int committed; /* res */
91 83
92 /* Error reporting (layout_return) */ 84 /* Error reporting (layout_return) */
@@ -100,6 +92,18 @@ struct objlayout_io_state {
100 struct pnfs_osd_ioerr *ioerrs; 92 struct pnfs_osd_ioerr *ioerrs;
101}; 93};
102 94
95static inline
96void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps,
97 struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
98 struct pnfs_layout_hdr *pnfs_layout_type)
99{
100 oir->objlay = OBJLAYOUT(pnfs_layout_type);
101 oir->rpcdata = rpcdata;
102 INIT_LIST_HEAD(&oir->err_list);
103 oir->num_comps = num_comps;
104 oir->ioerrs = ioerrs;
105}
106
103/* 107/*
104 * Raid engine I/O API 108 * Raid engine I/O API
105 */ 109 */
@@ -110,28 +114,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
110 gfp_t gfp_flags); 114 gfp_t gfp_flags);
111extern void objio_free_lseg(struct pnfs_layout_segment *lseg); 115extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
112 116
 113extern int objio_alloc_io_state( 117/* objio_free_result will free these @oir structs received from
114 struct pnfs_layout_segment *lseg, 118 * objlayout_{read,write}_done
115 struct objlayout_io_state **outp, 119 */
116 gfp_t gfp_flags); 120extern void objio_free_result(struct objlayout_io_res *oir);
117extern void objio_free_io_state(struct objlayout_io_state *state);
118 121
119extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); 122extern int objio_read_pagelist(struct nfs_read_data *rdata);
120extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, 123extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);
121 bool stable);
122 124
123/* 125/*
124 * callback API 126 * callback API
125 */ 127 */
126extern void objlayout_io_set_result(struct objlayout_io_state *state, 128extern void objlayout_io_set_result(struct objlayout_io_res *oir,
127 unsigned index, struct pnfs_osd_objid *pooid, 129 unsigned index, struct pnfs_osd_objid *pooid,
128 int osd_error, u64 offset, u64 length, bool is_write); 130 int osd_error, u64 offset, u64 length, bool is_write);
129 131
130static inline void 132static inline void
131objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) 133objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
132{ 134{
133 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
134
135 /* If one of the I/Os errored out and the delta_space_used was 135 /* If one of the I/Os errored out and the delta_space_used was
136 * invalid we render the complete report as invalid. Protocol mandate 136 * invalid we render the complete report as invalid. Protocol mandate
137 * the DSU be accurate or not reported. 137 * the DSU be accurate or not reported.
@@ -144,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
144 spin_unlock(&objlay->lock); 144 spin_unlock(&objlay->lock);
145} 145}
146 146
147extern void objlayout_read_done(struct objlayout_io_state *state, 147extern void objlayout_read_done(struct objlayout_io_res *oir,
148 ssize_t status, bool sync); 148 ssize_t status, bool sync);
149extern void objlayout_write_done(struct objlayout_io_state *state, 149extern void objlayout_write_done(struct objlayout_io_res *oir,
150 ssize_t status, bool sync); 150 ssize_t status, bool sync);
151 151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, 152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
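The reworked header implies a calling convention for the raid engine: embed a struct objlayout_io_res in the engine's own per-I/O object, initialize it once with objlayout_init_ioerrs(), record per-component failures with objlayout_io_set_result(), and complete through objlayout_read_done()/objlayout_write_done(). A rough sketch under those assumptions, with a hypothetical single-component engine type my_io and the error path trimmed:

/* Hypothetical engine-side object wrapping the generic result. */
struct my_io {
	struct objlayout_io_res	oir;
	struct pnfs_osd_ioerr	ioerrs[1];	/* one component in this toy layout */
};

static void my_engine_setup(struct my_io *io, struct nfs_read_data *rdata)
{
	objlayout_init_ioerrs(&io->oir, 1, io->ioerrs, rdata,
			      NFS_I(rdata->inode)->layout);
}

static void my_engine_read_done(struct my_io *io, ssize_t status, bool sync)
{
	/* on failure the engine would first call objlayout_io_set_result()
	 * for each failing component, passing that component's
	 * pnfs_osd_objid and byte range */
	objlayout_read_done(&io->oir, status, sync);
	/* io must not be touched past this point: the generic layer may
	 * already have called objio_free_result() on it */
}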
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index b60970cc7f1f..5668f7c54c41 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -18,6 +18,7 @@
18#include <linux/nfs_page.h> 18#include <linux/nfs_page.h>
19#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21#include <linux/export.h>
21 22
22#include "internal.h" 23#include "internal.h"
23#include "pnfs.h" 24#include "pnfs.h"
@@ -41,7 +42,7 @@ nfs_page_free(struct nfs_page *p)
41 42
42/** 43/**
43 * nfs_create_request - Create an NFS read/write request. 44 * nfs_create_request - Create an NFS read/write request.
44 * @file: file descriptor to use 45 * @ctx: open context to use
45 * @inode: inode to which the request is attached 46 * @inode: inode to which the request is attached
46 * @page: page to write 47 * @page: page to write
47 * @offset: starting offset within the page for the write 48 * @offset: starting offset within the page for the write
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ee73d9a4f700..baf73536bc04 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -29,6 +29,7 @@
29 29
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include <linux/nfs_page.h> 31#include <linux/nfs_page.h>
32#include <linux/module.h>
32#include "internal.h" 33#include "internal.h"
33#include "pnfs.h" 34#include "pnfs.h"
34#include "iostat.h" 35#include "iostat.h"
@@ -1443,17 +1444,31 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1443 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ 1444 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
1444 data = kzalloc(sizeof(*data), GFP_NOFS); 1445 data = kzalloc(sizeof(*data), GFP_NOFS);
1445 if (!data) { 1446 if (!data) {
1446 mark_inode_dirty_sync(inode);
1447 status = -ENOMEM; 1447 status = -ENOMEM;
1448 goto out; 1448 goto out;
1449 } 1449 }
1450 1450
1451 if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
1452 goto out_free;
1453
1454 if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
1455 if (!sync) {
1456 status = -EAGAIN;
1457 goto out_free;
1458 }
1459 status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
1460 nfs_wait_bit_killable, TASK_KILLABLE);
1461 if (status)
1462 goto out_free;
1463 }
1464
1451 INIT_LIST_HEAD(&data->lseg_list); 1465 INIT_LIST_HEAD(&data->lseg_list);
1452 spin_lock(&inode->i_lock); 1466 spin_lock(&inode->i_lock);
1453 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1467 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1468 clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
1454 spin_unlock(&inode->i_lock); 1469 spin_unlock(&inode->i_lock);
1455 kfree(data); 1470 wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
1456 goto out; 1471 goto out_free;
1457 } 1472 }
1458 1473
1459 pnfs_list_write_lseg(inode, &data->lseg_list); 1474 pnfs_list_write_lseg(inode, &data->lseg_list);
@@ -1475,6 +1490,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1475 1490
1476 status = nfs4_proc_layoutcommit(data, sync); 1491 status = nfs4_proc_layoutcommit(data, sync);
1477out: 1492out:
1493 if (status)
1494 mark_inode_dirty_sync(inode);
1478 dprintk("<-- %s status %d\n", __func__, status); 1495 dprintk("<-- %s status %d\n", __func__, status);
1479 return status; 1496 return status;
1497out_free:
1498 kfree(data);
1499 goto out;
1480} 1500}
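The NFS_INO_LAYOUTCOMMITTING logic added above serializes concurrent layoutcommits: the first caller claims the bit, an asynchronous second caller backs off with -EAGAIN, and a synchronous one sleeps until the bit is released and then takes it over. The same handshake in isolation, assuming a hypothetical flags word and the nfs_wait_bit_killable callback that the hunk itself passes:

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/wait.h>

#define MY_COMMITTING_BIT	0	/* hypothetical bit in a flags word */

static int enter_committing(unsigned long *flags, bool sync)
{
	if (!test_and_set_bit(MY_COMMITTING_BIT, flags))
		return 0;			/* we now own the bit */
	if (!sync)
		return -EAGAIN;			/* never block an async caller */
	/* sleep killably until the current owner drops the bit, then take it */
	return wait_on_bit_lock(flags, MY_COMMITTING_BIT,
				nfs_wait_bit_killable, TASK_KILLABLE);
}

static void leave_committing(unsigned long *flags)
{
	clear_bit(MY_COMMITTING_BIT, flags);
	smp_mb__after_clear_bit();		/* order the clear before the wakeup */
	wake_up_bit(flags, MY_COMMITTING_BIT);
}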
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6fda5228ef56..4f359d2a26eb 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -28,6 +28,7 @@
28 * such damages. 28 * such damages.
29 */ 29 */
30 30
31#include <linux/export.h>
31#include "pnfs.h" 32#include "pnfs.h"
32 33
33#define NFSDBG_FACILITY NFSDBG_PNFS 34#define NFSDBG_FACILITY NFSDBG_PNFS
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2219c88d96b2..1dda78db6a73 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -20,6 +20,7 @@
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21#include <linux/nfs_page.h> 21#include <linux/nfs_page.h>
22#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
23#include <linux/export.h>
23 24
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25 26
@@ -1243,7 +1244,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1243{ 1244{
1244 struct nfs_writeargs *argp = &data->args; 1245 struct nfs_writeargs *argp = &data->args;
1245 struct nfs_writeres *resp = &data->res; 1246 struct nfs_writeres *resp = &data->res;
1246 struct nfs_server *server = NFS_SERVER(data->inode);
1247 int status; 1247 int status;
1248 1248
1249 dprintk("NFS: %5u nfs_writeback_done (status %d)\n", 1249 dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
@@ -1277,7 +1277,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1277 if (time_before(complain, jiffies)) { 1277 if (time_before(complain, jiffies)) {
1278 dprintk("NFS: faulty NFS server %s:" 1278 dprintk("NFS: faulty NFS server %s:"
1279 " (committed = %d) != (stable = %d)\n", 1279 " (committed = %d) != (stable = %d)\n",
1280 server->nfs_client->cl_hostname, 1280 NFS_SERVER(data->inode)->nfs_client->cl_hostname,
1281 resp->verf->committed, argp->stable); 1281 resp->verf->committed, argp->stable);
1282 complain = jiffies + 300 * HZ; 1282 complain = jiffies + 300 * HZ;
1283 } 1283 }
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index ad88f1c0a4c3..9c51aff02ae2 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,6 +36,7 @@
36 36
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
39#include <linux/export.h>
39#include "acl.h" 40#include "acl.h"
40 41
41 42
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index db34a585e112..c45a2ea4a090 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -13,6 +13,7 @@
13#include <linux/sunrpc/clnt.h> 13#include <linux/sunrpc/clnt.h>
14#include <linux/sunrpc/gss_api.h> 14#include <linux/sunrpc/gss_api.h>
15#include <linux/sunrpc/gss_krb5_enctypes.h> 15#include <linux/sunrpc/gss_krb5_enctypes.h>
16#include <linux/module.h>
16 17
17#include "idmap.h" 18#include "idmap.h"
18#include "nfsd.h" 19#include "nfsd.h"
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index dc5a1bf476b1..eda7d7e55e05 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/freezer.h> 10#include <linux/freezer.h>
11#include <linux/module.h>
11#include <linux/fs_struct.h> 12#include <linux/fs_struct.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13 14
@@ -256,6 +257,8 @@ static void nfsd_last_thread(struct svc_serv *serv)
256 nfsd_serv = NULL; 257 nfsd_serv = NULL;
257 nfsd_shutdown(); 258 nfsd_shutdown();
258 259
260 svc_rpcb_cleanup(serv);
261
259 printk(KERN_WARNING "nfsd: last server has exited, flushing export " 262 printk(KERN_WARNING "nfsd: last server has exited, flushing export "
260 "cache\n"); 263 "cache\n");
261 nfsd_export_flush(); 264 nfsd_export_flush();
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index db5ee4b4f47a..ad7d0c155de4 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -59,6 +59,7 @@
59#include <linux/idr.h> 59#include <linux/idr.h>
60#include <linux/kref.h> 60#include <linux/kref.h>
61#include <linux/net.h> 61#include <linux/net.h>
62#include <linux/export.h>
62#include <net/tcp.h> 63#include <net/tcp.h>
63 64
64#include <asm/uaccess.h> 65#include <asm/uaccess.h>
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 56f82cb912e3..0e28e242226d 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -30,6 +30,7 @@
30#include <linux/sysctl.h> 30#include <linux/sysctl.h>
31#include <linux/spinlock.h> 31#include <linux/spinlock.h>
32#include <linux/debugfs.h> 32#include <linux/debugfs.h>
33#include <linux/export.h>
33 34
34#include "cluster/heartbeat.h" 35#include "cluster/heartbeat.h"
35#include "cluster/nodemanager.h" 36#include "cluster/nodemanager.h"
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cd99bf557650..b0f450a2bb7c 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
12#include <linux/user.h> 12#include <linux/user.h>
13#include <linux/elf.h> 13#include <linux/elf.h>
14#include <linux/elfcore.h> 14#include <linux/elfcore.h>
15#include <linux/export.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/highmem.h> 17#include <linux/highmem.h>
17#include <linux/bootmem.h> 18#include <linux/bootmem.h>
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index aae0edb95c6c..35f4b0ecdeb3 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
286 /* caller already holds s_umount */ 286 /* caller already holds s_umount */
287 if (sb->s_flags & MS_RDONLY) 287 if (sb->s_flags & MS_RDONLY)
288 return -EROFS; 288 return -EROFS;
289 writeback_inodes_sb(sb); 289 writeback_inodes_sb(sb, WB_REASON_SYNC);
290 return 0; 290 return 0;
291 default: 291 default:
292 return -EINVAL; 292 return -EINVAL;
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 048b59d5b2f0..c70111ebefd4 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -78,6 +78,28 @@ config SQUASHFS_XZ
78 78
79 If unsure, say N. 79 If unsure, say N.
80 80
81config SQUASHFS_4K_DEVBLK_SIZE
82 bool "Use 4K device block size?"
83 depends on SQUASHFS
84 help
85 By default Squashfs sets the dev block size (sb_min_blocksize)
86 to 1K or the smallest block size supported by the block device
87 (if larger). This, because blocks are packed together and
88 unaligned in Squashfs, should reduce latency.
89
90 This, however, gives poor performance on MTD NAND devices where
91 the optimal I/O size is 4K (even though the devices can support
92 smaller block sizes).
93
94 Using a 4K device block size may also improve overall I/O
95 performance for some file access patterns (e.g. sequential
96 accesses of files in filesystem order) on all media.
97
98 Setting this option will force Squashfs to use a 4K device block
99 size by default.
100
101 If unsure, say N.
102
81config SQUASHFS_EMBEDDED 103config SQUASHFS_EMBEDDED
82 bool "Additional option for memory-constrained systems" 104 bool "Additional option for memory-constrained systems"
83 depends on SQUASHFS 105 depends on SQUASHFS
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index b4a4e539a08c..e8e14645de9a 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -36,6 +36,13 @@
36#define SQUASHFS_FILE_SIZE 131072 36#define SQUASHFS_FILE_SIZE 131072
37#define SQUASHFS_FILE_LOG 17 37#define SQUASHFS_FILE_LOG 17
38 38
39/* default size of block device I/O */
40#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE
41#define SQUASHFS_DEVBLK_SIZE 4096
42#else
43#define SQUASHFS_DEVBLK_SIZE 1024
44#endif
45
39#define SQUASHFS_FILE_MAX_SIZE 1048576 46#define SQUASHFS_FILE_MAX_SIZE 1048576
40#define SQUASHFS_FILE_MAX_LOG 20 47#define SQUASHFS_FILE_MAX_LOG 20
41 48
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 7438850c62d0..2da1715452ac 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -95,7 +95,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
95 } 95 }
96 msblk = sb->s_fs_info; 96 msblk = sb->s_fs_info;
97 97
98 msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); 98 msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
99 msblk->devblksize_log2 = ffz(~msblk->devblksize); 99 msblk->devblksize_log2 = ffz(~msblk->devblksize);
100 100
101 mutex_init(&msblk->read_data_mutex); 101 mutex_init(&msblk->read_data_mutex);
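sb_min_blocksize() sets the device block size to at least the requested value (more if the underlying device cannot go that small) and returns the size actually chosen, so the unchanged ffz(~msblk->devblksize) line still recovers log2 of whatever power-of-two size ends up in effect, 1024 or 4096. A small portable illustration of that identity (not kernel code, values hypothetical):

/* For a power-of-two x, ffz(~x) is the index of the lowest zero bit of ~x,
 * i.e. log2(x): ffz(~1024) == 10, ffz(~4096) == 12.  Portable equivalent: */
static unsigned int blksize_log2(unsigned int blksize)
{
	unsigned int log2 = 0;

	while ((1u << log2) < blksize)
		log2++;
	return log2;		/* 1024 -> 10, 4096 -> 12 */
}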
diff --git a/fs/statfs.c b/fs/statfs.c
index 8244924dec55..9cf04a118965 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -76,7 +76,7 @@ EXPORT_SYMBOL(vfs_statfs);
76int user_statfs(const char __user *pathname, struct kstatfs *st) 76int user_statfs(const char __user *pathname, struct kstatfs *st)
77{ 77{
78 struct path path; 78 struct path path;
79 int error = user_path(pathname, &path); 79 int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
80 if (!error) { 80 if (!error) {
81 error = vfs_statfs(&path, st); 81 error = vfs_statfs(&path, st);
82 path_put(&path); 82 path_put(&path);
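For reference, user_path(name, path) is at this point just user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path), so the functional change here is the added LOOKUP_AUTOMOUNT: statfs() on an automount point now triggers the automount instead of reporting the covering directory. A sketch of the resulting lookup-then-statfs shape (hypothetical helper name, mirrors the hunk):

#include <linux/fcntl.h>
#include <linux/namei.h>
#include <linux/statfs.h>

static int statfs_follow_automount(const char __user *pathname,
				   struct kstatfs *st)
{
	struct path path;
	int error = user_path_at(AT_FDCWD, pathname,
				 LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT, &path);

	if (error)
		return error;
	error = vfs_statfs(&path, st);	/* stat the filesystem backing @path */
	path_put(&path);
	return error;
}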
diff --git a/fs/sync.c b/fs/sync.c
index c98a7477edfd..101b8ef901d7 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
43 if (wait) 43 if (wait)
44 sync_inodes_sb(sb); 44 sync_inodes_sb(sb);
45 else 45 else
46 writeback_inodes_sb(sb); 46 writeback_inodes_sb(sb, WB_REASON_SYNC);
47 47
48 if (sb->s_op->sync_fs) 48 if (sb->s_op->sync_fs)
49 sb->s_op->sync_fs(sb, wait); 49 sb->s_op->sync_fs(sb, wait);
@@ -98,7 +98,7 @@ static void sync_filesystems(int wait)
98 */ 98 */
99SYSCALL_DEFINE0(sync) 99SYSCALL_DEFINE0(sync)
100{ 100{
101 wakeup_flusher_threads(0); 101 wakeup_flusher_threads(0, WB_REASON_SYNC);
102 sync_filesystems(0); 102 sync_filesystems(0);
103 sync_filesystems(1); 103 sync_filesystems(1);
104 if (unlikely(laptop_mode)) 104 if (unlikely(laptop_mode))
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 315de66e52b2..bc4f94b28706 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -63,7 +63,7 @@
63static void shrink_liability(struct ubifs_info *c, int nr_to_write) 63static void shrink_liability(struct ubifs_info *c, int nr_to_write)
64{ 64{
65 down_read(&c->vfs_sb->s_umount); 65 down_read(&c->vfs_sb->s_umount);
66 writeback_inodes_sb(c->vfs_sb); 66 writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE);
67 up_read(&c->vfs_sb->s_umount); 67 up_read(&c->vfs_sb->s_umount);
68} 68}
69 69
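These writeback call sites all gain an enum wb_reason argument so queued writeback work can be attributed, primarily in the writeback tracepoints; the callers' locking obligations do not change. A minimal sketch of a filesystem-side caller under the new signature, following the UBIFS pattern above (helper name hypothetical):

#include <linux/fs.h>
#include <linux/writeback.h>

/* Nudge background writeback of @sb's dirty inodes, tagged as work done
 * to free up filesystem space. */
static void nudge_writeback(struct super_block *sb)
{
	down_read(&sb->s_umount);	/* writeback_inodes_sb() expects s_umount held */
	writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
	up_read(&sb->s_umount);
}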
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index eef109a1a927..b09ba2dd8b62 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -870,6 +870,22 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
870 spin_unlock(&dbg_lock); 870 spin_unlock(&dbg_lock);
871} 871}
872 872
873void dbg_dump_sleb(const struct ubifs_info *c,
874 const struct ubifs_scan_leb *sleb, int offs)
875{
876 struct ubifs_scan_node *snod;
877
878 printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n",
879 current->pid, sleb->lnum, offs);
880
881 list_for_each_entry(snod, &sleb->nodes, list) {
882 cond_resched();
883 printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum,
884 snod->offs, snod->len);
885 dbg_dump_node(c, snod->node);
886 }
887}
888
873void dbg_dump_leb(const struct ubifs_info *c, int lnum) 889void dbg_dump_leb(const struct ubifs_info *c, int lnum)
874{ 890{
875 struct ubifs_scan_leb *sleb; 891 struct ubifs_scan_leb *sleb;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index feb361e252ac..8d9c46810189 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -269,6 +269,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
269void dbg_dump_lprops(struct ubifs_info *c); 269void dbg_dump_lprops(struct ubifs_info *c);
270void dbg_dump_lpt_info(struct ubifs_info *c); 270void dbg_dump_lpt_info(struct ubifs_info *c);
271void dbg_dump_leb(const struct ubifs_info *c, int lnum); 271void dbg_dump_leb(const struct ubifs_info *c, int lnum);
272void dbg_dump_sleb(const struct ubifs_info *c,
273 const struct ubifs_scan_leb *sleb, int offs);
272void dbg_dump_znode(const struct ubifs_info *c, 274void dbg_dump_znode(const struct ubifs_info *c,
273 const struct ubifs_znode *znode); 275 const struct ubifs_znode *znode);
274void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); 276void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat);
@@ -387,6 +389,9 @@ static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; }
387static inline void dbg_dump_leb(const struct ubifs_info *c, 389static inline void dbg_dump_leb(const struct ubifs_info *c,
388 int lnum) { return; } 390 int lnum) { return; }
389static inline void 391static inline void
392dbg_dump_sleb(const struct ubifs_info *c,
393 const struct ubifs_scan_leb *sleb, int offs) { return; }
394static inline void
390dbg_dump_znode(const struct ubifs_info *c, 395dbg_dump_znode(const struct ubifs_info *c,
391 const struct ubifs_znode *znode) { return; } 396 const struct ubifs_znode *znode) { return; }
392static inline void dbg_dump_heap(struct ubifs_info *c, 397static inline void dbg_dump_heap(struct ubifs_info *c,
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index af02790d9328..ee4f43f4bb99 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -983,7 +983,7 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf)
983} 983}
984 984
985/** 985/**
986 * clean_an_unclean_leb - read and write a LEB to remove corruption. 986 * clean_an_unclean_leb - read and write a LEB to remove corruption.
987 * @c: UBIFS file-system description object 987 * @c: UBIFS file-system description object
988 * @ucleb: unclean LEB information 988 * @ucleb: unclean LEB information
989 * @sbuf: LEB-sized buffer to use 989 * @sbuf: LEB-sized buffer to use
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 93d938ad3d2a..6094c5a5d7a8 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c)
247 mst->total_dirty = cpu_to_le64(tmp64); 247 mst->total_dirty = cpu_to_le64(tmp64);
248 248
249 /* The indexing LEB does not contribute to dark space */ 249 /* The indexing LEB does not contribute to dark space */
250 tmp64 = (c->main_lebs - 1) * c->dark_wm; 250 tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm);
251 mst->total_dark = cpu_to_le64(tmp64); 251 mst->total_dark = cpu_to_le64(tmp64);
252 252
253 mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); 253 mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
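The cast added in sb.c matters on 32-bit configurations: main_lebs and dark_wm are plain ints, so without it the multiplication happens in 32-bit arithmetic and can overflow before the result is widened for cpu_to_le64(). A tiny standalone demonstration of the difference, with made-up values:

#include <stdio.h>

int main(void)
{
	int main_lebs = 70000, dark_wm = 40000;	/* hypothetical large values */

	/* int * int: evaluated in 32 bits, overflows (undefined behaviour)
	 * before the widening assignment */
	long long wrong = (main_lebs - 1) * dark_wm;
	/* force a 64-bit multiply, as the hunk above now does */
	long long right = (long long)(main_lebs - 1) * dark_wm;

	printf("%lld vs %lld\n", wrong, right);
	return 0;
}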