path: root/fs
Diffstat (limited to 'fs')
-rw-r--r--  fs/bio-integrity.c | 1
-rw-r--r--  fs/bio.c | 8
-rw-r--r--  fs/block_dev.c | 15
-rw-r--r--  fs/btrfs/Makefile | 3
-rw-r--r--  fs/btrfs/acl.c | 17
-rw-r--r--  fs/btrfs/backref.c | 776
-rw-r--r--  fs/btrfs/backref.h | 62
-rw-r--r--  fs/btrfs/btrfs_inode.h | 21
-rw-r--r--  fs/btrfs/compression.c | 3
-rw-r--r--  fs/btrfs/ctree.c | 27
-rw-r--r--  fs/btrfs/ctree.h | 203
-rw-r--r--  fs/btrfs/delayed-inode.c | 108
-rw-r--r--  fs/btrfs/disk-io.c | 630
-rw-r--r--  fs/btrfs/disk-io.h | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 992
-rw-r--r--  fs/btrfs/extent_io.c | 619
-rw-r--r--  fs/btrfs/extent_io.h | 23
-rw-r--r--  fs/btrfs/file-item.c | 17
-rw-r--r--  fs/btrfs/file.c | 25
-rw-r--r--  fs/btrfs/free-space-cache.c | 992
-rw-r--r--  fs/btrfs/inode-map.c | 34
-rw-r--r--  fs/btrfs/inode.c | 545
-rw-r--r--  fs/btrfs/ioctl.c | 236
-rw-r--r--  fs/btrfs/ioctl.h | 29
-rw-r--r--  fs/btrfs/print-tree.c | 8
-rw-r--r--  fs/btrfs/reada.c | 951
-rw-r--r--  fs/btrfs/relocation.c | 26
-rw-r--r--  fs/btrfs/scrub.c | 655
-rw-r--r--  fs/btrfs/super.c | 309
-rw-r--r--  fs/btrfs/transaction.c | 156
-rw-r--r--  fs/btrfs/tree-log.c | 19
-rw-r--r--  fs/btrfs/volumes.c | 212
-rw-r--r--  fs/btrfs/volumes.h | 24
-rw-r--r--  fs/btrfs/xattr.c | 11
-rw-r--r--  fs/buffer.c | 2
-rw-r--r--  fs/ceph/caps.c | 8
-rw-r--r--  fs/ceph/dir.c | 87
-rw-r--r--  fs/ceph/inode.c | 17
-rw-r--r--  fs/ceph/mds_client.c | 10
-rw-r--r--  fs/ceph/super.c | 10
-rw-r--r--  fs/ceph/super.h | 23
-rw-r--r--  fs/cifs/connect.c | 1
-rw-r--r--  fs/cifs/file.c | 105
-rw-r--r--  fs/dcache.c | 17
-rw-r--r--  fs/exofs/ore.c | 1
-rw-r--r--  fs/exofs/super.c | 1
-rw-r--r--  fs/ext4/balloc.c | 2
-rw-r--r--  fs/ext4/inode.c | 3
-rw-r--r--  fs/ext4/super.c | 6
-rw-r--r--  fs/fs-writeback.c | 84
-rw-r--r--  fs/fuse/cuse.c | 1
-rw-r--r--  fs/gfs2/ops_fstype.c | 1
-rw-r--r--  fs/hfs/trans.c | 2
-rw-r--r--  fs/ioprio.c | 1
-rw-r--r--  fs/jffs2/compr.c | 128
-rw-r--r--  fs/jffs2/compr.h | 2
-rw-r--r--  fs/jffs2/fs.c | 2
-rw-r--r--  fs/jffs2/jffs2_fs_sb.h | 6
-rw-r--r--  fs/jffs2/os-linux.h | 2
-rw-r--r--  fs/jffs2/scan.c | 4
-rw-r--r--  fs/jffs2/super.c | 119
-rw-r--r--  fs/jffs2/wbuf.c | 9
-rw-r--r--  fs/jfs/jfs_logmgr.c | 1
-rw-r--r--  fs/logfs/super.c | 1
-rw-r--r--  fs/minix/bitmap.c | 55
-rw-r--r--  fs/minix/inode.c | 25
-rw-r--r--  fs/minix/minix.h | 11
-rw-r--r--  fs/namei.c | 16
-rw-r--r--  fs/namespace.c | 30
-rw-r--r--  fs/nfs/nfs4filelayout.c | 1
-rw-r--r--  fs/nfs/pagelist.c | 1
-rw-r--r--  fs/nfs/pnfs.c | 1
-rw-r--r--  fs/nfs/pnfs_dev.c | 1
-rw-r--r--  fs/nfs/super.c | 37
-rw-r--r--  fs/nfs/write.c | 1
-rw-r--r--  fs/nfsd/nfs4acl.c | 1
-rw-r--r--  fs/nfsd/nfsctl.c | 1
-rw-r--r--  fs/nfsd/nfssvc.c | 1
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 1
-rw-r--r--  fs/proc/base.c | 146
-rw-r--r--  fs/proc/vmcore.c | 1
-rw-r--r--  fs/quota/quota.c | 2
-rw-r--r--  fs/squashfs/Kconfig | 22
-rw-r--r--  fs/squashfs/squashfs_fs.h | 7
-rw-r--r--  fs/squashfs/super.c | 2
-rw-r--r--  fs/statfs.c | 2
-rw-r--r--  fs/sync.c | 4
-rw-r--r--  fs/ubifs/budget.c | 2
-rw-r--r--  fs/ubifs/debug.c | 16
-rw-r--r--  fs/ubifs/debug.h | 5
-rw-r--r--  fs/ubifs/recovery.c | 2
-rw-r--r--  fs/ubifs/sb.c | 2
-rw-r--r--  fs/xfs/xfs_aops.c | 2
-rw-r--r--  fs/xfs/xfs_buf_item.c | 2
-rw-r--r--  fs/xfs/xfs_dquot_item.c | 6
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 4
-rw-r--r--  fs/xfs/xfs_inode_item.c | 2
-rw-r--r--  fs/xfs/xfs_log.c | 2
-rw-r--r--  fs/xfs/xfs_log.h | 2
-rw-r--r--  fs/xfs/xfs_qm.c | 3
-rw-r--r--  fs/xfs/xfs_trans.h | 6
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 14
103 files changed, 6657 insertions(+), 2200 deletions(-)
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 9c5e6b2cd11a..c2183f3917cd 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -22,6 +22,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/mempool.h>
+#include <linux/export.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
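
The <linux/export.h> include added above fits the module.h header split in progress at the time: files that only need EXPORT_SYMBOL()/THIS_MODULE include export.h directly rather than relying on an implicit module.h. A minimal, hedged sketch of what the header provides (the helper below is hypothetical, not from this commit):

#include <linux/export.h>	/* EXPORT_SYMBOL(), EXPORT_SYMBOL_GPL() */

int my_helper(void)		/* hypothetical example function */
{
	return 0;
}
EXPORT_SYMBOL(my_helper);	/* needs export.h (or module.h) to build */
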
diff --git a/fs/bio.c b/fs/bio.c
index 9bfade8a609b..b1fe82cf88cf 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -255,7 +255,6 @@ void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
-	bio->bi_comp_cpu = -1;
 	atomic_set(&bio->bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
@@ -338,7 +337,7 @@ static void bio_fs_destructor(struct bio *bio)
  * RETURNS:
  *	Pointer to new bio on success, NULL on failure.
  */
-struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
 	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
 
@@ -366,7 +365,7 @@ static void bio_kmalloc_destructor(struct bio *bio)
  * %__GFP_WAIT, the allocation is guaranteed to succeed.
 *
 **/
-struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
 	struct bio *bio;
 
@@ -697,7 +696,8 @@ static void bio_free_map_data(struct bio_map_data *bmd)
 	kfree(bmd);
 }
 
-static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
+static struct bio_map_data *bio_alloc_map_data(int nr_segs,
+					       unsigned int iov_count,
 					       gfp_t gfp_mask)
 {
 	struct bio_map_data *bmd;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 95f786ec7f08..b07f1da1de4e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -971,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
 
 	if (!bdev->bd_disk)
 		return;
-	if (disk_partitionable(bdev->bd_disk))
+	if (disk_part_scan_enabled(bdev->bd_disk))
 		bdev->bd_invalidated = 1;
 }
 
@@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
+	struct module *owner;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk)
 		goto out;
+	owner = disk->fops->owner;
 
 	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			bdev->bd_disk = NULL;
 			mutex_unlock(&bdev->bd_mutex);
 			disk_unblock_events(disk);
-			module_put(disk->fops->owner);
 			put_disk(disk);
+			module_put(owner);
 			goto restart;
 		}
 	}
@@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			goto out_unlock_bdev;
 		}
 		/* only one opener holds refs to the module and disk */
-		module_put(disk->fops->owner);
 		put_disk(disk);
+		module_put(owner);
 	}
 	bdev->bd_openers++;
 	if (for_part)
@@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 	disk_unblock_events(disk);
-	module_put(disk->fops->owner);
 	put_disk(disk);
+	module_put(owner);
  out:
 	bdput(bdev);
 
@@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
 
-		put_disk(disk);
-		module_put(owner);
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
 		if (bdev != bdev->bd_contains)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
+
+		put_disk(disk);
+		module_put(owner);
 	}
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
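
All of the __blkdev_get()/__blkdev_put() hunks above follow one pattern: the disk's module owner is saved into a local variable while the gendisk reference is still held, and module_put() is moved after put_disk(), so the driver module cannot be unloaded while put_disk() may still touch driver-owned data. A minimal sketch of the resulting ordering (illustrative only, not code from this commit; error handling is omitted):

	struct gendisk *disk;
	struct module *owner;

	disk = get_gendisk(bdev->bd_dev, &partno);
	if (!disk)
		goto out;
	owner = disk->fops->owner;	/* remember the owner up front */

	/* ... open and use the disk ... */

	put_disk(disk);			/* may drop the last gendisk reference */
	module_put(owner);		/* only now release the driver module */
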
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 40e6ac08c21f..c0ddfd29c5e5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
-	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+	   reada.o backref.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a11..89b156d85d63 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		if (!value)
 			return ERR_PTR(-ENOMEM);
 		size = __btrfs_getxattr(inode, name, value, size);
-		if (size > 0) {
-			acl = posix_acl_from_xattr(value, size);
-			if (IS_ERR(acl)) {
-				kfree(value);
-				return acl;
-			}
-			set_cached_acl(inode, type, acl);
-		}
-		kfree(value);
+	}
+	if (size > 0) {
+		acl = posix_acl_from_xattr(value, size);
 	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
 		/* FIXME, who returns -ENOENT?  I think nobody */
 		acl = NULL;
-		set_cached_acl(inode, type, acl);
 	} else {
 		acl = ERR_PTR(-EIO);
 	}
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
 	return acl;
 }
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 000000000000..22c64fff1bd5
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,776 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "backref.h"
22
23struct __data_ref {
24 struct list_head list;
25 u64 inum;
26 u64 root;
27 u64 extent_data_item_offset;
28};
29
30struct __shared_ref {
31 struct list_head list;
32 u64 disk_byte;
33};
34
35static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path,
37 struct btrfs_key *found_key)
38{
39 int ret;
40 struct btrfs_key key;
41 struct extent_buffer *eb;
42
43 key.type = key_type;
44 key.objectid = inum;
45 key.offset = ioff;
46
47 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
48 if (ret < 0)
49 return ret;
50
51 eb = path->nodes[0];
52 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
53 ret = btrfs_next_leaf(fs_root, path);
54 if (ret)
55 return ret;
56 eb = path->nodes[0];
57 }
58
59 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
60 if (found_key->type != key.type || found_key->objectid != key.objectid)
61 return 1;
62
63 return 0;
64}
65
66/*
67 * this makes the path point to (inum INODE_ITEM ioff)
68 */
69int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
70 struct btrfs_path *path)
71{
72 struct btrfs_key key;
73 return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
74 &key);
75}
76
77static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
78 struct btrfs_path *path,
79 struct btrfs_key *found_key)
80{
81 return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
82 found_key);
83}
84
85/*
86 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
87 * of the path are separated by '/' and the path is guaranteed to be
88 * 0-terminated. the path is only given within the current file system.
89 * Therefore, it never starts with a '/'. the caller is responsible to provide
90 * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
91 * the start point of the resulting string is returned. this pointer is within
92 * dest, normally.
93 * in case the path buffer would overflow, the pointer is decremented further
94 * as if output was written to the buffer, though no more output is actually
95 * generated. that way, the caller can determine how much space would be
96 * required for the path to fit into the buffer. in that case, the returned
97 * value will be smaller than dest. callers must check this!
98 */
99static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
100 struct btrfs_inode_ref *iref,
101 struct extent_buffer *eb_in, u64 parent,
102 char *dest, u32 size)
103{
104 u32 len;
105 int slot;
106 u64 next_inum;
107 int ret;
108 s64 bytes_left = size - 1;
109 struct extent_buffer *eb = eb_in;
110 struct btrfs_key found_key;
111
112 if (bytes_left >= 0)
113 dest[bytes_left] = '\0';
114
115 while (1) {
116 len = btrfs_inode_ref_name_len(eb, iref);
117 bytes_left -= len;
118 if (bytes_left >= 0)
119 read_extent_buffer(eb, dest + bytes_left,
120 (unsigned long)(iref + 1), len);
121 if (eb != eb_in)
122 free_extent_buffer(eb);
123 ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
124 if (ret)
125 break;
126 next_inum = found_key.offset;
127
128 /* regular exit ahead */
129 if (parent == next_inum)
130 break;
131
132 slot = path->slots[0];
133 eb = path->nodes[0];
134 /* make sure we can use eb after releasing the path */
135 if (eb != eb_in)
136 atomic_inc(&eb->refs);
137 btrfs_release_path(path);
138
139 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
140 parent = next_inum;
141 --bytes_left;
142 if (bytes_left >= 0)
143 dest[bytes_left] = '/';
144 }
145
146 btrfs_release_path(path);
147
148 if (ret)
149 return ERR_PTR(ret);
150
151 return dest + bytes_left;
152}
153
154/*
155 * this makes the path point to (logical EXTENT_ITEM *)
156 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
157 * tree blocks and <0 on error.
158 */
159int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
160 struct btrfs_path *path, struct btrfs_key *found_key)
161{
162 int ret;
163 u64 flags;
164 u32 item_size;
165 struct extent_buffer *eb;
166 struct btrfs_extent_item *ei;
167 struct btrfs_key key;
168
169 key.type = BTRFS_EXTENT_ITEM_KEY;
170 key.objectid = logical;
171 key.offset = (u64)-1;
172
173 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
174 if (ret < 0)
175 return ret;
176 ret = btrfs_previous_item(fs_info->extent_root, path,
177 0, BTRFS_EXTENT_ITEM_KEY);
178 if (ret < 0)
179 return ret;
180
181 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
182 if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
183 found_key->objectid > logical ||
184 found_key->objectid + found_key->offset <= logical)
185 return -ENOENT;
186
187 eb = path->nodes[0];
188 item_size = btrfs_item_size_nr(eb, path->slots[0]);
189 BUG_ON(item_size < sizeof(*ei));
190
191 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
192 flags = btrfs_extent_flags(eb, ei);
193
194 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
195 return BTRFS_EXTENT_FLAG_TREE_BLOCK;
196 if (flags & BTRFS_EXTENT_FLAG_DATA)
197 return BTRFS_EXTENT_FLAG_DATA;
198
199 return -EIO;
200}
201
202/*
203 * helper function to iterate extent inline refs. ptr must point to a 0 value
204 * for the first call and may be modified. it is used to track state.
205 * if more refs exist, 0 is returned and the next call to
206 * __get_extent_inline_ref must pass the modified ptr parameter to get the
207 * next ref. after the last ref was processed, 1 is returned.
208 * returns <0 on error
209 */
210static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
211 struct btrfs_extent_item *ei, u32 item_size,
212 struct btrfs_extent_inline_ref **out_eiref,
213 int *out_type)
214{
215 unsigned long end;
216 u64 flags;
217 struct btrfs_tree_block_info *info;
218
219 if (!*ptr) {
220 /* first call */
221 flags = btrfs_extent_flags(eb, ei);
222 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
223 info = (struct btrfs_tree_block_info *)(ei + 1);
224 *out_eiref =
225 (struct btrfs_extent_inline_ref *)(info + 1);
226 } else {
227 *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
228 }
229 *ptr = (unsigned long)*out_eiref;
230 if ((void *)*ptr >= (void *)ei + item_size)
231 return -ENOENT;
232 }
233
234 end = (unsigned long)ei + item_size;
235 *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
236 *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
237
238 *ptr += btrfs_extent_inline_ref_size(*out_type);
239 WARN_ON(*ptr > end);
240 if (*ptr == end)
241 return 1; /* last */
242
243 return 0;
244}
245
246/*
247 * reads the tree block backref for an extent. tree level and root are returned
248 * through out_level and out_root. ptr must point to a 0 value for the first
249 * call and may be modified (see __get_extent_inline_ref comment).
250 * returns 0 if data was provided, 1 if there was no more data to provide or
251 * <0 on error.
252 */
253int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
254 struct btrfs_extent_item *ei, u32 item_size,
255 u64 *out_root, u8 *out_level)
256{
257 int ret;
258 int type;
259 struct btrfs_tree_block_info *info;
260 struct btrfs_extent_inline_ref *eiref;
261
262 if (*ptr == (unsigned long)-1)
263 return 1;
264
265 while (1) {
266 ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
267 &eiref, &type);
268 if (ret < 0)
269 return ret;
270
271 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
272 type == BTRFS_SHARED_BLOCK_REF_KEY)
273 break;
274
275 if (ret == 1)
276 return 1;
277 }
278
279 /* we can treat both ref types equally here */
280 info = (struct btrfs_tree_block_info *)(ei + 1);
281 *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
282 *out_level = btrfs_tree_block_level(eb, info);
283
284 if (ret == 1)
285 *ptr = (unsigned long)-1;
286
287 return 0;
288}
289
290static int __data_list_add(struct list_head *head, u64 inum,
291 u64 extent_data_item_offset, u64 root)
292{
293 struct __data_ref *ref;
294
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{
413 u64 disk_byte;
414 struct btrfs_key key;
415 struct btrfs_file_extent_item *fi;
416 struct extent_buffer *eb;
417 int slot;
418 int nritems;
419 int ret;
420 int found = 0;
421
422 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0);
424 if (!eb)
425 return -EIO;
426
427 /*
428 * from the shared data ref, we only have the leaf but we need
429 * the key. thus, we must look into all items and see that we
430 * find one (some) with a reference to our extent item.
431 */
432 nritems = btrfs_header_nritems(eb);
433 for (slot = 0; slot < nritems; ++slot) {
434 btrfs_item_key_to_cpu(eb, &key, slot);
435 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) {
439 free_extent_buffer(eb);
440 return -EIO;
441 }
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) {
444 if (found)
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459
460 if (!found) {
461 printk(KERN_ERR "btrfs: failed to follow shared data backref "
462 "to parent %llu\n", logical);
463 WARN_ON(1);
464 ret = -EIO;
465 }
466
467 free_extent_buffer(eb);
468 return ret;
469}
470
471/*
472 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops.
476 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path,
479 u64 extent_item_objectid,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx)
482{
483 unsigned long ptr = 0;
484 int last;
485 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d;
497 struct __shared_ref *ref_s;
498
499 eb = path->nodes[0];
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
501 item_size = btrfs_item_size_nr(eb, path->slots[0]);
502
503 /* first we iterate the inline refs, ... */
504 do {
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
506 &eiref, &type);
507 if (last == -ENOENT) {
508 ret = 0;
509 break;
510 }
511 if (last < 0) {
512 ret = last;
513 break;
514 }
515
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524
525 /* ... then we proceed to in-tree references and ... */
526 while (!ret) {
527 ++path->slots[0];
528 if (path->slots[0] > btrfs_header_nritems(eb)) {
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
541 dref = btrfs_item_ptr(eb, path->slots[0],
542 struct btrfs_extent_data_ref);
543 ret = __data_list_add_eb(&data_refs, eb, dref);
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
545 ret = __shared_list_add(&shared_refs, key.offset);
546 }
547 }
548
549 btrfs_release_path(path);
550
551 /*
552 * ... only at the very end we can process the refs we found. this is
553 * because the iterator function we call is allowed to make tree lookups
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret;
582}
583
584int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
585 struct btrfs_path *path,
586 iterate_extent_inodes_t *iterate, void *ctx)
587{
588 int ret;
589 u64 offset;
590 struct btrfs_key found_key;
591
592 ret = extent_from_logical(fs_info, logical, path,
593 &found_key);
594 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
595 ret = -EINVAL;
596 if (ret < 0)
597 return ret;
598
599 offset = logical - found_key.objectid;
600 ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
601 offset, iterate, ctx);
602
603 return ret;
604}
605
606static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
607 struct btrfs_path *path,
608 iterate_irefs_t *iterate, void *ctx)
609{
610 int ret;
611 int slot;
612 u32 cur;
613 u32 len;
614 u32 name_len;
615 u64 parent = 0;
616 int found = 0;
617 struct extent_buffer *eb;
618 struct btrfs_item *item;
619 struct btrfs_inode_ref *iref;
620 struct btrfs_key found_key;
621
622 while (1) {
623 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
624 &found_key);
625 if (ret < 0)
626 break;
627 if (ret) {
628 ret = found ? 0 : -ENOENT;
629 break;
630 }
631 ++found;
632
633 parent = found_key.offset;
634 slot = path->slots[0];
635 eb = path->nodes[0];
636 /* make sure we can use eb after releasing the path */
637 atomic_inc(&eb->refs);
638 btrfs_release_path(path);
639
640 item = btrfs_item_nr(eb, slot);
641 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
642
643 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
644 name_len = btrfs_inode_ref_name_len(eb, iref);
645 /* path must be released before calling iterate()! */
646 ret = iterate(parent, iref, eb, ctx);
647 if (ret) {
648 free_extent_buffer(eb);
649 break;
650 }
651 len = sizeof(*iref) + name_len;
652 iref = (struct btrfs_inode_ref *)((char *)iref + len);
653 }
654 free_extent_buffer(eb);
655 }
656
657 btrfs_release_path(path);
658
659 return ret;
660}
661
662/*
663 * returns 0 if the path could be dumped (probably truncated)
664 * returns <0 in case of an error
665 */
666static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
667 struct extent_buffer *eb, void *ctx)
668{
669 struct inode_fs_paths *ipath = ctx;
670 char *fspath;
671 char *fspath_min;
672 int i = ipath->fspath->elem_cnt;
673 const int s_ptr = sizeof(char *);
674 u32 bytes_left;
675
676 bytes_left = ipath->fspath->bytes_left > s_ptr ?
677 ipath->fspath->bytes_left - s_ptr : 0;
678
679 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
680 fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
681 inum, fspath_min, bytes_left);
682 if (IS_ERR(fspath))
683 return PTR_ERR(fspath);
684
685 if (fspath > fspath_min) {
686 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
687 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else {
690 ++ipath->fspath->elem_missed;
691 ipath->fspath->bytes_missing += fspath_min - fspath;
692 ipath->fspath->bytes_left = 0;
693 }
694
695 return 0;
696}
697
698/*
699 * this dumps all file system paths to the inode into the ipath struct, provided
700 * is has been created large enough. each path is zero-terminated and accessed
701 * from ipath->fspath->val[i].
702 * when it returns, there are ipath->fspath->elem_cnt number of paths available
703 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
704 * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
705 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
706 * have been needed to return all paths.
707 */
708int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
709{
710 return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
711 inode_to_path, ipath);
712}
713
714/*
715 * allocates space to return multiple file system paths for an inode.
716 * total_bytes to allocate are passed, note that space usable for actual path
717 * information will be total_bytes - sizeof(struct inode_fs_paths).
718 * the returned pointer must be freed with free_ipath() in the end.
719 */
720struct btrfs_data_container *init_data_container(u32 total_bytes)
721{
722 struct btrfs_data_container *data;
723 size_t alloc_bytes;
724
725 alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
726 data = kmalloc(alloc_bytes, GFP_NOFS);
727 if (!data)
728 return ERR_PTR(-ENOMEM);
729
730 if (total_bytes >= sizeof(*data)) {
731 data->bytes_left = total_bytes - sizeof(*data);
732 data->bytes_missing = 0;
733 } else {
734 data->bytes_missing = sizeof(*data) - total_bytes;
735 data->bytes_left = 0;
736 }
737
738 data->elem_cnt = 0;
739 data->elem_missed = 0;
740
741 return data;
742}
743
744/*
745 * allocates space to return multiple file system paths for an inode.
746 * total_bytes to allocate are passed, note that space usable for actual path
747 * information will be total_bytes - sizeof(struct inode_fs_paths).
748 * the returned pointer must be freed with free_ipath() in the end.
749 */
750struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
751 struct btrfs_path *path)
752{
753 struct inode_fs_paths *ifp;
754 struct btrfs_data_container *fspath;
755
756 fspath = init_data_container(total_bytes);
757 if (IS_ERR(fspath))
758 return (void *)fspath;
759
760 ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
761 if (!ifp) {
762 kfree(fspath);
763 return ERR_PTR(-ENOMEM);
764 }
765
766 ifp->btrfs_path = path;
767 ifp->fspath = fspath;
768 ifp->fs_root = fs_root;
769
770 return ifp;
771}
772
773void free_ipath(struct inode_fs_paths *ipath)
774{
775 kfree(ipath);
776}
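
The comment above iref_to_path() in the new file describes an unusual contract: the name is assembled backwards into the tail of the caller's buffer, and on overflow the returned pointer keeps moving below the buffer start so the caller can compute how much space would have been needed. A hedged sketch of how a caller checks for that, mirroring what inode_to_path() does above (the buf/buf_size names are made up for illustration):

	char *p;

	p = iref_to_path(fs_root, path, iref, eb, parent, buf, buf_size);
	if (IS_ERR(p))
		return PTR_ERR(p);
	if (p < buf) {
		/* truncated: (buf - p) additional bytes would have been needed */
		return -ENAMETOOLONG;
	}
	/* p points at a 0-terminated path somewhere inside buf */
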
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 000000000000..92618837cb8f
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_BACKREF__
20#define __BTRFS_BACKREF__
21
22#include "ioctl.h"
23
24struct inode_fs_paths {
25 struct btrfs_path *btrfs_path;
26 struct btrfs_root *fs_root;
27 struct btrfs_data_container *fspath;
28};
29
30typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
31 void *ctx);
32typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
33 struct extent_buffer *eb, void *ctx);
34
35int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
36 struct btrfs_path *path);
37
38int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
39 struct btrfs_path *path, struct btrfs_key *found_key);
40
41int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
42 struct btrfs_extent_item *ei, u32 item_size,
43 u64 *out_root, u8 *out_level);
44
45int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
46 struct btrfs_path *path,
47 u64 extent_item_objectid,
48 u64 extent_offset,
49 iterate_extent_inodes_t *iterate, void *ctx);
50
51int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
52 struct btrfs_path *path,
53 iterate_extent_inodes_t *iterate, void *ctx);
54
55int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
56
57struct btrfs_data_container *init_data_container(u32 total_bytes);
58struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
59 struct btrfs_path *path);
60void free_ipath(struct inode_fs_paths *ipath);
61
62#endif
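
The typedefs in this header define the callback contract for both iterators: the callback is invoked once per reference found, and a non-zero return value stops the iteration. A hedged usage sketch for iterate_inodes_from_logical() (the callback name and surrounding code are hypothetical, not part of the commit):

static int print_inode_cb(u64 inum, u64 offset, u64 root, void *ctx)
{
	printk(KERN_DEBUG "inum %llu, offset %llu, root %llu\n",
	       (unsigned long long)inum, (unsigned long long)offset,
	       (unsigned long long)root);
	return 0;	/* returning non-zero would stop the iteration */
}

	/* ... in some caller that has fs_info and a logical disk address ... */
	struct btrfs_path *path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = iterate_inodes_from_logical(logical, fs_info, path,
					  print_inode_cb, NULL);
	btrfs_free_path(path);
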
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd6..634608d2a6d0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
 	 */
 	u64 delalloc_bytes;
 
-	/* total number of bytes that may be used for this inode for
-	 * delalloc
-	 */
-	u64 reserved_bytes;
-
 	/*
 	 * the size of the file stored in the metadata on disk. data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
 	 */
 	u64 disk_i_size;
 
-	/* flags field from the on disk inode */
-	u32 flags;
-
 	/*
 	 * if this is a directory then index_cnt is the counter for the index
 	 * number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
 	u64 last_unlink_trans;
 
 	/*
+	 * Number of bytes outstanding that are going to need csums. This is
+	 * used in ENOSPC accounting.
+	 */
+	u64 csum_bytes;
+
+	/* flags field from the on disk inode */
+	u32 flags;
+
+	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such. outstanding_extents is the number of extent
 	 * items we think we'll end up using, and reserved_extents is the number
@@ -146,14 +147,12 @@ struct btrfs_inode {
 	 * the btrfs file release call will add this inode to the
 	 * ordered operations list so that we make sure to flush out any
 	 * new data the application may have written before commit.
-	 *
-	 * yes, its silly to have a single bitflag, but we might grow more
-	 * of these.
 	 */
 	unsigned ordered_data_close:1;
 	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
 	unsigned in_defrag:1;
+	unsigned delalloc_meta_reserved:1;
 
 	/*
 	 * always compress this one file
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f1734..14f1c5a0b2d2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
 static inline int compressed_bio_size(struct btrfs_root *root,
 				      unsigned long disk_size)
 {
-	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
 	return sizeof(struct compressed_bio) +
 		((disk_size + root->sectorsize - 1) / root->sectorsize) *
 		csum_size;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8d..dede441bdeee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct extent_buffer *buf)
 {
+	/* ensure we can see the force_cow */
+	smp_rmb();
+
+	/*
+	 * We do not need to cow a block if
+	 * 1) this block is not created or changed in this transaction;
+	 * 2) this block does not belong to TREE_RELOC tree;
+	 * 3) the root is not forced COW.
+	 *
+	 * What is forced COW:
+	 *    when we create snapshot during commiting the transaction,
+	 *    after we've finished coping src root, we must COW the shared
+	 *    block to ensure the metadata consistency.
+	 */
 	if (btrfs_header_generation(buf) == trans->transid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
-	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+	    !root->force_cow)
 		return 0;
 	return 1;
 }
@@ -902,9 +917,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
 		pslot = path->slots[level + 1];
+	}
 
 	/*
 	 * deal with the case where there is only one pointer in the root
@@ -1107,9 +1123,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	mid = path->nodes[level];
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
 		pslot = path->slots[level + 1];
+	}
 
 	if (!parent)
 		return 1;
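
The smp_rmb() added to should_cow_block() only makes sense paired with a write barrier on the side that sets root->force_cow during snapshot creation; that setter is outside this section. A hedged sketch of the assumed pairing (not code shown in this diff; the condition name is a stand-in):

	/* writer (snapshot creation), assumed counterpart: */
	root->force_cow = 1;
	smp_wmb();

	/* reader, as in should_cow_block(): */
	smp_rmb();
	if (block_unchanged_in_this_transaction && !root->force_cow)
		return 0;	/* COW can be skipped */
	return 1;
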
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f49..04a5dfcee5a1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
30#include <linux/kobject.h> 30#include <linux/kobject.h>
31#include <trace/events/btrfs.h> 31#include <trace/events/btrfs.h>
32#include <asm/kmap_types.h> 32#include <asm/kmap_types.h>
33#include <linux/pagemap.h>
33#include "extent_io.h" 34#include "extent_io.h"
34#include "extent_map.h" 35#include "extent_map.h"
35#include "async-thread.h" 36#include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
360#define BTRFS_LABEL_SIZE 256 361#define BTRFS_LABEL_SIZE 256
361 362
362/* 363/*
364 * just in case we somehow lose the roots and are not able to mount,
365 * we store an array of the roots from previous transactions
366 * in the super.
367 */
368#define BTRFS_NUM_BACKUP_ROOTS 4
369struct btrfs_root_backup {
370 __le64 tree_root;
371 __le64 tree_root_gen;
372
373 __le64 chunk_root;
374 __le64 chunk_root_gen;
375
376 __le64 extent_root;
377 __le64 extent_root_gen;
378
379 __le64 fs_root;
380 __le64 fs_root_gen;
381
382 __le64 dev_root;
383 __le64 dev_root_gen;
384
385 __le64 csum_root;
386 __le64 csum_root_gen;
387
388 __le64 total_bytes;
389 __le64 bytes_used;
390 __le64 num_devices;
391 /* future */
392 __le64 unsed_64[4];
393
394 u8 tree_root_level;
395 u8 chunk_root_level;
396 u8 extent_root_level;
397 u8 fs_root_level;
398 u8 dev_root_level;
399 u8 csum_root_level;
400 /* future and to align */
401 u8 unused_8[10];
402} __attribute__ ((__packed__));
403
404/*
363 * the super block basically lists the main trees of the FS 405 * the super block basically lists the main trees of the FS
364 * it currently lacks any block count etc etc 406 * it currently lacks any block count etc etc
365 */ 407 */
@@ -405,6 +447,7 @@ struct btrfs_super_block {
405 /* future expansion */ 447 /* future expansion */
406 __le64 reserved[31]; 448 __le64 reserved[31];
407 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; 449 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
450 struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
408} __attribute__ ((__packed__)); 451} __attribute__ ((__packed__));
409 452
410/* 453/*
@@ -772,14 +815,8 @@ struct btrfs_space_info {
772struct btrfs_block_rsv { 815struct btrfs_block_rsv {
773 u64 size; 816 u64 size;
774 u64 reserved; 817 u64 reserved;
775 u64 freed[2];
776 struct btrfs_space_info *space_info; 818 struct btrfs_space_info *space_info;
777 struct list_head list;
778 spinlock_t lock; 819 spinlock_t lock;
779 atomic_t usage;
780 unsigned int priority:8;
781 unsigned int durable:1;
782 unsigned int refill_used:1;
783 unsigned int full:1; 820 unsigned int full:1;
784}; 821};
785 822
@@ -811,7 +848,8 @@ struct btrfs_free_cluster {
811enum btrfs_caching_type { 848enum btrfs_caching_type {
812 BTRFS_CACHE_NO = 0, 849 BTRFS_CACHE_NO = 0,
813 BTRFS_CACHE_STARTED = 1, 850 BTRFS_CACHE_STARTED = 1,
814 BTRFS_CACHE_FINISHED = 2, 851 BTRFS_CACHE_FAST = 2,
852 BTRFS_CACHE_FINISHED = 3,
815}; 853};
816 854
817enum btrfs_disk_cache_state { 855enum btrfs_disk_cache_state {
@@ -840,10 +878,10 @@ struct btrfs_block_group_cache {
840 spinlock_t lock; 878 spinlock_t lock;
841 u64 pinned; 879 u64 pinned;
842 u64 reserved; 880 u64 reserved;
843 u64 reserved_pinned;
844 u64 bytes_super; 881 u64 bytes_super;
845 u64 flags; 882 u64 flags;
846 u64 sectorsize; 883 u64 sectorsize;
884 u64 cache_generation;
847 unsigned int ro:1; 885 unsigned int ro:1;
848 unsigned int dirty:1; 886 unsigned int dirty:1;
849 unsigned int iref:1; 887 unsigned int iref:1;
@@ -899,6 +937,10 @@ struct btrfs_fs_info {
899 spinlock_t block_group_cache_lock; 937 spinlock_t block_group_cache_lock;
900 struct rb_root block_group_cache_tree; 938 struct rb_root block_group_cache_tree;
901 939
940 /* keep track of unallocated space */
941 spinlock_t free_chunk_lock;
942 u64 free_chunk_space;
943
902 struct extent_io_tree freed_extents[2]; 944 struct extent_io_tree freed_extents[2];
903 struct extent_io_tree *pinned_extents; 945 struct extent_io_tree *pinned_extents;
904 946
@@ -916,14 +958,11 @@ struct btrfs_fs_info {
916 struct btrfs_block_rsv trans_block_rsv; 958 struct btrfs_block_rsv trans_block_rsv;
917 /* block reservation for chunk tree */ 959 /* block reservation for chunk tree */
918 struct btrfs_block_rsv chunk_block_rsv; 960 struct btrfs_block_rsv chunk_block_rsv;
961 /* block reservation for delayed operations */
962 struct btrfs_block_rsv delayed_block_rsv;
919 963
920 struct btrfs_block_rsv empty_block_rsv; 964 struct btrfs_block_rsv empty_block_rsv;
921 965
922 /* list of block reservations that cross multiple transactions */
923 struct list_head durable_block_rsv_list;
924
925 struct mutex durable_block_rsv_mutex;
926
927 u64 generation; 966 u64 generation;
928 u64 last_trans_committed; 967 u64 last_trans_committed;
929 968
@@ -942,8 +981,8 @@ struct btrfs_fs_info {
942 wait_queue_head_t transaction_blocked_wait; 981 wait_queue_head_t transaction_blocked_wait;
943 wait_queue_head_t async_submit_wait; 982 wait_queue_head_t async_submit_wait;
944 983
945 struct btrfs_super_block super_copy; 984 struct btrfs_super_block *super_copy;
946 struct btrfs_super_block super_for_commit; 985 struct btrfs_super_block *super_for_commit;
947 struct block_device *__bdev; 986 struct block_device *__bdev;
948 struct super_block *sb; 987 struct super_block *sb;
949 struct inode *btree_inode; 988 struct inode *btree_inode;
@@ -1036,6 +1075,7 @@ struct btrfs_fs_info {
1036 struct btrfs_workers endio_freespace_worker; 1075 struct btrfs_workers endio_freespace_worker;
1037 struct btrfs_workers submit_workers; 1076 struct btrfs_workers submit_workers;
1038 struct btrfs_workers caching_workers; 1077 struct btrfs_workers caching_workers;
1078 struct btrfs_workers readahead_workers;
1039 1079
1040 /* 1080 /*
1041 * fixup workers take dirty pages that didn't properly go through 1081 * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1159,13 @@ struct btrfs_fs_info {
1119 u64 fs_state; 1159 u64 fs_state;
1120 1160
1121 struct btrfs_delayed_root *delayed_root; 1161 struct btrfs_delayed_root *delayed_root;
1162
1163 /* readahead tree */
1164 spinlock_t reada_lock;
1165 struct radix_tree_root reada_tree;
1166
1167 /* next backup root to be overwritten */
1168 int backup_root_index;
1122}; 1169};
1123 1170
1124/* 1171/*
@@ -1225,6 +1272,8 @@ struct btrfs_root {
1225 * for stat. It may be used for more later 1272 * for stat. It may be used for more later
1226 */ 1273 */
1227 dev_t anon_dev; 1274 dev_t anon_dev;
1275
1276 int force_cow;
1228}; 1277};
1229 1278
1230struct btrfs_ioctl_defrag_range_args { 1279struct btrfs_ioctl_defrag_range_args {
@@ -1363,6 +1412,7 @@ struct btrfs_ioctl_defrag_range_args {
1363#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) 1412#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1364#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1413#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1365#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 1414#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1415#define BTRFS_MOUNT_RECOVERY (1 << 18)
1366 1416
1367#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1417#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1368#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1418#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1978,6 +2028,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
1978 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; 2028 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
1979} 2029}
1980 2030
2031/* struct btrfs_root_backup */
2032BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
2033 tree_root, 64);
2034BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
2035 tree_root_gen, 64);
2036BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
2037 tree_root_level, 8);
2038
2039BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
2040 chunk_root, 64);
2041BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
2042 chunk_root_gen, 64);
2043BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
2044 chunk_root_level, 8);
2045
2046BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
2047 extent_root, 64);
2048BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
2049 extent_root_gen, 64);
2050BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
2051 extent_root_level, 8);
2052
2053BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
2054 fs_root, 64);
2055BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
2056 fs_root_gen, 64);
2057BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
2058 fs_root_level, 8);
2059
2060BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
2061 dev_root, 64);
2062BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
2063 dev_root_gen, 64);
2064BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
2065 dev_root_level, 8);
2066
2067BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
2068 csum_root, 64);
2069BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
2070 csum_root_gen, 64);
2071BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
2072 csum_root_level, 8);
2073BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
2074 total_bytes, 64);
2075BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
2076 bytes_used, 64);
2077BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
2078 num_devices, 64);
2079
1981/* struct btrfs_super_block */ 2080/* struct btrfs_super_block */
1982 2081
1983BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 2082BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2129,6 +2228,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2129 (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); 2228 (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
2130} 2229}
2131 2230
2231static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
2232{
2233 return mapping_gfp_mask(mapping) & ~__GFP_FS;
2234}
2235
2132/* extent-tree.c */ 2236/* extent-tree.c */
2133static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 2237static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2134 unsigned num_items) 2238 unsigned num_items)
@@ -2137,6 +2241,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2137 3 * num_items; 2241 3 * num_items;
2138} 2242}
2139 2243
2244/*
2245 * Doing a truncate won't result in new nodes or leaves, just what we need for
2246 * COW.
2247 */
2248static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
2249 unsigned num_items)
2250{
2251 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2252 num_items;
2253}
2254
2140void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 2255void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
2141int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2256int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2142 struct btrfs_root *root, unsigned long count); 2257 struct btrfs_root *root, unsigned long count);
@@ -2146,6 +2261,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
2146 u64 num_bytes, u64 *refs, u64 *flags); 2261 u64 num_bytes, u64 *refs, u64 *flags);
2147int btrfs_pin_extent(struct btrfs_root *root, 2262int btrfs_pin_extent(struct btrfs_root *root,
2148 u64 bytenr, u64 num, int reserved); 2263 u64 bytenr, u64 num, int reserved);
2264int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2265 struct btrfs_root *root,
2266 u64 bytenr, u64 num_bytes);
2149int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2267int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2150 struct btrfs_root *root, 2268 struct btrfs_root *root,
2151 u64 objectid, u64 offset, u64 bytenr); 2269 u64 objectid, u64 offset, u64 bytenr);
@@ -2196,8 +2314,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
2196 u64 root_objectid, u64 owner, u64 offset); 2314 u64 root_objectid, u64 owner, u64 offset);
2197 2315
2198int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2316int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2199int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 2317int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
2200 u64 num_bytes, int reserve, int sinfo); 2318 u64 start, u64 len);
2201int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 2319int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
2202 struct btrfs_root *root); 2320 struct btrfs_root *root);
2203int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2321int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2240,25 +2358,23 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2240struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 2358struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2241void btrfs_free_block_rsv(struct btrfs_root *root, 2359void btrfs_free_block_rsv(struct btrfs_root *root,
2242 struct btrfs_block_rsv *rsv); 2360 struct btrfs_block_rsv *rsv);
2243void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, 2361int btrfs_block_rsv_add(struct btrfs_root *root,
2244 struct btrfs_block_rsv *rsv);
2245int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2246 struct btrfs_root *root,
2247 struct btrfs_block_rsv *block_rsv, 2362 struct btrfs_block_rsv *block_rsv,
2248 u64 num_bytes); 2363 u64 num_bytes);
2249int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 2364int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2250 struct btrfs_root *root, 2365 struct btrfs_block_rsv *block_rsv,
2366 u64 num_bytes);
2367int btrfs_block_rsv_check(struct btrfs_root *root,
2368 struct btrfs_block_rsv *block_rsv, int min_factor);
2369int btrfs_block_rsv_refill(struct btrfs_root *root,
2251 struct btrfs_block_rsv *block_rsv, 2370 struct btrfs_block_rsv *block_rsv,
2252 u64 min_reserved, int min_factor); 2371 u64 min_reserved);
2253int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 2372int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2254 struct btrfs_block_rsv *dst_rsv, 2373 struct btrfs_block_rsv *dst_rsv,
2255 u64 num_bytes); 2374 u64 num_bytes);
2256void btrfs_block_rsv_release(struct btrfs_root *root, 2375void btrfs_block_rsv_release(struct btrfs_root *root,
2257 struct btrfs_block_rsv *block_rsv, 2376 struct btrfs_block_rsv *block_rsv,
2258 u64 num_bytes); 2377 u64 num_bytes);
2259int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
2260 struct btrfs_root *root,
2261 struct btrfs_block_rsv *rsv);
2262int btrfs_set_block_group_ro(struct btrfs_root *root, 2378int btrfs_set_block_group_ro(struct btrfs_root *root,
2263 struct btrfs_block_group_cache *cache); 2379 struct btrfs_block_group_cache *cache);
2264int btrfs_set_block_group_rw(struct btrfs_root *root, 2380int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2379,6 +2495,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2379 smp_mb(); 2495 smp_mb();
2380 return fs_info->closing; 2496 return fs_info->closing;
2381} 2497}
2498static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2499{
2500 kfree(fs_info->delayed_root);
2501 kfree(fs_info->extent_root);
2502 kfree(fs_info->tree_root);
2503 kfree(fs_info->chunk_root);
2504 kfree(fs_info->dev_root);
2505 kfree(fs_info->csum_root);
2506 kfree(fs_info->super_copy);
2507 kfree(fs_info->super_for_commit);
2508 kfree(fs_info);
2509}
2382 2510
2383/* root-item.c */ 2511/* root-item.c */
2384int btrfs_find_root_ref(struct btrfs_root *tree_root, 2512int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2579,11 +2707,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2579int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2707int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2580int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2708int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2581int btrfs_orphan_cleanup(struct btrfs_root *root); 2709int btrfs_orphan_cleanup(struct btrfs_root *root);
2582void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2583 struct btrfs_pending_snapshot *pending,
2584 u64 *bytes_to_reserve);
2585void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2586 struct btrfs_pending_snapshot *pending);
2587void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2710void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2588 struct btrfs_root *root); 2711 struct btrfs_root *root);
2589int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); 2712int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@ -2697,4 +2820,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
2697int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 2820int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2698 struct btrfs_scrub_progress *progress); 2821 struct btrfs_scrub_progress *progress);
2699 2822
2823/* reada.c */
2824struct reada_control {
2825 struct btrfs_root *root; /* tree to prefetch */
2826 struct btrfs_key key_start;
2827 struct btrfs_key key_end; /* exclusive */
2828 atomic_t elems;
2829 struct kref refcnt;
2830 wait_queue_head_t wait;
2831};
2832struct reada_control *btrfs_reada_add(struct btrfs_root *root,
2833 struct btrfs_key *start, struct btrfs_key *end);
2834int btrfs_reada_wait(void *handle);
2835void btrfs_reada_detach(void *handle);
2836int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
2837 u64 start, int err);
2838
2700#endif 2839#endif
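The readahead control declared above is handed back to callers as an opaque handle (btrfs_reada_wait/btrfs_reada_detach take void pointers). A minimal usage sketch, assuming a caller that wants to prefetch an entire tree; the key bounds, the error handling, and the helper name are illustrative only, not part of the patch:

/* hypothetical helper: kick off readahead over a whole tree and wait for it */
static int prefetch_whole_tree(struct btrfs_root *root)
{
	struct reada_control *rc;
	struct btrfs_key start = { .objectid = 0, .type = 0, .offset = 0 };
	struct btrfs_key end = { .objectid = (u64)-1, .type = (u8)-1,
				 .offset = (u64)-1 };

	rc = btrfs_reada_add(root, &start, &end);  /* starts background readahead */
	if (IS_ERR(rc))
		return PTR_ERR(rc);

	/* either wait for the prefetch to finish ... */
	return btrfs_reada_wait(rc);
	/* ... or call btrfs_reada_detach(rc) to let it complete on its own */
}

A caller is expected to drop its reference through exactly one of btrfs_reada_wait() or btrfs_reada_detach().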
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index ae4d9cd10961..5b163572e0ca 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
591 return 0; 591 return 0;
592 592
593 src_rsv = trans->block_rsv; 593 src_rsv = trans->block_rsv;
594 dst_rsv = &root->fs_info->global_block_rsv; 594 dst_rsv = &root->fs_info->delayed_block_rsv;
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
609 if (!item->bytes_reserved) 609 if (!item->bytes_reserved)
610 return; 610 return;
611 611
612 rsv = &root->fs_info->global_block_rsv; 612 rsv = &root->fs_info->delayed_block_rsv;
613 btrfs_block_rsv_release(root, rsv, 613 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 614 item->bytes_reserved);
615} 615}
@@ -617,24 +617,102 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
617static int btrfs_delayed_inode_reserve_metadata( 617static int btrfs_delayed_inode_reserve_metadata(
618 struct btrfs_trans_handle *trans, 618 struct btrfs_trans_handle *trans,
619 struct btrfs_root *root, 619 struct btrfs_root *root,
620 struct inode *inode,
620 struct btrfs_delayed_node *node) 621 struct btrfs_delayed_node *node)
621{ 622{
622 struct btrfs_block_rsv *src_rsv; 623 struct btrfs_block_rsv *src_rsv;
623 struct btrfs_block_rsv *dst_rsv; 624 struct btrfs_block_rsv *dst_rsv;
624 u64 num_bytes; 625 u64 num_bytes;
625 int ret; 626 int ret;
626 627 int release = false;
627 if (!trans->bytes_reserved)
628 return 0;
629 628
630 src_rsv = trans->block_rsv; 629 src_rsv = trans->block_rsv;
631 dst_rsv = &root->fs_info->global_block_rsv; 630 dst_rsv = &root->fs_info->delayed_block_rsv;
632 631
633 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 632 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
633
634 /*
635 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
636 * which doesn't reserve space for speed. This is a problem since we
637 * still need to reserve space for this update, so try to reserve the
638 * space.
639 *
640 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
641 * we're accounted for.
642 */
643 if (!trans->bytes_reserved &&
644 src_rsv != &root->fs_info->delalloc_block_rsv) {
645 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
646 /*
 647 * Since we're under a transaction, reserve_metadata_bytes could
 648 * try to commit the transaction, which will make it return
 649 * EAGAIN to make us stop the transaction we have, so return
650 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
651 */
652 if (ret == -EAGAIN)
653 ret = -ENOSPC;
654 if (!ret)
655 node->bytes_reserved = num_bytes;
656 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock);
659 if (BTRFS_I(inode)->delalloc_meta_reserved) {
660 BTRFS_I(inode)->delalloc_meta_reserved = 0;
661 spin_unlock(&BTRFS_I(inode)->lock);
662 release = true;
663 goto migrate;
664 }
665 spin_unlock(&BTRFS_I(inode)->lock);
666
667 /* Ok we didn't have space pre-reserved. This shouldn't happen
668 * too often but it can happen if we do delalloc to an existing
669 * inode which gets dirtied because of the time update, and then
670 * isn't touched again until after the transaction commits and
671 * then we try to write out the data. First try to be nice and
 672 * reserve something strictly for us. If not, be a pain and try
673 * to steal from the delalloc block rsv.
674 */
675 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
676 if (!ret)
677 goto out;
678
679 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
680 if (!ret)
681 goto out;
682
683 /*
 684 * Ok, this is a problem; let's just steal from the global rsv
685 * since this really shouldn't happen that often.
686 */
687 WARN_ON(1);
688 ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
689 dst_rsv, num_bytes);
690 goto out;
691 }
692
693migrate:
634 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 694 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
695
696out:
697 /*
698 * Migrate only takes a reservation, it doesn't touch the size of the
 699 * block_rsv. This is to simplify things for people who don't normally
 700 * have things migrated from their block rsv. If they go to release their
701 * reservation, that will decrease the size as well, so if migrate
702 * reduced size we'd end up with a negative size. But for the
703 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
704 * but we could in fact do this reserve/migrate dance several times
 705 * between the time we did the original reservation and the time we'd
 706 * clean it up. So to take care of this, release the space for the meta
707 * reservation here. I think it may be time for a documentation page on
 708 * how block rsvs work.
709 */
635 if (!ret) 710 if (!ret)
636 node->bytes_reserved = num_bytes; 711 node->bytes_reserved = num_bytes;
637 712
713 if (release)
714 btrfs_block_rsv_release(root, src_rsv, num_bytes);
715
638 return ret; 716 return ret;
639} 717}
640 718
@@ -646,7 +724,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
646 if (!node->bytes_reserved) 724 if (!node->bytes_reserved)
647 return; 725 return;
648 726
649 rsv = &root->fs_info->global_block_rsv; 727 rsv = &root->fs_info->delayed_block_rsv;
650 btrfs_block_rsv_release(root, rsv, 728 btrfs_block_rsv_release(root, rsv,
651 node->bytes_reserved); 729 node->bytes_reserved);
652 node->bytes_reserved = 0; 730 node->bytes_reserved = 0;
@@ -1026,7 +1104,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1026 path->leave_spinning = 1; 1104 path->leave_spinning = 1;
1027 1105
1028 block_rsv = trans->block_rsv; 1106 block_rsv = trans->block_rsv;
1029 trans->block_rsv = &root->fs_info->global_block_rsv; 1107 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1030 1108
1031 delayed_root = btrfs_get_delayed_root(root); 1109 delayed_root = btrfs_get_delayed_root(root);
1032 1110
@@ -1069,7 +1147,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1069 path->leave_spinning = 1; 1147 path->leave_spinning = 1;
1070 1148
1071 block_rsv = trans->block_rsv; 1149 block_rsv = trans->block_rsv;
1072 trans->block_rsv = &node->root->fs_info->global_block_rsv; 1150 trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
1073 1151
1074 ret = btrfs_insert_delayed_items(trans, path, node->root, node); 1152 ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1075 if (!ret) 1153 if (!ret)
@@ -1149,7 +1227,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1149 goto free_path; 1227 goto free_path;
1150 1228
1151 block_rsv = trans->block_rsv; 1229 block_rsv = trans->block_rsv;
1152 trans->block_rsv = &root->fs_info->global_block_rsv; 1230 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1153 1231
1154 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); 1232 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
1155 if (!ret) 1233 if (!ret)
@@ -1685,12 +1763,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1685 goto release_node; 1763 goto release_node;
1686 } 1764 }
1687 1765
1688 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); 1766 ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
1689 /* 1767 delayed_node);
1690 * we must reserve enough space when we start a new transaction, 1768 if (ret)
1691 * so reserving metadata failure is impossible 1769 goto release_node;
1692 */
1693 BUG_ON(ret);
1694 1770
1695 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1771 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1696 delayed_node->inode_dirty = 1; 1772 delayed_node->inode_dirty = 1;
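The reservation path above tries several sources in order before giving up. A condensed sketch of just that ordering, using the same rsv pointers as the function (this is not the verbatim kernel code; the delalloc_meta_reserved fast path and the size-accounting release are handled above):

/* illustrative only: the fallback order for funding a delayed inode update */
static int fund_delayed_inode_update(struct btrfs_root *root,
				     struct btrfs_block_rsv *src_rsv,
				     u64 num_bytes)
{
	struct btrfs_block_rsv *dst_rsv = &root->fs_info->delayed_block_rsv;
	int ret;

	/* 1) a reservation that won't flush or block waiting for space */
	ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
	if (!ret)
		return 0;

	/* 2) migrate bytes the source rsv already holds */
	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
	if (!ret)
		return 0;

	/* 3) last resort: steal from the global reserve (should be rare) */
	return btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
				       dst_rsv, num_bytes);
}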
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07ea91879a91..632f8f3cc9db 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, 256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
257 int verify) 257 int verify)
258{ 258{
259 u16 csum_size = 259 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
260 btrfs_super_csum_size(&root->fs_info->super_copy);
261 char *result = NULL; 260 char *result = NULL;
262 unsigned long len; 261 unsigned long len;
263 unsigned long cur_len; 262 unsigned long cur_len;
@@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
367 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); 366 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
368 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 367 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
369 while (1) { 368 while (1) {
370 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 369 ret = read_extent_buffer_pages(io_tree, eb, start,
370 WAIT_COMPLETE,
371 btree_get_extent, mirror_num); 371 btree_get_extent, mirror_num);
372 if (!ret && 372 if (!ret &&
373 !verify_parent_transid(io_tree, eb, parent_transid)) 373 !verify_parent_transid(io_tree, eb, parent_transid))
@@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
609 end = eb->start + end - 1; 609 end = eb->start + end - 1;
610err: 610err:
611 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
612 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
613 btree_readahead_hook(root, eb, eb->start, ret);
614 }
615
611 free_extent_buffer(eb); 616 free_extent_buffer(eb);
612out: 617out:
613 return ret; 618 return ret;
614} 619}
615 620
621static int btree_io_failed_hook(struct bio *failed_bio,
622 struct page *page, u64 start, u64 end,
623 int mirror_num, struct extent_state *state)
624{
625 struct extent_io_tree *tree;
626 unsigned long len;
627 struct extent_buffer *eb;
628 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
629
630 tree = &BTRFS_I(page->mapping->host)->io_tree;
631 if (page->private == EXTENT_PAGE_PRIVATE)
632 goto out;
633 if (!page->private)
634 goto out;
635
636 len = page->private >> 2;
637 WARN_ON(len == 0);
638
639 eb = alloc_extent_buffer(tree, start, len, page);
640 if (eb == NULL)
641 goto out;
642
643 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
644 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
645 btree_readahead_hook(root, eb, eb->start, -EIO);
646 }
647 free_extent_buffer(eb);
648
649out:
650 return -EIO; /* we fixed nothing */
651}
652
616static void end_workqueue_bio(struct bio *bio, int err) 653static void end_workqueue_bio(struct bio *bio, int err)
617{ 654{
618 struct end_io_wq *end_io_wq = bio->bi_private; 655 struct end_io_wq *end_io_wq = bio->bi_private;
@@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page)
908{ 945{
909 struct extent_io_tree *tree; 946 struct extent_io_tree *tree;
910 tree = &BTRFS_I(page->mapping->host)->io_tree; 947 tree = &BTRFS_I(page->mapping->host)->io_tree;
911 return extent_read_full_page(tree, page, btree_get_extent); 948 return extent_read_full_page(tree, page, btree_get_extent, 0);
912} 949}
913 950
914static int btree_releasepage(struct page *page, gfp_t gfp_flags) 951static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
974 if (!buf) 1011 if (!buf)
975 return 0; 1012 return 0;
976 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1013 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
977 buf, 0, 0, btree_get_extent, 0); 1014 buf, 0, WAIT_NONE, btree_get_extent, 0);
978 free_extent_buffer(buf); 1015 free_extent_buffer(buf);
979 return ret; 1016 return ret;
980} 1017}
981 1018
1019int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1020 int mirror_num, struct extent_buffer **eb)
1021{
1022 struct extent_buffer *buf = NULL;
1023 struct inode *btree_inode = root->fs_info->btree_inode;
1024 struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1025 int ret;
1026
1027 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1028 if (!buf)
1029 return 0;
1030
1031 set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
1032
1033 ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
1034 btree_get_extent, mirror_num);
1035 if (ret) {
1036 free_extent_buffer(buf);
1037 return ret;
1038 }
1039
1040 if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
1041 free_extent_buffer(buf);
1042 return -EIO;
1043 } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
1044 *eb = buf;
1045 } else {
1046 free_extent_buffer(buf);
1047 }
1048 return 0;
1049}
1050
982struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1051struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
983 u64 bytenr, u32 blocksize) 1052 u64 bytenr, u32 blocksize)
984{ 1053{
@@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1135 1204
1136 generation = btrfs_root_generation(&root->root_item); 1205 generation = btrfs_root_generation(&root->root_item);
1137 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1206 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1207 root->commit_root = NULL;
1138 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1208 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1139 blocksize, generation); 1209 blocksize, generation);
1140 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { 1210 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1141 free_extent_buffer(root->node); 1211 free_extent_buffer(root->node);
1212 root->node = NULL;
1142 return -EIO; 1213 return -EIO;
1143 } 1214 }
1144 root->commit_root = btrfs_root_node(root); 1215 root->commit_root = btrfs_root_node(root);
@@ -1577,6 +1648,235 @@ sleep:
1577 return 0; 1648 return 0;
1578} 1649}
1579 1650
1651/*
1652 * this will find the highest generation in the array of
1653 * root backups. The index of the newest entry is returned,
1654 * or -1 if we can't find anything.
1655 *
1656 * We check to make sure the array is valid by comparing the
1657 * generation of the latest root in the array with the generation
1658 * in the super block. If they don't match we pitch it.
1659 */
1660static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
1661{
1662 u64 cur;
1663 int newest_index = -1;
1664 struct btrfs_root_backup *root_backup;
1665 int i;
1666
1667 for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1668 root_backup = info->super_copy->super_roots + i;
1669 cur = btrfs_backup_tree_root_gen(root_backup);
1670 if (cur == newest_gen)
1671 newest_index = i;
1672 }
1673
1674 /* check to see if we actually wrapped around */
1675 if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
1676 root_backup = info->super_copy->super_roots;
1677 cur = btrfs_backup_tree_root_gen(root_backup);
1678 if (cur == newest_gen)
1679 newest_index = 0;
1680 }
1681 return newest_index;
1682}
1683
1684
1685/*
1686 * find the oldest backup so we know where to store new entries
1687 * in the backup array. This will set the backup_root_index
1688 * field in the fs_info struct
1689 */
1690static void find_oldest_super_backup(struct btrfs_fs_info *info,
1691 u64 newest_gen)
1692{
1693 int newest_index = -1;
1694
1695 newest_index = find_newest_super_backup(info, newest_gen);
1696 /* if there was garbage in there, just move along */
1697 if (newest_index == -1) {
1698 info->backup_root_index = 0;
1699 } else {
1700 info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
1701 }
1702}
1703
1704/*
1705 * copy all the root pointers into the super backup array.
1706 * this will bump the backup pointer by one when it is
1707 * done
1708 */
1709static void backup_super_roots(struct btrfs_fs_info *info)
1710{
1711 int next_backup;
1712 struct btrfs_root_backup *root_backup;
1713 int last_backup;
1714
1715 next_backup = info->backup_root_index;
1716 last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
1717 BTRFS_NUM_BACKUP_ROOTS;
1718
1719 /*
1720 * just overwrite the last backup if we're at the same generation
1721 * this happens only at umount
1722 */
1723 root_backup = info->super_for_commit->super_roots + last_backup;
1724 if (btrfs_backup_tree_root_gen(root_backup) ==
1725 btrfs_header_generation(info->tree_root->node))
1726 next_backup = last_backup;
1727
1728 root_backup = info->super_for_commit->super_roots + next_backup;
1729
1730 /*
1731 * make sure all of our padding and empty slots get zero filled
1732 * regardless of which ones we use today
1733 */
1734 memset(root_backup, 0, sizeof(*root_backup));
1735
1736 info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1737
1738 btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1739 btrfs_set_backup_tree_root_gen(root_backup,
1740 btrfs_header_generation(info->tree_root->node));
1741
1742 btrfs_set_backup_tree_root_level(root_backup,
1743 btrfs_header_level(info->tree_root->node));
1744
1745 btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1746 btrfs_set_backup_chunk_root_gen(root_backup,
1747 btrfs_header_generation(info->chunk_root->node));
1748 btrfs_set_backup_chunk_root_level(root_backup,
1749 btrfs_header_level(info->chunk_root->node));
1750
1751 btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
1752 btrfs_set_backup_extent_root_gen(root_backup,
1753 btrfs_header_generation(info->extent_root->node));
1754 btrfs_set_backup_extent_root_level(root_backup,
1755 btrfs_header_level(info->extent_root->node));
1756
1757 /*
1758 * we might commit during log recovery, which happens before we set
1759 * the fs_root. Make sure it is valid before we fill it in.
1760 */
1761 if (info->fs_root && info->fs_root->node) {
1762 btrfs_set_backup_fs_root(root_backup,
1763 info->fs_root->node->start);
1764 btrfs_set_backup_fs_root_gen(root_backup,
1765 btrfs_header_generation(info->fs_root->node));
1766 btrfs_set_backup_fs_root_level(root_backup,
1767 btrfs_header_level(info->fs_root->node));
1768 }
1769
1770 btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1771 btrfs_set_backup_dev_root_gen(root_backup,
1772 btrfs_header_generation(info->dev_root->node));
1773 btrfs_set_backup_dev_root_level(root_backup,
1774 btrfs_header_level(info->dev_root->node));
1775
1776 btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
1777 btrfs_set_backup_csum_root_gen(root_backup,
1778 btrfs_header_generation(info->csum_root->node));
1779 btrfs_set_backup_csum_root_level(root_backup,
1780 btrfs_header_level(info->csum_root->node));
1781
1782 btrfs_set_backup_total_bytes(root_backup,
1783 btrfs_super_total_bytes(info->super_copy));
1784 btrfs_set_backup_bytes_used(root_backup,
1785 btrfs_super_bytes_used(info->super_copy));
1786 btrfs_set_backup_num_devices(root_backup,
1787 btrfs_super_num_devices(info->super_copy));
1788
1789 /*
1790 * if we don't copy this out to the super_copy, it won't get remembered
1791 * for the next commit
1792 */
1793 memcpy(&info->super_copy->super_roots,
1794 &info->super_for_commit->super_roots,
1795 sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1796}
1797
1798/*
1799 * this copies info out of the root backup array and back into
1800 * the in-memory super block. It is meant to help iterate through
1801 * the array, so you send it the number of backups you've already
1802 * tried and the last backup index you used.
1803 *
1804 * this returns -1 when it has tried all the backups
1805 */
1806static noinline int next_root_backup(struct btrfs_fs_info *info,
1807 struct btrfs_super_block *super,
1808 int *num_backups_tried, int *backup_index)
1809{
1810 struct btrfs_root_backup *root_backup;
1811 int newest = *backup_index;
1812
1813 if (*num_backups_tried == 0) {
1814 u64 gen = btrfs_super_generation(super);
1815
1816 newest = find_newest_super_backup(info, gen);
1817 if (newest == -1)
1818 return -1;
1819
1820 *backup_index = newest;
1821 *num_backups_tried = 1;
1822 } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
1823 /* we've tried all the backups, all done */
1824 return -1;
1825 } else {
1826 /* jump to the next oldest backup */
1827 newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
1828 BTRFS_NUM_BACKUP_ROOTS;
1829 *backup_index = newest;
1830 *num_backups_tried += 1;
1831 }
1832 root_backup = super->super_roots + newest;
1833
1834 btrfs_set_super_generation(super,
1835 btrfs_backup_tree_root_gen(root_backup));
1836 btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1837 btrfs_set_super_root_level(super,
1838 btrfs_backup_tree_root_level(root_backup));
1839 btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1840
1841 /*
1842 * fixme: the total bytes and num_devices need to match or we should
1843 * need a fsck
1844 */
1845 btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
1846 btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
1847 return 0;
1848}
1849
1850/* helper to clean up tree roots */
1851static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1852{
1853 free_extent_buffer(info->tree_root->node);
1854 free_extent_buffer(info->tree_root->commit_root);
1855 free_extent_buffer(info->dev_root->node);
1856 free_extent_buffer(info->dev_root->commit_root);
1857 free_extent_buffer(info->extent_root->node);
1858 free_extent_buffer(info->extent_root->commit_root);
1859 free_extent_buffer(info->csum_root->node);
1860 free_extent_buffer(info->csum_root->commit_root);
1861
1862 info->tree_root->node = NULL;
1863 info->tree_root->commit_root = NULL;
1864 info->dev_root->node = NULL;
1865 info->dev_root->commit_root = NULL;
1866 info->extent_root->node = NULL;
1867 info->extent_root->commit_root = NULL;
1868 info->csum_root->node = NULL;
1869 info->csum_root->commit_root = NULL;
1870
1871 if (chunk_root) {
1872 free_extent_buffer(info->chunk_root->node);
1873 free_extent_buffer(info->chunk_root->commit_root);
1874 info->chunk_root->node = NULL;
1875 info->chunk_root->commit_root = NULL;
1876 }
1877}
1878
1879
1580struct btrfs_root *open_ctree(struct super_block *sb, 1880struct btrfs_root *open_ctree(struct super_block *sb,
1581 struct btrfs_fs_devices *fs_devices, 1881 struct btrfs_fs_devices *fs_devices,
1582 char *options) 1882 char *options)
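The backup-root helpers above treat the super_roots array as a small ring: the slot to write next and the slot written by the previous commit are both computed modulo BTRFS_NUM_BACKUP_ROOTS. A standalone sketch of just that index arithmetic (the constant's value of 4 is assumed here from the btrfs headers):

#include <stdio.h>

#define BTRFS_NUM_BACKUP_ROOTS 4	/* assumed value, per btrfs ctree.h */

/* slot backup_super_roots() will fill on the next commit */
static int next_slot(int backup_root_index)
{
	return (backup_root_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
}

/* slot written on the previous commit, the same backwards step
 * next_root_backup() takes when walking to older backups */
static int prev_slot(int backup_root_index)
{
	return (backup_root_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
	       BTRFS_NUM_BACKUP_ROOTS;
}

int main(void)
{
	int i;

	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++)
		printf("index %d: prev %d, next %d\n", i, prev_slot(i), next_slot(i));
	return 0;
}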
@@ -1590,29 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1590 u64 features; 1890 u64 features;
1591 struct btrfs_key location; 1891 struct btrfs_key location;
1592 struct buffer_head *bh; 1892 struct buffer_head *bh;
1593 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), 1893 struct btrfs_super_block *disk_super;
1594 GFP_NOFS);
1595 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1596 GFP_NOFS);
1597 struct btrfs_root *tree_root = btrfs_sb(sb); 1894 struct btrfs_root *tree_root = btrfs_sb(sb);
1598 struct btrfs_fs_info *fs_info = NULL; 1895 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1599 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1896 struct btrfs_root *extent_root;
1600 GFP_NOFS); 1897 struct btrfs_root *csum_root;
1601 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1898 struct btrfs_root *chunk_root;
1602 GFP_NOFS); 1899 struct btrfs_root *dev_root;
1603 struct btrfs_root *log_tree_root; 1900 struct btrfs_root *log_tree_root;
1604
1605 int ret; 1901 int ret;
1606 int err = -EINVAL; 1902 int err = -EINVAL;
1607 1903 int num_backups_tried = 0;
1608 struct btrfs_super_block *disk_super; 1904 int backup_index = 0;
1609 1905
1610 if (!extent_root || !tree_root || !tree_root->fs_info || 1906 extent_root = fs_info->extent_root =
1611 !chunk_root || !dev_root || !csum_root) { 1907 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1908 csum_root = fs_info->csum_root =
1909 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1910 chunk_root = fs_info->chunk_root =
1911 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1912 dev_root = fs_info->dev_root =
1913 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1914
1915 if (!extent_root || !csum_root || !chunk_root || !dev_root) {
1612 err = -ENOMEM; 1916 err = -ENOMEM;
1613 goto fail; 1917 goto fail;
1614 } 1918 }
1615 fs_info = tree_root->fs_info;
1616 1919
1617 ret = init_srcu_struct(&fs_info->subvol_srcu); 1920 ret = init_srcu_struct(&fs_info->subvol_srcu);
1618 if (ret) { 1921 if (ret) {
@@ -1648,15 +1951,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1648 spin_lock_init(&fs_info->fs_roots_radix_lock); 1951 spin_lock_init(&fs_info->fs_roots_radix_lock);
1649 spin_lock_init(&fs_info->delayed_iput_lock); 1952 spin_lock_init(&fs_info->delayed_iput_lock);
1650 spin_lock_init(&fs_info->defrag_inodes_lock); 1953 spin_lock_init(&fs_info->defrag_inodes_lock);
1954 spin_lock_init(&fs_info->free_chunk_lock);
1651 mutex_init(&fs_info->reloc_mutex); 1955 mutex_init(&fs_info->reloc_mutex);
1652 1956
1653 init_completion(&fs_info->kobj_unregister); 1957 init_completion(&fs_info->kobj_unregister);
1654 fs_info->tree_root = tree_root;
1655 fs_info->extent_root = extent_root;
1656 fs_info->csum_root = csum_root;
1657 fs_info->chunk_root = chunk_root;
1658 fs_info->dev_root = dev_root;
1659 fs_info->fs_devices = fs_devices;
1660 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1958 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1661 INIT_LIST_HEAD(&fs_info->space_info); 1959 INIT_LIST_HEAD(&fs_info->space_info);
1662 btrfs_mapping_init(&fs_info->mapping_tree); 1960 btrfs_mapping_init(&fs_info->mapping_tree);
@@ -1665,8 +1963,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1665 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 1963 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1666 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 1964 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1667 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 1965 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1668 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); 1966 btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
1669 mutex_init(&fs_info->durable_block_rsv_mutex);
1670 atomic_set(&fs_info->nr_async_submits, 0); 1967 atomic_set(&fs_info->nr_async_submits, 0);
1671 atomic_set(&fs_info->async_delalloc_pages, 0); 1968 atomic_set(&fs_info->async_delalloc_pages, 0);
1672 atomic_set(&fs_info->async_submit_draining, 0); 1969 atomic_set(&fs_info->async_submit_draining, 0);
@@ -1677,6 +1974,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1677 fs_info->metadata_ratio = 0; 1974 fs_info->metadata_ratio = 0;
1678 fs_info->defrag_inodes = RB_ROOT; 1975 fs_info->defrag_inodes = RB_ROOT;
1679 fs_info->trans_no_join = 0; 1976 fs_info->trans_no_join = 0;
1977 fs_info->free_chunk_space = 0;
1978
1979 /* readahead state */
1980 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
1981 spin_lock_init(&fs_info->reada_lock);
1680 1982
1681 fs_info->thread_pool_size = min_t(unsigned long, 1983 fs_info->thread_pool_size = min_t(unsigned long,
1682 num_online_cpus() + 2, 8); 1984 num_online_cpus() + 2, 8);
@@ -1766,14 +2068,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1766 goto fail_alloc; 2068 goto fail_alloc;
1767 } 2069 }
1768 2070
1769 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 2071 memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
1770 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 2072 memcpy(fs_info->super_for_commit, fs_info->super_copy,
1771 sizeof(fs_info->super_for_commit)); 2073 sizeof(*fs_info->super_for_commit));
1772 brelse(bh); 2074 brelse(bh);
1773 2075
1774 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); 2076 memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
1775 2077
1776 disk_super = &fs_info->super_copy; 2078 disk_super = fs_info->super_copy;
1777 if (!btrfs_super_root(disk_super)) 2079 if (!btrfs_super_root(disk_super))
1778 goto fail_alloc; 2080 goto fail_alloc;
1779 2081
@@ -1783,6 +2085,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1783 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 2085 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1784 2086
1785 /* 2087 /*
2088 * run through our array of backup supers and setup
2089 * our ring pointer to the oldest one
2090 */
2091 generation = btrfs_super_generation(disk_super);
2092 find_oldest_super_backup(fs_info, generation);
2093
2094 /*
1786 * In the long term, we'll store the compression type in the super 2095 * In the long term, we'll store the compression type in the super
1787 * block, and it'll be used for per file compression control. 2096 * block, and it'll be used for per file compression control.
1788 */ 2097 */
@@ -1870,6 +2179,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1870 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", 2179 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
1871 fs_info->thread_pool_size, 2180 fs_info->thread_pool_size,
1872 &fs_info->generic_worker); 2181 &fs_info->generic_worker);
2182 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2183 fs_info->thread_pool_size,
2184 &fs_info->generic_worker);
1873 2185
1874 /* 2186 /*
1875 * endios are largely parallel and should have a very 2187 * endios are largely parallel and should have a very
@@ -1880,6 +2192,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1880 2192
1881 fs_info->endio_write_workers.idle_thresh = 2; 2193 fs_info->endio_write_workers.idle_thresh = 2;
1882 fs_info->endio_meta_write_workers.idle_thresh = 2; 2194 fs_info->endio_meta_write_workers.idle_thresh = 2;
2195 fs_info->readahead_workers.idle_thresh = 2;
1883 2196
1884 btrfs_start_workers(&fs_info->workers, 1); 2197 btrfs_start_workers(&fs_info->workers, 1);
1885 btrfs_start_workers(&fs_info->generic_worker, 1); 2198 btrfs_start_workers(&fs_info->generic_worker, 1);
@@ -1893,6 +2206,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1893 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 2206 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1894 btrfs_start_workers(&fs_info->delayed_workers, 1); 2207 btrfs_start_workers(&fs_info->delayed_workers, 1);
1895 btrfs_start_workers(&fs_info->caching_workers, 1); 2208 btrfs_start_workers(&fs_info->caching_workers, 1);
2209 btrfs_start_workers(&fs_info->readahead_workers, 1);
1896 2210
1897 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 2211 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1898 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 2212 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1939,7 +2253,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1939 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2253 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
1940 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", 2254 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
1941 sb->s_id); 2255 sb->s_id);
1942 goto fail_chunk_root; 2256 goto fail_tree_roots;
1943 } 2257 }
1944 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); 2258 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1945 chunk_root->commit_root = btrfs_root_node(chunk_root); 2259 chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1954,11 +2268,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1954 if (ret) { 2268 if (ret) {
1955 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2269 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1956 sb->s_id); 2270 sb->s_id);
1957 goto fail_chunk_root; 2271 goto fail_tree_roots;
1958 } 2272 }
1959 2273
1960 btrfs_close_extra_devices(fs_devices); 2274 btrfs_close_extra_devices(fs_devices);
1961 2275
2276retry_root_backup:
1962 blocksize = btrfs_level_size(tree_root, 2277 blocksize = btrfs_level_size(tree_root,
1963 btrfs_super_root_level(disk_super)); 2278 btrfs_super_root_level(disk_super));
1964 generation = btrfs_super_generation(disk_super); 2279 generation = btrfs_super_generation(disk_super);
@@ -1966,32 +2281,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1966 tree_root->node = read_tree_block(tree_root, 2281 tree_root->node = read_tree_block(tree_root,
1967 btrfs_super_root(disk_super), 2282 btrfs_super_root(disk_super),
1968 blocksize, generation); 2283 blocksize, generation);
1969 if (!tree_root->node) 2284 if (!tree_root->node ||
1970 goto fail_chunk_root; 2285 !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1971 if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1972 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", 2286 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
1973 sb->s_id); 2287 sb->s_id);
1974 goto fail_tree_root; 2288
2289 goto recovery_tree_root;
1975 } 2290 }
2291
1976 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2292 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1977 tree_root->commit_root = btrfs_root_node(tree_root); 2293 tree_root->commit_root = btrfs_root_node(tree_root);
1978 2294
1979 ret = find_and_setup_root(tree_root, fs_info, 2295 ret = find_and_setup_root(tree_root, fs_info,
1980 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2296 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1981 if (ret) 2297 if (ret)
1982 goto fail_tree_root; 2298 goto recovery_tree_root;
1983 extent_root->track_dirty = 1; 2299 extent_root->track_dirty = 1;
1984 2300
1985 ret = find_and_setup_root(tree_root, fs_info, 2301 ret = find_and_setup_root(tree_root, fs_info,
1986 BTRFS_DEV_TREE_OBJECTID, dev_root); 2302 BTRFS_DEV_TREE_OBJECTID, dev_root);
1987 if (ret) 2303 if (ret)
1988 goto fail_extent_root; 2304 goto recovery_tree_root;
1989 dev_root->track_dirty = 1; 2305 dev_root->track_dirty = 1;
1990 2306
1991 ret = find_and_setup_root(tree_root, fs_info, 2307 ret = find_and_setup_root(tree_root, fs_info,
1992 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2308 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1993 if (ret) 2309 if (ret)
1994 goto fail_dev_root; 2310 goto recovery_tree_root;
1995 2311
1996 csum_root->track_dirty = 1; 2312 csum_root->track_dirty = 1;
1997 2313
@@ -2124,22 +2440,13 @@ fail_cleaner:
2124 2440
2125fail_block_groups: 2441fail_block_groups:
2126 btrfs_free_block_groups(fs_info); 2442 btrfs_free_block_groups(fs_info);
2127 free_extent_buffer(csum_root->node); 2443
2128 free_extent_buffer(csum_root->commit_root); 2444fail_tree_roots:
2129fail_dev_root: 2445 free_root_pointers(fs_info, 1);
2130 free_extent_buffer(dev_root->node); 2446
2131 free_extent_buffer(dev_root->commit_root);
2132fail_extent_root:
2133 free_extent_buffer(extent_root->node);
2134 free_extent_buffer(extent_root->commit_root);
2135fail_tree_root:
2136 free_extent_buffer(tree_root->node);
2137 free_extent_buffer(tree_root->commit_root);
2138fail_chunk_root:
2139 free_extent_buffer(chunk_root->node);
2140 free_extent_buffer(chunk_root->commit_root);
2141fail_sb_buffer: 2447fail_sb_buffer:
2142 btrfs_stop_workers(&fs_info->generic_worker); 2448 btrfs_stop_workers(&fs_info->generic_worker);
2449 btrfs_stop_workers(&fs_info->readahead_workers);
2143 btrfs_stop_workers(&fs_info->fixup_workers); 2450 btrfs_stop_workers(&fs_info->fixup_workers);
2144 btrfs_stop_workers(&fs_info->delalloc_workers); 2451 btrfs_stop_workers(&fs_info->delalloc_workers);
2145 btrfs_stop_workers(&fs_info->workers); 2452 btrfs_stop_workers(&fs_info->workers);
@@ -2152,25 +2459,37 @@ fail_sb_buffer:
2152 btrfs_stop_workers(&fs_info->delayed_workers); 2459 btrfs_stop_workers(&fs_info->delayed_workers);
2153 btrfs_stop_workers(&fs_info->caching_workers); 2460 btrfs_stop_workers(&fs_info->caching_workers);
2154fail_alloc: 2461fail_alloc:
2155 kfree(fs_info->delayed_root);
2156fail_iput: 2462fail_iput:
2463 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2464
2157 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2465 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2158 iput(fs_info->btree_inode); 2466 iput(fs_info->btree_inode);
2159
2160 btrfs_close_devices(fs_info->fs_devices);
2161 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2162fail_bdi: 2467fail_bdi:
2163 bdi_destroy(&fs_info->bdi); 2468 bdi_destroy(&fs_info->bdi);
2164fail_srcu: 2469fail_srcu:
2165 cleanup_srcu_struct(&fs_info->subvol_srcu); 2470 cleanup_srcu_struct(&fs_info->subvol_srcu);
2166fail: 2471fail:
2167 kfree(extent_root); 2472 btrfs_close_devices(fs_info->fs_devices);
2168 kfree(tree_root); 2473 free_fs_info(fs_info);
2169 kfree(fs_info);
2170 kfree(chunk_root);
2171 kfree(dev_root);
2172 kfree(csum_root);
2173 return ERR_PTR(err); 2474 return ERR_PTR(err);
2475
2476recovery_tree_root:
2477 if (!btrfs_test_opt(tree_root, RECOVERY))
2478 goto fail_tree_roots;
2479
2480 free_root_pointers(fs_info, 0);
2481
2482 /* don't use the log in recovery mode, it won't be valid */
2483 btrfs_set_super_log_root(disk_super, 0);
2484
2485 /* we can't trust the free space cache either */
2486 btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2487
2488 ret = next_root_backup(fs_info, fs_info->super_copy,
2489 &num_backups_tried, &backup_index);
2490 if (ret == -1)
2491 goto fail_block_groups;
2492 goto retry_root_backup;
2174} 2493}
2175 2494
2176static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2495static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2254,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device,
2254 int errors = 0; 2573 int errors = 0;
2255 u32 crc; 2574 u32 crc;
2256 u64 bytenr; 2575 u64 bytenr;
2257 int last_barrier = 0;
2258 2576
2259 if (max_mirrors == 0) 2577 if (max_mirrors == 0)
2260 max_mirrors = BTRFS_SUPER_MIRROR_MAX; 2578 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
2261 2579
2262 /* make sure only the last submit_bh does a barrier */
2263 if (do_barriers) {
2264 for (i = 0; i < max_mirrors; i++) {
2265 bytenr = btrfs_sb_offset(i);
2266 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
2267 device->total_bytes)
2268 break;
2269 last_barrier = i;
2270 }
2271 }
2272
2273 for (i = 0; i < max_mirrors; i++) { 2580 for (i = 0; i < max_mirrors; i++) {
2274 bytenr = btrfs_sb_offset(i); 2581 bytenr = btrfs_sb_offset(i);
2275 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) 2582 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2315,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device,
2315 bh->b_end_io = btrfs_end_buffer_write_sync; 2622 bh->b_end_io = btrfs_end_buffer_write_sync;
2316 } 2623 }
2317 2624
2318 if (i == last_barrier && do_barriers) 2625 /*
2319 ret = submit_bh(WRITE_FLUSH_FUA, bh); 2626 * we fua the first super. The others we allow
 2320 else 2627 * to go down lazily.
2321 ret = submit_bh(WRITE_SYNC, bh); 2628 */
2322 2629 ret = submit_bh(WRITE_FUA, bh);
2323 if (ret) 2630 if (ret)
2324 errors++; 2631 errors++;
2325 } 2632 }
2326 return errors < i ? 0 : -1; 2633 return errors < i ? 0 : -1;
2327} 2634}
2328 2635
2636/*
2637 * endio for write_dev_flush; this will wake anyone waiting
2638 * for the barrier when it is done
2639 */
2640static void btrfs_end_empty_barrier(struct bio *bio, int err)
2641{
2642 if (err) {
2643 if (err == -EOPNOTSUPP)
2644 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2645 clear_bit(BIO_UPTODATE, &bio->bi_flags);
2646 }
2647 if (bio->bi_private)
2648 complete(bio->bi_private);
2649 bio_put(bio);
2650}
2651
2652/*
2653 * trigger flushes for one of the devices. If you pass wait == 0, the flushes are
2654 * sent down. With wait == 1, it waits for the previous flush.
2655 *
2656 * any device where the flush fails with eopnotsupp is flagged as not-barrier
2657 * capable
2658 */
2659static int write_dev_flush(struct btrfs_device *device, int wait)
2660{
2661 struct bio *bio;
2662 int ret = 0;
2663
2664 if (device->nobarriers)
2665 return 0;
2666
2667 if (wait) {
2668 bio = device->flush_bio;
2669 if (!bio)
2670 return 0;
2671
2672 wait_for_completion(&device->flush_wait);
2673
2674 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
2675 printk("btrfs: disabling barriers on dev %s\n",
2676 device->name);
2677 device->nobarriers = 1;
2678 }
2679 if (!bio_flagged(bio, BIO_UPTODATE)) {
2680 ret = -EIO;
2681 }
2682
2683 /* drop the reference from the wait == 0 run */
2684 bio_put(bio);
2685 device->flush_bio = NULL;
2686
2687 return ret;
2688 }
2689
2690 /*
2691 * one reference for us, and we leave it for the
2692 * caller
2693 */
2694 device->flush_bio = NULL;
2695 bio = bio_alloc(GFP_NOFS, 0);
2696 if (!bio)
2697 return -ENOMEM;
2698
2699 bio->bi_end_io = btrfs_end_empty_barrier;
2700 bio->bi_bdev = device->bdev;
2701 init_completion(&device->flush_wait);
2702 bio->bi_private = &device->flush_wait;
2703 device->flush_bio = bio;
2704
2705 bio_get(bio);
2706 submit_bio(WRITE_FLUSH, bio);
2707
2708 return 0;
2709}
2710
2711/*
2712 * send an empty flush down to each device in parallel,
2713 * then wait for them
2714 */
2715static int barrier_all_devices(struct btrfs_fs_info *info)
2716{
2717 struct list_head *head;
2718 struct btrfs_device *dev;
2719 int errors = 0;
2720 int ret;
2721
2722 /* send down all the barriers */
2723 head = &info->fs_devices->devices;
2724 list_for_each_entry_rcu(dev, head, dev_list) {
2725 if (!dev->bdev) {
2726 errors++;
2727 continue;
2728 }
2729 if (!dev->in_fs_metadata || !dev->writeable)
2730 continue;
2731
2732 ret = write_dev_flush(dev, 0);
2733 if (ret)
2734 errors++;
2735 }
2736
2737 /* wait for all the barriers */
2738 list_for_each_entry_rcu(dev, head, dev_list) {
2739 if (!dev->bdev) {
2740 errors++;
2741 continue;
2742 }
2743 if (!dev->in_fs_metadata || !dev->writeable)
2744 continue;
2745
2746 ret = write_dev_flush(dev, 1);
2747 if (ret)
2748 errors++;
2749 }
2750 if (errors)
2751 return -EIO;
2752 return 0;
2753}
2754
2329int write_all_supers(struct btrfs_root *root, int max_mirrors) 2755int write_all_supers(struct btrfs_root *root, int max_mirrors)
2330{ 2756{
2331 struct list_head *head; 2757 struct list_head *head;
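barrier_all_devices() above deliberately submits every flush before waiting on any of them, so the devices flush in parallel. For a single device the same two-call protocol of write_dev_flush() looks like this (a sketch only; the helper below is hypothetical and not part of the patch):

/* hypothetical helper: flush one device and wait for the result */
static int flush_one_device(struct btrfs_device *dev)
{
	int ret;

	ret = write_dev_flush(dev, 0);	/* wait == 0: submit the empty FLUSH bio */
	if (ret)
		return ret;

	return write_dev_flush(dev, 1);	/* wait == 1: wait, detect EOPNOTSUPP/EIO */
}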
@@ -2338,14 +2764,19 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2338 int total_errors = 0; 2764 int total_errors = 0;
2339 u64 flags; 2765 u64 flags;
2340 2766
2341 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 2767 max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
2342 do_barriers = !btrfs_test_opt(root, NOBARRIER); 2768 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2769 backup_super_roots(root->fs_info);
2343 2770
2344 sb = &root->fs_info->super_for_commit; 2771 sb = root->fs_info->super_for_commit;
2345 dev_item = &sb->dev_item; 2772 dev_item = &sb->dev_item;
2346 2773
2347 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2774 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2348 head = &root->fs_info->fs_devices->devices; 2775 head = &root->fs_info->fs_devices->devices;
2776
2777 if (do_barriers)
2778 barrier_all_devices(root->fs_info);
2779
2349 list_for_each_entry_rcu(dev, head, dev_list) { 2780 list_for_each_entry_rcu(dev, head, dev_list) {
2350 if (!dev->bdev) { 2781 if (!dev->bdev) {
2351 total_errors++; 2782 total_errors++;
@@ -2545,8 +2976,6 @@ int close_ctree(struct btrfs_root *root)
2545 /* clear out the rbtree of defraggable inodes */ 2976 /* clear out the rbtree of defraggable inodes */
2546 btrfs_run_defrag_inodes(root->fs_info); 2977 btrfs_run_defrag_inodes(root->fs_info);
2547 2978
2548 btrfs_put_block_group_cache(fs_info);
2549
2550 /* 2979 /*
2551 * Here come 2 situations when btrfs is broken to flip readonly: 2980 * Here come 2 situations when btrfs is broken to flip readonly:
2552 * 2981 *
@@ -2572,6 +3001,8 @@ int close_ctree(struct btrfs_root *root)
2572 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 3001 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2573 } 3002 }
2574 3003
3004 btrfs_put_block_group_cache(fs_info);
3005
2575 kthread_stop(root->fs_info->transaction_kthread); 3006 kthread_stop(root->fs_info->transaction_kthread);
2576 kthread_stop(root->fs_info->cleaner_kthread); 3007 kthread_stop(root->fs_info->cleaner_kthread);
2577 3008
@@ -2603,7 +3034,6 @@ int close_ctree(struct btrfs_root *root)
2603 del_fs_roots(fs_info); 3034 del_fs_roots(fs_info);
2604 3035
2605 iput(fs_info->btree_inode); 3036 iput(fs_info->btree_inode);
2606 kfree(fs_info->delayed_root);
2607 3037
2608 btrfs_stop_workers(&fs_info->generic_worker); 3038 btrfs_stop_workers(&fs_info->generic_worker);
2609 btrfs_stop_workers(&fs_info->fixup_workers); 3039 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2617,6 +3047,7 @@ int close_ctree(struct btrfs_root *root)
2617 btrfs_stop_workers(&fs_info->submit_workers); 3047 btrfs_stop_workers(&fs_info->submit_workers);
2618 btrfs_stop_workers(&fs_info->delayed_workers); 3048 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers); 3049 btrfs_stop_workers(&fs_info->caching_workers);
3050 btrfs_stop_workers(&fs_info->readahead_workers);
2620 3051
2621 btrfs_close_devices(fs_info->fs_devices); 3052 btrfs_close_devices(fs_info->fs_devices);
2622 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3053 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2624,12 +3055,7 @@ int close_ctree(struct btrfs_root *root)
2624 bdi_destroy(&fs_info->bdi); 3055 bdi_destroy(&fs_info->bdi);
2625 cleanup_srcu_struct(&fs_info->subvol_srcu); 3056 cleanup_srcu_struct(&fs_info->subvol_srcu);
2626 3057
2627 kfree(fs_info->extent_root); 3058 free_fs_info(fs_info);
2628 kfree(fs_info->tree_root);
2629 kfree(fs_info->chunk_root);
2630 kfree(fs_info->dev_root);
2631 kfree(fs_info->csum_root);
2632 kfree(fs_info);
2633 3059
2634 return 0; 3060 return 0;
2635} 3061}
@@ -2735,7 +3161,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2735 return ret; 3161 return ret;
2736} 3162}
2737 3163
2738int btree_lock_page_hook(struct page *page) 3164static int btree_lock_page_hook(struct page *page, void *data,
3165 void (*flush_fn)(void *))
2739{ 3166{
2740 struct inode *inode = page->mapping->host; 3167 struct inode *inode = page->mapping->host;
2741 struct btrfs_root *root = BTRFS_I(inode)->root; 3168 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +3179,10 @@ int btree_lock_page_hook(struct page *page)
2752 if (!eb) 3179 if (!eb)
2753 goto out; 3180 goto out;
2754 3181
2755 btrfs_tree_lock(eb); 3182 if (!btrfs_try_tree_write_lock(eb)) {
3183 flush_fn(data);
3184 btrfs_tree_lock(eb);
3185 }
2756 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3186 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2757 3187
2758 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3188 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +3197,10 @@ int btree_lock_page_hook(struct page *page)
2767 btrfs_tree_unlock(eb); 3197 btrfs_tree_unlock(eb);
2768 free_extent_buffer(eb); 3198 free_extent_buffer(eb);
2769out: 3199out:
2770 lock_page(page); 3200 if (!trylock_page(page)) {
3201 flush_fn(data);
3202 lock_page(page);
3203 }
2771 return 0; 3204 return 0;
2772} 3205}
2773 3206
@@ -3123,6 +3556,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3123static struct extent_io_ops btree_extent_io_ops = { 3556static struct extent_io_ops btree_extent_io_ops = {
3124 .write_cache_pages_lock_hook = btree_lock_page_hook, 3557 .write_cache_pages_lock_hook = btree_lock_page_hook,
3125 .readpage_end_io_hook = btree_readpage_end_io_hook, 3558 .readpage_end_io_hook = btree_readpage_end_io_hook,
3559 .readpage_io_failed_hook = btree_io_failed_hook,
3126 .submit_bio_hook = btree_submit_bio_hook, 3560 .submit_bio_hook = btree_submit_bio_hook,
3127 /* note we're sharing with inode.c for the merge bio hook */ 3561 /* note we're sharing with inode.c for the merge bio hook */
3128 .merge_bio_hook = btrfs_merge_bio_hook, 3562 .merge_bio_hook = btrfs_merge_bio_hook,
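next_root_backup() is built to be called in a loop, with num_backups_tried and backup_index fed back between calls; open_ctree()'s retry_root_backup path above is the real caller. A condensed sketch of that protocol, where read_roots is a hypothetical stand-in for re-reading the tree roots from the rewritten super:

/* illustrative only: walk the backup ring until a set of roots reads cleanly */
static int try_backup_roots(struct btrfs_fs_info *fs_info,
			    int (*read_roots)(struct btrfs_fs_info *))
{
	int num_backups_tried = 0;
	int backup_index = 0;

	while (next_root_backup(fs_info, fs_info->super_copy,
				&num_backups_tried, &backup_index) == 0) {
		/* super_copy now describes an older generation of roots */
		if (read_roots(fs_info) == 0)
			return 0;	/* success, stop here */
	}
	return -1;	/* -1 from next_root_backup: every backup was tried */
}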
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67f..c99d0a8f13fa 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid); 40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid); 42 u64 parent_transid);
43int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
44 int mirror_num, struct extent_buffer **eb);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 45struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
83 struct btrfs_fs_info *fs_info); 85 struct btrfs_fs_info *fs_info);
84int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
85 struct btrfs_root *root); 87 struct btrfs_root *root);
86int btree_lock_page_hook(struct page *page);
87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_init_lockdep(void); 90void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a2462f..930ae8949737 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h>
26#include "compat.h" 27#include "compat.h"
27#include "hash.h" 28#include "hash.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -52,6 +53,21 @@ enum {
52 CHUNK_ALLOC_LIMITED = 2, 53 CHUNK_ALLOC_LIMITED = 2,
53}; 54};
54 55
56/*
57 * Control how reservations are dealt with.
58 *
59 * RESERVE_FREE - freeing a reservation.
60 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
61 * ENOSPC accounting
62 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
63 * bytes_may_use as the ENOSPC accounting is done elsewhere
64 */
65enum {
66 RESERVE_FREE = 0,
67 RESERVE_ALLOC = 1,
68 RESERVE_ALLOC_NO_ACCOUNT = 2,
69};
70
55static int update_block_group(struct btrfs_trans_handle *trans, 71static int update_block_group(struct btrfs_trans_handle *trans,
56 struct btrfs_root *root, 72 struct btrfs_root *root,
57 u64 bytenr, u64 num_bytes, int alloc); 73 u64 bytenr, u64 num_bytes, int alloc);
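The enum above is consumed by btrfs_update_reserved_bytes() (forward-declared just below). A hedged sketch of the intended pairing, based only on the comment: reserve with RESERVE_ALLOC (or RESERVE_ALLOC_NO_ACCOUNT when the ENOSPC accounting already happened elsewhere) and hand the bytes back with RESERVE_FREE if the allocation is abandoned:

/* illustrative pairing only; this hypothetical caller is not part of the patch */
static int reserve_then_abandon(struct btrfs_block_group_cache *cache,
				u64 num_bytes)
{
	int ret;

	ret = btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC);
	if (ret)
		return ret;

	/* ... the allocation fell through for some other reason ... */
	btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_FREE);
	return 0;
}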
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
81 struct btrfs_key *key); 97 struct btrfs_key *key);
82static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 98static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
83 int dump_block_groups); 99 int dump_block_groups);
100static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
101 u64 num_bytes, int reserve);
84 102
85static noinline int 103static noinline int
86block_group_cache_done(struct btrfs_block_group_cache *cache) 104block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104 if (atomic_dec_and_test(&cache->count)) { 122 if (atomic_dec_and_test(&cache->count)) {
105 WARN_ON(cache->pinned > 0); 123 WARN_ON(cache->pinned > 0);
106 WARN_ON(cache->reserved > 0); 124 WARN_ON(cache->reserved > 0);
107 WARN_ON(cache->reserved_pinned > 0);
108 kfree(cache->free_space_ctl); 125 kfree(cache->free_space_ctl);
109 kfree(cache); 126 kfree(cache);
110 } 127 }
@@ -450,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
450 struct btrfs_root *root, 467 struct btrfs_root *root,
451 int load_cache_only) 468 int load_cache_only)
452{ 469{
470 DEFINE_WAIT(wait);
453 struct btrfs_fs_info *fs_info = cache->fs_info; 471 struct btrfs_fs_info *fs_info = cache->fs_info;
454 struct btrfs_caching_control *caching_ctl; 472 struct btrfs_caching_control *caching_ctl;
455 int ret = 0; 473 int ret = 0;
456 474
457 smp_mb(); 475 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
458 if (cache->cached != BTRFS_CACHE_NO) 476 BUG_ON(!caching_ctl);
477
478 INIT_LIST_HEAD(&caching_ctl->list);
479 mutex_init(&caching_ctl->mutex);
480 init_waitqueue_head(&caching_ctl->wait);
481 caching_ctl->block_group = cache;
482 caching_ctl->progress = cache->key.objectid;
483 atomic_set(&caching_ctl->count, 1);
484 caching_ctl->work.func = caching_thread;
485
486 spin_lock(&cache->lock);
487 /*
 488 * This should be a rare occasion, but it could happen in the
489 * case where one thread starts to load the space cache info, and then
490 * some other thread starts a transaction commit which tries to do an
491 * allocation while the other thread is still loading the space cache
492 * info. The previous loop should have kept us from choosing this block
493 * group, but if we've moved to the state where we will wait on caching
494 * block groups we need to first check if we're doing a fast load here,
495 * so we can wait for it to finish, otherwise we could end up allocating
 496 * from a block group whose cache gets evicted for one reason or
497 * another.
498 */
499 while (cache->cached == BTRFS_CACHE_FAST) {
500 struct btrfs_caching_control *ctl;
501
502 ctl = cache->caching_ctl;
503 atomic_inc(&ctl->count);
504 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
505 spin_unlock(&cache->lock);
506
507 schedule();
508
509 finish_wait(&ctl->wait, &wait);
510 put_caching_control(ctl);
511 spin_lock(&cache->lock);
512 }
513
514 if (cache->cached != BTRFS_CACHE_NO) {
515 spin_unlock(&cache->lock);
516 kfree(caching_ctl);
459 return 0; 517 return 0;
518 }
519 WARN_ON(cache->caching_ctl);
520 cache->caching_ctl = caching_ctl;
521 cache->cached = BTRFS_CACHE_FAST;
522 spin_unlock(&cache->lock);
460 523
461 /* 524 /*
462 * We can't do the read from on-disk cache during a commit since we need 525 * We can't do the read from on-disk cache during a commit since we need
@@ -465,57 +528,53 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
465 * we likely hold important locks. 528 * we likely hold important locks.
466 */ 529 */
467 if (trans && (!trans->transaction->in_commit) && 530 if (trans && (!trans->transaction->in_commit) &&
468 (root && root != root->fs_info->tree_root)) { 531 (root && root != root->fs_info->tree_root) &&
469 spin_lock(&cache->lock); 532 btrfs_test_opt(root, SPACE_CACHE)) {
470 if (cache->cached != BTRFS_CACHE_NO) {
471 spin_unlock(&cache->lock);
472 return 0;
473 }
474 cache->cached = BTRFS_CACHE_STARTED;
475 spin_unlock(&cache->lock);
476
477 ret = load_free_space_cache(fs_info, cache); 533 ret = load_free_space_cache(fs_info, cache);
478 534
479 spin_lock(&cache->lock); 535 spin_lock(&cache->lock);
480 if (ret == 1) { 536 if (ret == 1) {
537 cache->caching_ctl = NULL;
481 cache->cached = BTRFS_CACHE_FINISHED; 538 cache->cached = BTRFS_CACHE_FINISHED;
482 cache->last_byte_to_unpin = (u64)-1; 539 cache->last_byte_to_unpin = (u64)-1;
483 } else { 540 } else {
484 cache->cached = BTRFS_CACHE_NO; 541 if (load_cache_only) {
542 cache->caching_ctl = NULL;
543 cache->cached = BTRFS_CACHE_NO;
544 } else {
545 cache->cached = BTRFS_CACHE_STARTED;
546 }
485 } 547 }
486 spin_unlock(&cache->lock); 548 spin_unlock(&cache->lock);
549 wake_up(&caching_ctl->wait);
487 if (ret == 1) { 550 if (ret == 1) {
551 put_caching_control(caching_ctl);
488 free_excluded_extents(fs_info->extent_root, cache); 552 free_excluded_extents(fs_info->extent_root, cache);
489 return 0; 553 return 0;
490 } 554 }
555 } else {
556 /*
557 * We are not going to do the fast caching, set cached to the
 558 * appropriate value and wake up any waiters.
559 */
560 spin_lock(&cache->lock);
561 if (load_cache_only) {
562 cache->caching_ctl = NULL;
563 cache->cached = BTRFS_CACHE_NO;
564 } else {
565 cache->cached = BTRFS_CACHE_STARTED;
566 }
567 spin_unlock(&cache->lock);
568 wake_up(&caching_ctl->wait);
491 } 569 }
492 570
493 if (load_cache_only) 571 if (load_cache_only) {
494 return 0; 572 put_caching_control(caching_ctl);
495
496 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
497 BUG_ON(!caching_ctl);
498
499 INIT_LIST_HEAD(&caching_ctl->list);
500 mutex_init(&caching_ctl->mutex);
501 init_waitqueue_head(&caching_ctl->wait);
502 caching_ctl->block_group = cache;
503 caching_ctl->progress = cache->key.objectid;
504 /* one for caching kthread, one for caching block group list */
505 atomic_set(&caching_ctl->count, 2);
506 caching_ctl->work.func = caching_thread;
507
508 spin_lock(&cache->lock);
509 if (cache->cached != BTRFS_CACHE_NO) {
510 spin_unlock(&cache->lock);
511 kfree(caching_ctl);
512 return 0; 573 return 0;
513 } 574 }
514 cache->caching_ctl = caching_ctl;
515 cache->cached = BTRFS_CACHE_STARTED;
516 spin_unlock(&cache->lock);
517 575
518 down_write(&fs_info->extent_commit_sem); 576 down_write(&fs_info->extent_commit_sem);
577 atomic_inc(&caching_ctl->count);
519 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 578 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
520 up_write(&fs_info->extent_commit_sem); 579 up_write(&fs_info->extent_commit_sem);
521 580
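
The hunk above boils down to a small state decision once the fast (space-cache) load has been attempted. A minimal userspace sketch of just that decision, leaving out the locking, the wait queue and the caching_ctl refcounting, and using stand-in CACHE_* values rather than the kernel's BTRFS_CACHE_* constants:

#include <stdio.h>

/* Stand-ins for the block group cache states used above. */
enum cache_state { CACHE_NO, CACHE_STARTED, CACHE_FAST, CACHE_FINISHED };

/*
 * What the patched cache_block_group() settles on once the fast (on-disk)
 * free space cache load has been tried:
 *  - a successful load finishes caching on the spot;
 *  - a failed load either gives up (load_cache_only) or falls back to the
 *    slow caching thread (CACHE_STARTED).
 */
static enum cache_state next_state(int load_ok, int load_cache_only)
{
	if (load_ok)
		return CACHE_FINISHED;
	return load_cache_only ? CACHE_NO : CACHE_STARTED;
}

int main(void)
{
	static const char *names[] = { "NO", "STARTED", "FAST", "FINISHED" };
	int load_ok, load_only;

	for (load_ok = 0; load_ok <= 1; load_ok++)
		for (load_only = 0; load_only <= 1; load_only++)
			printf("load_ok=%d load_cache_only=%d -> CACHE_%s\n",
			       load_ok, load_only,
			       names[next_state(load_ok, load_only)]);
	return 0;
}

The no-fast-caching branch makes the same choice, which is why both arms end with a wake_up() on the caching control's wait queue.
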
@@ -1770,18 +1829,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1770{ 1829{
1771 int ret; 1830 int ret;
1772 u64 discarded_bytes = 0; 1831 u64 discarded_bytes = 0;
1773 struct btrfs_multi_bio *multi = NULL; 1832 struct btrfs_bio *bbio = NULL;
1774 1833
1775 1834
1776 /* Tell the block device(s) that the sectors can be discarded */ 1835 /* Tell the block device(s) that the sectors can be discarded */
1777 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1836 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1778 bytenr, &num_bytes, &multi, 0); 1837 bytenr, &num_bytes, &bbio, 0);
1779 if (!ret) { 1838 if (!ret) {
1780 struct btrfs_bio_stripe *stripe = multi->stripes; 1839 struct btrfs_bio_stripe *stripe = bbio->stripes;
1781 int i; 1840 int i;
1782 1841
1783 1842
1784 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1843 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1785 if (!stripe->dev->can_discard) 1844 if (!stripe->dev->can_discard)
1786 continue; 1845 continue;
1787 1846
@@ -1800,7 +1859,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1800 */ 1859 */
1801 ret = 0; 1860 ret = 0;
1802 } 1861 }
1803 kfree(multi); 1862 kfree(bbio);
1804 } 1863 }
1805 1864
1806 if (actual_bytes) 1865 if (actual_bytes)
@@ -2700,6 +2759,13 @@ again:
2700 goto again; 2759 goto again;
2701 } 2760 }
2702 2761
 2762 /* We've already set up this transaction, go ahead and exit */
2763 if (block_group->cache_generation == trans->transid &&
2764 i_size_read(inode)) {
2765 dcs = BTRFS_DC_SETUP;
2766 goto out_put;
2767 }
2768
2703 /* 2769 /*
2704 * We want to set the generation to 0, that way if anything goes wrong 2770 * We want to set the generation to 0, that way if anything goes wrong
2705 * from here on out we know not to trust this cache when we load up next 2771 * from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2815,15 @@ again:
2749 if (!ret) 2815 if (!ret)
2750 dcs = BTRFS_DC_SETUP; 2816 dcs = BTRFS_DC_SETUP;
2751 btrfs_free_reserved_data_space(inode, num_pages); 2817 btrfs_free_reserved_data_space(inode, num_pages);
2818
2752out_put: 2819out_put:
2753 iput(inode); 2820 iput(inode);
2754out_free: 2821out_free:
2755 btrfs_release_path(path); 2822 btrfs_release_path(path);
2756out: 2823out:
2757 spin_lock(&block_group->lock); 2824 spin_lock(&block_group->lock);
2825 if (!ret)
2826 block_group->cache_generation = trans->transid;
2758 block_group->disk_cache_state = dcs; 2827 block_group->disk_cache_state = dcs;
2759 spin_unlock(&block_group->lock); 2828 spin_unlock(&block_group->lock);
2760 2829
@@ -3122,16 +3191,13 @@ commit_trans:
3122 return -ENOSPC; 3191 return -ENOSPC;
3123 } 3192 }
3124 data_sinfo->bytes_may_use += bytes; 3193 data_sinfo->bytes_may_use += bytes;
3125 BTRFS_I(inode)->reserved_bytes += bytes;
3126 spin_unlock(&data_sinfo->lock); 3194 spin_unlock(&data_sinfo->lock);
3127 3195
3128 return 0; 3196 return 0;
3129} 3197}
3130 3198
3131/* 3199/*
3132 * called when we are clearing an delalloc extent from the 3200 * Called if we need to clear a data reservation for this inode.
3133 * inode's io_tree or there was an error for whatever reason
3134 * after calling btrfs_check_data_free_space
3135 */ 3201 */
3136void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3202void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3137{ 3203{
@@ -3144,7 +3210,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3144 data_sinfo = BTRFS_I(inode)->space_info; 3210 data_sinfo = BTRFS_I(inode)->space_info;
3145 spin_lock(&data_sinfo->lock); 3211 spin_lock(&data_sinfo->lock);
3146 data_sinfo->bytes_may_use -= bytes; 3212 data_sinfo->bytes_may_use -= bytes;
3147 BTRFS_I(inode)->reserved_bytes -= bytes;
3148 spin_unlock(&data_sinfo->lock); 3213 spin_unlock(&data_sinfo->lock);
3149} 3214}
3150 3215
@@ -3165,6 +3230,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3165 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3230 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3166 int force) 3231 int force)
3167{ 3232{
3233 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3168 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3234 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3169 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3235 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3170 u64 thresh; 3236 u64 thresh;
@@ -3173,11 +3239,18 @@ static int should_alloc_chunk(struct btrfs_root *root,
3173 return 1; 3239 return 1;
3174 3240
3175 /* 3241 /*
3242 * We need to take into account the global rsv because for all intents
3243 * and purposes it's used space. Don't worry about locking the
3244 * global_rsv, it doesn't change except when the transaction commits.
3245 */
3246 num_allocated += global_rsv->size;
3247
3248 /*
3176 * in limited mode, we want to have some free space up to 3249 * in limited mode, we want to have some free space up to
3177 * about 1% of the FS size. 3250 * about 1% of the FS size.
3178 */ 3251 */
3179 if (force == CHUNK_ALLOC_LIMITED) { 3252 if (force == CHUNK_ALLOC_LIMITED) {
3180 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3253 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3181 thresh = max_t(u64, 64 * 1024 * 1024, 3254 thresh = max_t(u64, 64 * 1024 * 1024,
3182 div_factor_fine(thresh, 1)); 3255 div_factor_fine(thresh, 1));
3183 3256
@@ -3199,7 +3272,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3199 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) 3272 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3200 return 0; 3273 return 0;
3201 3274
3202 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3275 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3203 3276
3204 /* 256MB or 5% of the FS */ 3277 /* 256MB or 5% of the FS */
3205 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3278 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
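
To get a feel for the thresholds in should_alloc_chunk(), the sketch below redoes the arithmetic in userspace. It assumes the btrfs helpers div_factor() and div_factor_fine() scale by factor/10 and factor/100 respectively; the filesystem and space_info sizes are made-up examples.

#include <stdio.h>
#include <stdint.h>

static uint64_t div_factor(uint64_t num, int factor)      { return num * factor / 10; }
static uint64_t div_factor_fine(uint64_t num, int factor) { return num * factor / 100; }

#define MAX_U64(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	uint64_t fs_bytes = 100ULL << 30;	/* hypothetical 100 GiB filesystem */
	uint64_t num_bytes = 10ULL << 30;	/* this space_info, minus readonly */
	uint64_t num_allocated = 7ULL << 30;	/* used + reserved (+ global rsv)  */
	uint64_t alloc_bytes = 1ULL << 30;

	/* CHUNK_ALLOC_LIMITED: keep about 1% of the FS (at least 64 MiB) free. */
	uint64_t limited = MAX_U64(64ULL << 20, div_factor_fine(fs_bytes, 1));

	/* Regular path: 256 MiB or 5% of the FS ... */
	uint64_t thresh = MAX_U64(256ULL << 20, div_factor_fine(fs_bytes, 5));

	/* ... and no new chunk below 80% utilisation of this space_info. */
	int alloc = !(num_allocated + alloc_bytes < div_factor(num_bytes, 8));

	printf("limited thresh = %llu MiB\n", (unsigned long long)(limited >> 20));
	printf("regular thresh = %llu MiB\n", (unsigned long long)(thresh >> 20));
	printf("80%% rule says allocate: %s\n", alloc ? "yes" : "no");
	return 0;
}

Adding global_rsv->size to num_allocated, as the new code does, simply means globally reserved metadata counts as used space for this decision.
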
@@ -3302,24 +3375,26 @@ out:
3302/* 3375/*
3303 * shrink metadata reservation for delalloc 3376 * shrink metadata reservation for delalloc
3304 */ 3377 */
3305static int shrink_delalloc(struct btrfs_trans_handle *trans, 3378static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3306 struct btrfs_root *root, u64 to_reclaim, int sync) 3379 bool wait_ordered)
3307{ 3380{
3308 struct btrfs_block_rsv *block_rsv; 3381 struct btrfs_block_rsv *block_rsv;
3309 struct btrfs_space_info *space_info; 3382 struct btrfs_space_info *space_info;
3383 struct btrfs_trans_handle *trans;
3310 u64 reserved; 3384 u64 reserved;
3311 u64 max_reclaim; 3385 u64 max_reclaim;
3312 u64 reclaimed = 0; 3386 u64 reclaimed = 0;
3313 long time_left; 3387 long time_left;
3314 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3388 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3315 int loops = 0; 3389 int loops = 0;
3316 unsigned long progress; 3390 unsigned long progress;
3317 3391
3392 trans = (struct btrfs_trans_handle *)current->journal_info;
3318 block_rsv = &root->fs_info->delalloc_block_rsv; 3393 block_rsv = &root->fs_info->delalloc_block_rsv;
3319 space_info = block_rsv->space_info; 3394 space_info = block_rsv->space_info;
3320 3395
3321 smp_mb(); 3396 smp_mb();
3322 reserved = space_info->bytes_reserved; 3397 reserved = space_info->bytes_may_use;
3323 progress = space_info->reservation_progress; 3398 progress = space_info->reservation_progress;
3324 3399
3325 if (reserved == 0) 3400 if (reserved == 0)
@@ -3334,18 +3409,20 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3334 } 3409 }
3335 3410
3336 max_reclaim = min(reserved, to_reclaim); 3411 max_reclaim = min(reserved, to_reclaim);
3337 3412 nr_pages = max_t(unsigned long, nr_pages,
3413 max_reclaim >> PAGE_CACHE_SHIFT);
3338 while (loops < 1024) { 3414 while (loops < 1024) {
3339 /* have the flusher threads jump in and do some IO */ 3415 /* have the flusher threads jump in and do some IO */
3340 smp_mb(); 3416 smp_mb();
3341 nr_pages = min_t(unsigned long, nr_pages, 3417 nr_pages = min_t(unsigned long, nr_pages,
3342 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); 3418 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3343 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3419 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3420 WB_REASON_FS_FREE_SPACE);
3344 3421
3345 spin_lock(&space_info->lock); 3422 spin_lock(&space_info->lock);
3346 if (reserved > space_info->bytes_reserved) 3423 if (reserved > space_info->bytes_may_use)
3347 reclaimed += reserved - space_info->bytes_reserved; 3424 reclaimed += reserved - space_info->bytes_may_use;
3348 reserved = space_info->bytes_reserved; 3425 reserved = space_info->bytes_may_use;
3349 spin_unlock(&space_info->lock); 3426 spin_unlock(&space_info->lock);
3350 3427
3351 loops++; 3428 loops++;
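
The nr_pages computation added above decides how much writeback to kick per loop iteration. A standalone model of just that arithmetic, assuming 4 KiB pages for PAGE_CACHE_SHIFT:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* assume 4 KiB pages */

/* How many pages shrink_delalloc() asks the flusher threads to write back. */
static unsigned long flush_pages(unsigned long long max_reclaim,
				 unsigned long long delalloc_bytes)
{
	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;

	/* at least enough pages to cover what we want to reclaim ... */
	if (max_reclaim >> PAGE_CACHE_SHIFT > nr_pages)
		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;

	/* ... but never more than the dirty delalloc bytes actually present */
	if (delalloc_bytes >> PAGE_CACHE_SHIFT < nr_pages)
		nr_pages = delalloc_bytes >> PAGE_CACHE_SHIFT;

	return nr_pages;
}

int main(void)
{
	printf("%lu pages\n", flush_pages(8ULL << 20, 64ULL << 20)); /* 2048 */
	printf("%lu pages\n", flush_pages(1ULL << 20, 64ULL << 20)); /*  512 */
	printf("%lu pages\n", flush_pages(8ULL << 20, 1ULL << 20));  /*  256 */
	return 0;
}
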
@@ -3356,11 +3433,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 if (trans && trans->transaction->blocked) 3433 if (trans && trans->transaction->blocked)
3357 return -EAGAIN; 3434 return -EAGAIN;
3358 3435
3359 time_left = schedule_timeout_interruptible(1); 3436 if (wait_ordered && !trans) {
3437 btrfs_wait_ordered_extents(root, 0, 0);
3438 } else {
3439 time_left = schedule_timeout_interruptible(1);
3360 3440
3361 /* We were interrupted, exit */ 3441 /* We were interrupted, exit */
3362 if (time_left) 3442 if (time_left)
3363 break; 3443 break;
3444 }
3364 3445
3365 /* we've kicked the IO a few times, if anything has been freed, 3446 /* we've kicked the IO a few times, if anything has been freed,
3366 * exit. There is no sense in looping here for a long time 3447 * exit. There is no sense in looping here for a long time
@@ -3375,34 +3456,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3375 } 3456 }
3376 3457
3377 } 3458 }
3378 if (reclaimed >= to_reclaim && !trans) 3459
3379 btrfs_wait_ordered_extents(root, 0, 0);
3380 return reclaimed >= to_reclaim; 3460 return reclaimed >= to_reclaim;
3381} 3461}
3382 3462
3383/* 3463/**
 3384 * Retries tells us how many times we've called reserve_metadata_bytes. The 3464 * may_commit_transaction - possibly commit the transaction if it's ok to
3385 * idea is if this is the first call (retries == 0) then we will add to our 3465 * @root - the root we're allocating for
3386 * reserved count if we can't make the allocation in order to hold our place 3466 * @bytes - the number of bytes we want to reserve
3387 * while we go and try and free up space. That way for retries > 1 we don't try 3467 * @force - force the commit
3388 * and add space, we just check to see if the amount of unused space is >= the
3389 * total space, meaning that our reservation is valid.
3390 * 3468 *
3391 * However if we don't intend to retry this reservation, pass -1 as retries so 3469 * This will check to make sure that committing the transaction will actually
3392 * that it short circuits this logic. 3470 * get us somewhere and then commit the transaction if it does. Otherwise it
3471 * will return -ENOSPC.
3393 */ 3472 */
3394static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, 3473static int may_commit_transaction(struct btrfs_root *root,
3395 struct btrfs_root *root, 3474 struct btrfs_space_info *space_info,
3475 u64 bytes, int force)
3476{
3477 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3478 struct btrfs_trans_handle *trans;
3479
3480 trans = (struct btrfs_trans_handle *)current->journal_info;
3481 if (trans)
3482 return -EAGAIN;
3483
3484 if (force)
3485 goto commit;
3486
3487 /* See if there is enough pinned space to make this reservation */
3488 spin_lock(&space_info->lock);
3489 if (space_info->bytes_pinned >= bytes) {
3490 spin_unlock(&space_info->lock);
3491 goto commit;
3492 }
3493 spin_unlock(&space_info->lock);
3494
3495 /*
3496 * See if there is some space in the delayed insertion reservation for
3497 * this reservation.
3498 */
3499 if (space_info != delayed_rsv->space_info)
3500 return -ENOSPC;
3501
3502 spin_lock(&delayed_rsv->lock);
3503 if (delayed_rsv->size < bytes) {
3504 spin_unlock(&delayed_rsv->lock);
3505 return -ENOSPC;
3506 }
3507 spin_unlock(&delayed_rsv->lock);
3508
3509commit:
3510 trans = btrfs_join_transaction(root);
3511 if (IS_ERR(trans))
3512 return -ENOSPC;
3513
3514 return btrfs_commit_transaction(trans, root);
3515}
3516
3517/**
3518 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3519 * @root - the root we're allocating for
3520 * @block_rsv - the block_rsv we're allocating for
3521 * @orig_bytes - the number of bytes we want
 3522 * @flush - whether or not we can flush to make our reservation
3523 *
 3524 * This will reserve orig_bytes number of bytes from the space info associated
3525 * with the block_rsv. If there is not enough space it will make an attempt to
3526 * flush out space to make room. It will do this by flushing delalloc if
3527 * possible or committing the transaction. If flush is 0 then no attempts to
3528 * regain reservations will be made and this will fail if there is not enough
3529 * space already.
3530 */
3531static int reserve_metadata_bytes(struct btrfs_root *root,
3396 struct btrfs_block_rsv *block_rsv, 3532 struct btrfs_block_rsv *block_rsv,
3397 u64 orig_bytes, int flush) 3533 u64 orig_bytes, int flush)
3398{ 3534{
3399 struct btrfs_space_info *space_info = block_rsv->space_info; 3535 struct btrfs_space_info *space_info = block_rsv->space_info;
3400 u64 unused; 3536 u64 used;
3401 u64 num_bytes = orig_bytes; 3537 u64 num_bytes = orig_bytes;
3402 int retries = 0; 3538 int retries = 0;
3403 int ret = 0; 3539 int ret = 0;
3404 bool committed = false; 3540 bool committed = false;
3405 bool flushing = false; 3541 bool flushing = false;
3542 bool wait_ordered = false;
3406 3543
3407again: 3544again:
3408 ret = 0; 3545 ret = 0;
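
Stripped of the transaction join and the locking, may_commit_transaction() is a yes/no test on whether a commit can actually produce the bytes being asked for. A userspace sketch of that test (the early -EAGAIN for a task that already has a transaction attached is not modelled):

#include <stdio.h>
#include <stdint.h>

/*
 * Committing is only worthwhile if it will release enough space: either
 * enough pinned bytes in this space_info, or enough bytes sitting in the
 * delayed-item reservation when that rsv shares the same space_info.
 */
static int worth_committing(uint64_t bytes_needed, uint64_t bytes_pinned,
			    int same_space_info, uint64_t delayed_rsv_size,
			    int force)
{
	if (force)
		return 1;
	if (bytes_pinned >= bytes_needed)
		return 1;
	if (same_space_info && delayed_rsv_size >= bytes_needed)
		return 1;
	return 0;	/* would just burn time, so -ENOSPC instead */
}

int main(void)
{
	printf("%d\n", worth_committing(1 << 20, 4 << 20, 0, 0, 0));        /* 1 */
	printf("%d\n", worth_committing(8 << 20, 1 << 20, 1, 16 << 20, 0)); /* 1 */
	printf("%d\n", worth_committing(8 << 20, 1 << 20, 0, 16 << 20, 0)); /* 0 */
	return 0;
}
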
@@ -3419,7 +3556,7 @@ again:
3419 * deadlock since we are waiting for the flusher to finish, but 3556 * deadlock since we are waiting for the flusher to finish, but
3420 * hold the current transaction open. 3557 * hold the current transaction open.
3421 */ 3558 */
3422 if (trans) 3559 if (current->journal_info)
3423 return -EAGAIN; 3560 return -EAGAIN;
3424 ret = wait_event_interruptible(space_info->wait, 3561 ret = wait_event_interruptible(space_info->wait,
3425 !space_info->flush); 3562 !space_info->flush);
@@ -3431,9 +3568,9 @@ again:
3431 } 3568 }
3432 3569
3433 ret = -ENOSPC; 3570 ret = -ENOSPC;
3434 unused = space_info->bytes_used + space_info->bytes_reserved + 3571 used = space_info->bytes_used + space_info->bytes_reserved +
3435 space_info->bytes_pinned + space_info->bytes_readonly + 3572 space_info->bytes_pinned + space_info->bytes_readonly +
3436 space_info->bytes_may_use; 3573 space_info->bytes_may_use;
3437 3574
3438 /* 3575 /*
3439 * The idea here is that we've not already over-reserved the block group 3576 * The idea here is that we've not already over-reserved the block group
@@ -3442,10 +3579,9 @@ again:
3442 * lets start flushing stuff first and then come back and try to make 3579 * lets start flushing stuff first and then come back and try to make
3443 * our reservation. 3580 * our reservation.
3444 */ 3581 */
3445 if (unused <= space_info->total_bytes) { 3582 if (used <= space_info->total_bytes) {
3446 unused = space_info->total_bytes - unused; 3583 if (used + orig_bytes <= space_info->total_bytes) {
3447 if (unused >= num_bytes) { 3584 space_info->bytes_may_use += orig_bytes;
3448 space_info->bytes_reserved += orig_bytes;
3449 ret = 0; 3585 ret = 0;
3450 } else { 3586 } else {
3451 /* 3587 /*
@@ -3461,10 +3597,64 @@ again:
3461 * amount plus the amount of bytes that we need for this 3597 * amount plus the amount of bytes that we need for this
3462 * reservation. 3598 * reservation.
3463 */ 3599 */
3464 num_bytes = unused - space_info->total_bytes + 3600 wait_ordered = true;
3601 num_bytes = used - space_info->total_bytes +
3465 (orig_bytes * (retries + 1)); 3602 (orig_bytes * (retries + 1));
3466 } 3603 }
3467 3604
3605 if (ret) {
3606 u64 profile = btrfs_get_alloc_profile(root, 0);
3607 u64 avail;
3608
3609 /*
3610 * If we have a lot of space that's pinned, don't bother doing
3611 * the overcommit dance yet and just commit the transaction.
3612 */
3613 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3614 do_div(avail, 10);
3615 if (space_info->bytes_pinned >= avail && flush && !committed) {
3616 space_info->flush = 1;
3617 flushing = true;
3618 spin_unlock(&space_info->lock);
3619 ret = may_commit_transaction(root, space_info,
3620 orig_bytes, 1);
3621 if (ret)
3622 goto out;
3623 committed = true;
3624 goto again;
3625 }
3626
3627 spin_lock(&root->fs_info->free_chunk_lock);
3628 avail = root->fs_info->free_chunk_space;
3629
3630 /*
3631 * If we have dup, raid1 or raid10 then only half of the free
3632 * space is actually useable.
3633 */
3634 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3635 BTRFS_BLOCK_GROUP_RAID1 |
3636 BTRFS_BLOCK_GROUP_RAID10))
3637 avail >>= 1;
3638
3639 /*
3640 * If we aren't flushing don't let us overcommit too much, say
3641 * 1/8th of the space. If we can flush, let it overcommit up to
3642 * 1/2 of the space.
3643 */
3644 if (flush)
3645 avail >>= 3;
3646 else
3647 avail >>= 1;
3648 spin_unlock(&root->fs_info->free_chunk_lock);
3649
3650 if (used + num_bytes < space_info->total_bytes + avail) {
3651 space_info->bytes_may_use += orig_bytes;
3652 ret = 0;
3653 } else {
3654 wait_ordered = true;
3655 }
3656 }
3657
3468 /* 3658 /*
3469 * Couldn't make our reservation, save our place so while we're trying 3659 * Couldn't make our reservation, save our place so while we're trying
3470 * to reclaim space we can actually use it instead of somebody else 3660 * to reclaim space we can actually use it instead of somebody else
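
The overcommit calculation in the hunk above can be exercised in isolation. The sketch below follows the code as written (halve the free chunk space for DUP/RAID1/RAID10, then shift by 3 when flushing is allowed and by 1 when it is not), with stand-in profile bits instead of the real BTRFS_BLOCK_GROUP_* flags:

#include <stdio.h>
#include <stdint.h>

#define BG_DUP    (1 << 0)
#define BG_RAID1  (1 << 1)
#define BG_RAID10 (1 << 2)

/* Unallocated chunk space that reserve_metadata_bytes() will overcommit against. */
static uint64_t overcommit_avail(uint64_t free_chunk_space, uint64_t profile,
				 int flush)
{
	uint64_t avail = free_chunk_space;

	/* mirrored profiles only give us half of the raw free space */
	if (profile & (BG_DUP | BG_RAID1 | BG_RAID10))
		avail >>= 1;

	if (flush)
		avail >>= 3;
	else
		avail >>= 1;

	return avail;
}

int main(void)
{
	uint64_t free_space = 8ULL << 30;	/* 8 GiB of unallocated chunk space */

	printf("single, flush:   %llu MiB\n",
	       (unsigned long long)(overcommit_avail(free_space, 0, 1) >> 20));
	printf("raid1,  noflush: %llu MiB\n",
	       (unsigned long long)(overcommit_avail(free_space, BG_RAID1, 0) >> 20));
	return 0;
}

The reservation then succeeds without flushing whenever used + num_bytes stays below total_bytes + avail.
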
@@ -3484,7 +3674,7 @@ again:
3484 * We do synchronous shrinking since we don't actually unreserve 3674 * We do synchronous shrinking since we don't actually unreserve
3485 * metadata until after the IO is completed. 3675 * metadata until after the IO is completed.
3486 */ 3676 */
3487 ret = shrink_delalloc(trans, root, num_bytes, 1); 3677 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3488 if (ret < 0) 3678 if (ret < 0)
3489 goto out; 3679 goto out;
3490 3680
@@ -3496,35 +3686,17 @@ again:
3496 * so go back around and try again. 3686 * so go back around and try again.
3497 */ 3687 */
3498 if (retries < 2) { 3688 if (retries < 2) {
3689 wait_ordered = true;
3499 retries++; 3690 retries++;
3500 goto again; 3691 goto again;
3501 } 3692 }
3502 3693
3503 /*
3504 * Not enough space to be reclaimed, don't bother committing the
3505 * transaction.
3506 */
3507 spin_lock(&space_info->lock);
3508 if (space_info->bytes_pinned < orig_bytes)
3509 ret = -ENOSPC;
3510 spin_unlock(&space_info->lock);
3511 if (ret)
3512 goto out;
3513
3514 ret = -EAGAIN;
3515 if (trans)
3516 goto out;
3517
3518 ret = -ENOSPC; 3694 ret = -ENOSPC;
3519 if (committed) 3695 if (committed)
3520 goto out; 3696 goto out;
3521 3697
3522 trans = btrfs_join_transaction(root); 3698 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3523 if (IS_ERR(trans))
3524 goto out;
3525 ret = btrfs_commit_transaction(trans, root);
3526 if (!ret) { 3699 if (!ret) {
3527 trans = NULL;
3528 committed = true; 3700 committed = true;
3529 goto again; 3701 goto again;
3530 } 3702 }
@@ -3542,10 +3714,12 @@ out:
3542static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, 3714static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3543 struct btrfs_root *root) 3715 struct btrfs_root *root)
3544{ 3716{
3545 struct btrfs_block_rsv *block_rsv; 3717 struct btrfs_block_rsv *block_rsv = NULL;
3546 if (root->ref_cows) 3718
3719 if (root->ref_cows || root == root->fs_info->csum_root)
3547 block_rsv = trans->block_rsv; 3720 block_rsv = trans->block_rsv;
3548 else 3721
3722 if (!block_rsv)
3549 block_rsv = root->block_rsv; 3723 block_rsv = root->block_rsv;
3550 3724
3551 if (!block_rsv) 3725 if (!block_rsv)
@@ -3616,7 +3790,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3616 } 3790 }
3617 if (num_bytes) { 3791 if (num_bytes) {
3618 spin_lock(&space_info->lock); 3792 spin_lock(&space_info->lock);
3619 space_info->bytes_reserved -= num_bytes; 3793 space_info->bytes_may_use -= num_bytes;
3620 space_info->reservation_progress++; 3794 space_info->reservation_progress++;
3621 spin_unlock(&space_info->lock); 3795 spin_unlock(&space_info->lock);
3622 } 3796 }
@@ -3640,9 +3814,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3640{ 3814{
3641 memset(rsv, 0, sizeof(*rsv)); 3815 memset(rsv, 0, sizeof(*rsv));
3642 spin_lock_init(&rsv->lock); 3816 spin_lock_init(&rsv->lock);
3643 atomic_set(&rsv->usage, 1);
3644 rsv->priority = 6;
3645 INIT_LIST_HEAD(&rsv->list);
3646} 3817}
3647 3818
3648struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 3819struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3663,38 +3834,20 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3663void btrfs_free_block_rsv(struct btrfs_root *root, 3834void btrfs_free_block_rsv(struct btrfs_root *root,
3664 struct btrfs_block_rsv *rsv) 3835 struct btrfs_block_rsv *rsv)
3665{ 3836{
3666 if (rsv && atomic_dec_and_test(&rsv->usage)) { 3837 btrfs_block_rsv_release(root, rsv, (u64)-1);
3667 btrfs_block_rsv_release(root, rsv, (u64)-1); 3838 kfree(rsv);
3668 if (!rsv->durable)
3669 kfree(rsv);
3670 }
3671}
3672
3673/*
3674 * make the block_rsv struct be able to capture freed space.
3675 * the captured space will re-add to the the block_rsv struct
3676 * after transaction commit
3677 */
3678void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3679 struct btrfs_block_rsv *block_rsv)
3680{
3681 block_rsv->durable = 1;
3682 mutex_lock(&fs_info->durable_block_rsv_mutex);
3683 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3684 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3685} 3839}
3686 3840
3687int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3841static inline int __block_rsv_add(struct btrfs_root *root,
3688 struct btrfs_root *root, 3842 struct btrfs_block_rsv *block_rsv,
3689 struct btrfs_block_rsv *block_rsv, 3843 u64 num_bytes, int flush)
3690 u64 num_bytes)
3691{ 3844{
3692 int ret; 3845 int ret;
3693 3846
3694 if (num_bytes == 0) 3847 if (num_bytes == 0)
3695 return 0; 3848 return 0;
3696 3849
3697 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); 3850 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3698 if (!ret) { 3851 if (!ret) {
3699 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3852 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3700 return 0; 3853 return 0;
@@ -3703,55 +3856,66 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3703 return ret; 3856 return ret;
3704} 3857}
3705 3858
3706int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 3859int btrfs_block_rsv_add(struct btrfs_root *root,
3707 struct btrfs_root *root, 3860 struct btrfs_block_rsv *block_rsv,
3708 struct btrfs_block_rsv *block_rsv, 3861 u64 num_bytes)
3709 u64 min_reserved, int min_factor) 3862{
3863 return __block_rsv_add(root, block_rsv, num_bytes, 1);
3864}
3865
3866int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3867 struct btrfs_block_rsv *block_rsv,
3868 u64 num_bytes)
3869{
3870 return __block_rsv_add(root, block_rsv, num_bytes, 0);
3871}
3872
3873int btrfs_block_rsv_check(struct btrfs_root *root,
3874 struct btrfs_block_rsv *block_rsv, int min_factor)
3710{ 3875{
3711 u64 num_bytes = 0; 3876 u64 num_bytes = 0;
3712 int commit_trans = 0;
3713 int ret = -ENOSPC; 3877 int ret = -ENOSPC;
3714 3878
3715 if (!block_rsv) 3879 if (!block_rsv)
3716 return 0; 3880 return 0;
3717 3881
3718 spin_lock(&block_rsv->lock); 3882 spin_lock(&block_rsv->lock);
3719 if (min_factor > 0) 3883 num_bytes = div_factor(block_rsv->size, min_factor);
3720 num_bytes = div_factor(block_rsv->size, min_factor); 3884 if (block_rsv->reserved >= num_bytes)
3721 if (min_reserved > num_bytes) 3885 ret = 0;
3722 num_bytes = min_reserved; 3886 spin_unlock(&block_rsv->lock);
3723 3887
3724 if (block_rsv->reserved >= num_bytes) { 3888 return ret;
3889}
3890
3891int btrfs_block_rsv_refill(struct btrfs_root *root,
3892 struct btrfs_block_rsv *block_rsv,
3893 u64 min_reserved)
3894{
3895 u64 num_bytes = 0;
3896 int ret = -ENOSPC;
3897
3898 if (!block_rsv)
3899 return 0;
3900
3901 spin_lock(&block_rsv->lock);
3902 num_bytes = min_reserved;
3903 if (block_rsv->reserved >= num_bytes)
3725 ret = 0; 3904 ret = 0;
3726 } else { 3905 else
3727 num_bytes -= block_rsv->reserved; 3906 num_bytes -= block_rsv->reserved;
3728 if (block_rsv->durable &&
3729 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3730 commit_trans = 1;
3731 }
3732 spin_unlock(&block_rsv->lock); 3907 spin_unlock(&block_rsv->lock);
3908
3733 if (!ret) 3909 if (!ret)
3734 return 0; 3910 return 0;
3735 3911
3736 if (block_rsv->refill_used) { 3912 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
3737 ret = reserve_metadata_bytes(trans, root, block_rsv, 3913 if (!ret) {
3738 num_bytes, 0); 3914 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3739 if (!ret) {
3740 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3741 return 0;
3742 }
3743 }
3744
3745 if (commit_trans) {
3746 if (trans)
3747 return -EAGAIN;
3748 trans = btrfs_join_transaction(root);
3749 BUG_ON(IS_ERR(trans));
3750 ret = btrfs_commit_transaction(trans, root);
3751 return 0; 3915 return 0;
3752 } 3916 }
3753 3917
3754 return -ENOSPC; 3918 return ret;
3755} 3919}
3756 3920
3757int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3921int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
@@ -3783,7 +3947,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3783 u64 num_bytes; 3947 u64 num_bytes;
3784 u64 meta_used; 3948 u64 meta_used;
3785 u64 data_used; 3949 u64 data_used;
3786 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3950 int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3787 3951
3788 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3952 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3789 spin_lock(&sinfo->lock); 3953 spin_lock(&sinfo->lock);
@@ -3827,12 +3991,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3827 if (sinfo->total_bytes > num_bytes) { 3991 if (sinfo->total_bytes > num_bytes) {
3828 num_bytes = sinfo->total_bytes - num_bytes; 3992 num_bytes = sinfo->total_bytes - num_bytes;
3829 block_rsv->reserved += num_bytes; 3993 block_rsv->reserved += num_bytes;
3830 sinfo->bytes_reserved += num_bytes; 3994 sinfo->bytes_may_use += num_bytes;
3831 } 3995 }
3832 3996
3833 if (block_rsv->reserved >= block_rsv->size) { 3997 if (block_rsv->reserved >= block_rsv->size) {
3834 num_bytes = block_rsv->reserved - block_rsv->size; 3998 num_bytes = block_rsv->reserved - block_rsv->size;
3835 sinfo->bytes_reserved -= num_bytes; 3999 sinfo->bytes_may_use -= num_bytes;
3836 sinfo->reservation_progress++; 4000 sinfo->reservation_progress++;
3837 block_rsv->reserved = block_rsv->size; 4001 block_rsv->reserved = block_rsv->size;
3838 block_rsv->full = 1; 4002 block_rsv->full = 1;
@@ -3848,16 +4012,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3848 4012
3849 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4013 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3850 fs_info->chunk_block_rsv.space_info = space_info; 4014 fs_info->chunk_block_rsv.space_info = space_info;
3851 fs_info->chunk_block_rsv.priority = 10;
3852 4015
3853 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4016 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3854 fs_info->global_block_rsv.space_info = space_info; 4017 fs_info->global_block_rsv.space_info = space_info;
3855 fs_info->global_block_rsv.priority = 10;
3856 fs_info->global_block_rsv.refill_used = 1;
3857 fs_info->delalloc_block_rsv.space_info = space_info; 4018 fs_info->delalloc_block_rsv.space_info = space_info;
3858 fs_info->trans_block_rsv.space_info = space_info; 4019 fs_info->trans_block_rsv.space_info = space_info;
3859 fs_info->empty_block_rsv.space_info = space_info; 4020 fs_info->empty_block_rsv.space_info = space_info;
3860 fs_info->empty_block_rsv.priority = 10; 4021 fs_info->delayed_block_rsv.space_info = space_info;
3861 4022
3862 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4023 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3863 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4024 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3865,10 +4026,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3865 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 4026 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3866 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4027 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3867 4028
3868 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3869
3870 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3871
3872 update_global_block_rsv(fs_info); 4029 update_global_block_rsv(fs_info);
3873} 4030}
3874 4031
@@ -3881,37 +4038,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3881 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4038 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3882 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4039 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3883 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4040 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3884} 4041 WARN_ON(fs_info->delayed_block_rsv.size > 0);
3885 4042 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
3886int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3887 struct btrfs_root *root,
3888 struct btrfs_block_rsv *rsv)
3889{
3890 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3891 u64 num_bytes;
3892 int ret;
3893
3894 /*
3895 * Truncate should be freeing data, but give us 2 items just in case it
3896 * needs to use some space. We may want to be smarter about this in the
3897 * future.
3898 */
3899 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3900
3901 /* We already have enough bytes, just return */
3902 if (rsv->reserved >= num_bytes)
3903 return 0;
3904
3905 num_bytes -= rsv->reserved;
3906
3907 /*
3908 * You should have reserved enough space before hand to do this, so this
3909 * should not fail.
3910 */
3911 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3912 BUG_ON(ret);
3913
3914 return 0;
3915} 4043}
3916 4044
3917void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4045void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3920,9 +4048,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3920 if (!trans->bytes_reserved) 4048 if (!trans->bytes_reserved)
3921 return; 4049 return;
3922 4050
3923 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); 4051 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
3924 btrfs_block_rsv_release(root, trans->block_rsv,
3925 trans->bytes_reserved);
3926 trans->bytes_reserved = 0; 4052 trans->bytes_reserved = 0;
3927} 4053}
3928 4054
@@ -3964,33 +4090,99 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3964 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4090 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3965} 4091}
3966 4092
4093/**
4094 * drop_outstanding_extent - drop an outstanding extent
4095 * @inode: the inode we're dropping the extent for
4096 *
 4097 * This is called when we are freeing up an outstanding extent, either
4098 * after an error or after an extent is written. This will return the number of
4099 * reserved extents that need to be freed. This must be called with
4100 * BTRFS_I(inode)->lock held.
4101 */
3967static unsigned drop_outstanding_extent(struct inode *inode) 4102static unsigned drop_outstanding_extent(struct inode *inode)
3968{ 4103{
4104 unsigned drop_inode_space = 0;
3969 unsigned dropped_extents = 0; 4105 unsigned dropped_extents = 0;
3970 4106
3971 spin_lock(&BTRFS_I(inode)->lock);
3972 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4107 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3973 BTRFS_I(inode)->outstanding_extents--; 4108 BTRFS_I(inode)->outstanding_extents--;
3974 4109
4110 if (BTRFS_I(inode)->outstanding_extents == 0 &&
4111 BTRFS_I(inode)->delalloc_meta_reserved) {
4112 drop_inode_space = 1;
4113 BTRFS_I(inode)->delalloc_meta_reserved = 0;
4114 }
4115
3975 /* 4116 /*
 3976 * If we have more or the same amount of outstanding extents than we have 4117 * If we have more or the same amount of outstanding extents than we have
3977 * reserved then we need to leave the reserved extents count alone. 4118 * reserved then we need to leave the reserved extents count alone.
3978 */ 4119 */
3979 if (BTRFS_I(inode)->outstanding_extents >= 4120 if (BTRFS_I(inode)->outstanding_extents >=
3980 BTRFS_I(inode)->reserved_extents) 4121 BTRFS_I(inode)->reserved_extents)
3981 goto out; 4122 return drop_inode_space;
3982 4123
3983 dropped_extents = BTRFS_I(inode)->reserved_extents - 4124 dropped_extents = BTRFS_I(inode)->reserved_extents -
3984 BTRFS_I(inode)->outstanding_extents; 4125 BTRFS_I(inode)->outstanding_extents;
3985 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4126 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3986out: 4127 return dropped_extents + drop_inode_space;
3987 spin_unlock(&BTRFS_I(inode)->lock);
3988 return dropped_extents;
3989} 4128}
3990 4129
3991static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 4130/**
 4131 * calc_csum_metadata_size - return the amount of metadata space that must be
 4132 * reserved/freed for the given bytes.
4133 * @inode: the inode we're manipulating
4134 * @num_bytes: the number of bytes in question
4135 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4136 *
4137 * This adjusts the number of csum_bytes in the inode and then returns the
4138 * correct amount of metadata that must either be reserved or freed. We
4139 * calculate how many checksums we can fit into one leaf and then divide the
 4140 * number of bytes that will need to be checksummed by this value to figure out
4141 * how many checksums will be required. If we are adding bytes then the number
4142 * may go up and we will return the number of additional bytes that must be
4143 * reserved. If it is going down we will return the number of bytes that must
4144 * be freed.
4145 *
4146 * This must be called with BTRFS_I(inode)->lock held.
4147 */
4148static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4149 int reserve)
3992{ 4150{
3993 return num_bytes >>= 3; 4151 struct btrfs_root *root = BTRFS_I(inode)->root;
4152 u64 csum_size;
4153 int num_csums_per_leaf;
4154 int num_csums;
4155 int old_csums;
4156
4157 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4158 BTRFS_I(inode)->csum_bytes == 0)
4159 return 0;
4160
4161 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4162 if (reserve)
4163 BTRFS_I(inode)->csum_bytes += num_bytes;
4164 else
4165 BTRFS_I(inode)->csum_bytes -= num_bytes;
4166 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4167 num_csums_per_leaf = (int)div64_u64(csum_size,
4168 sizeof(struct btrfs_csum_item) +
4169 sizeof(struct btrfs_disk_key));
4170 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4171 num_csums = num_csums + num_csums_per_leaf - 1;
4172 num_csums = num_csums / num_csums_per_leaf;
4173
4174 old_csums = old_csums + num_csums_per_leaf - 1;
4175 old_csums = old_csums / num_csums_per_leaf;
4176
4177 /* No change, no need to reserve more */
4178 if (old_csums == num_csums)
4179 return 0;
4180
4181 if (reserve)
4182 return btrfs_calc_trans_metadata_size(root,
4183 num_csums - old_csums);
4184
4185 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
3994} 4186}
3995 4187
3996int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4188int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
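
The new calc_csum_metadata_size() charges only for the change in the number of checksum leaves, rounding the checksum count up to whole leaves before and after the update. The userspace model below shows that rounding behaviour with made-up leaf geometry (the real per-leaf count comes from BTRFS_LEAF_DATA_SIZE and the on-disk item sizes) and leaves out the NODATASUM shortcut and the final conversion through btrfs_calc_trans_metadata_size():

#include <stdio.h>
#include <stdint.h>

#define SECTORSIZE       4096ULL
#define CSUM_ITEM_SPACE  3995ULL	/* illustrative, not the real leaf size */
#define CSUM_ENTRY_SIZE  49ULL		/* illustrative csum item + key size    */

static uint64_t csum_leaves(uint64_t csum_bytes)
{
	uint64_t per_leaf = CSUM_ITEM_SPACE / CSUM_ENTRY_SIZE;
	uint64_t csums = csum_bytes / SECTORSIZE;

	return (csums + per_leaf - 1) / per_leaf;	/* round up to whole leaves */
}

/* Extra leaves worth of metadata that adding new_bytes of data will need. */
static uint64_t extra_csum_leaves(uint64_t old_csum_bytes, uint64_t new_bytes)
{
	uint64_t old_leaves = csum_leaves(old_csum_bytes);
	uint64_t new_leaves = csum_leaves(old_csum_bytes + new_bytes);

	return new_leaves - old_leaves;	/* 0: the reservation does not grow */
}

int main(void)
{
	printf("%llu extra leaves\n",
	       (unsigned long long)extra_csum_leaves(0, 1ULL << 20));           /* 4 */
	printf("%llu extra leaves\n",
	       (unsigned long long)extra_csum_leaves(1ULL << 20, 64ULL << 10)); /* 0 */
	return 0;
}

Freeing works the same way in reverse, which is why the function takes the reserve flag and adjusts csum_bytes before recomputing.
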
@@ -3999,9 +4191,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3999 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4191 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4000 u64 to_reserve = 0; 4192 u64 to_reserve = 0;
4001 unsigned nr_extents = 0; 4193 unsigned nr_extents = 0;
4194 int flush = 1;
4002 int ret; 4195 int ret;
4003 4196
4004 if (btrfs_transaction_in_commit(root->fs_info)) 4197 if (btrfs_is_free_space_inode(root, inode))
4198 flush = 0;
4199
4200 if (flush && btrfs_transaction_in_commit(root->fs_info))
4005 schedule_timeout(1); 4201 schedule_timeout(1);
4006 4202
4007 num_bytes = ALIGN(num_bytes, root->sectorsize); 4203 num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4014,21 +4210,41 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4014 nr_extents = BTRFS_I(inode)->outstanding_extents - 4210 nr_extents = BTRFS_I(inode)->outstanding_extents -
4015 BTRFS_I(inode)->reserved_extents; 4211 BTRFS_I(inode)->reserved_extents;
4016 BTRFS_I(inode)->reserved_extents += nr_extents; 4212 BTRFS_I(inode)->reserved_extents += nr_extents;
4213 }
4017 4214
4018 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4215 /*
4216 * Add an item to reserve for updating the inode when we complete the
4217 * delalloc io.
4218 */
4219 if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4220 nr_extents++;
4221 BTRFS_I(inode)->delalloc_meta_reserved = 1;
4019 } 4222 }
4223
4224 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4225 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4020 spin_unlock(&BTRFS_I(inode)->lock); 4226 spin_unlock(&BTRFS_I(inode)->lock);
4021 4227
4022 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4228 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4023 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4024 if (ret) { 4229 if (ret) {
4230 u64 to_free = 0;
4025 unsigned dropped; 4231 unsigned dropped;
4232
4233 spin_lock(&BTRFS_I(inode)->lock);
4234 dropped = drop_outstanding_extent(inode);
4235 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4236 spin_unlock(&BTRFS_I(inode)->lock);
4237 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4238
4026 /* 4239 /*
4027 * We don't need the return value since our reservation failed, 4240 * Somebody could have come in and twiddled with the
4028 * we just need to clean up our counter. 4241 * reservation, so if we have to free more than we would have
4242 * reserved from this reservation go ahead and release those
4243 * bytes.
4029 */ 4244 */
4030 dropped = drop_outstanding_extent(inode); 4245 to_free -= to_reserve;
4031 WARN_ON(dropped > 1); 4246 if (to_free)
4247 btrfs_block_rsv_release(root, block_rsv, to_free);
4032 return ret; 4248 return ret;
4033 } 4249 }
4034 4250
@@ -4037,6 +4253,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4037 return 0; 4253 return 0;
4038} 4254}
4039 4255
4256/**
4257 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4258 * @inode: the inode to release the reservation for
4259 * @num_bytes: the number of bytes we're releasing
4260 *
4261 * This will release the metadata reservation for an inode. This can be called
4262 * once we complete IO for a given set of bytes to release their metadata
4263 * reservations.
4264 */
4040void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4265void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4041{ 4266{
4042 struct btrfs_root *root = BTRFS_I(inode)->root; 4267 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4044,9 +4269,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4044 unsigned dropped; 4269 unsigned dropped;
4045 4270
4046 num_bytes = ALIGN(num_bytes, root->sectorsize); 4271 num_bytes = ALIGN(num_bytes, root->sectorsize);
4272 spin_lock(&BTRFS_I(inode)->lock);
4047 dropped = drop_outstanding_extent(inode); 4273 dropped = drop_outstanding_extent(inode);
4048 4274
4049 to_free = calc_csum_metadata_size(inode, num_bytes); 4275 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4276 spin_unlock(&BTRFS_I(inode)->lock);
4050 if (dropped > 0) 4277 if (dropped > 0)
4051 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4278 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4052 4279
@@ -4054,6 +4281,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4054 to_free); 4281 to_free);
4055} 4282}
4056 4283
4284/**
4285 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4286 * @inode: inode we're writing to
4287 * @num_bytes: the number of bytes we want to allocate
4288 *
4289 * This will do the following things
4290 *
4291 * o reserve space in the data space info for num_bytes
4292 * o reserve space in the metadata space info based on number of outstanding
 4293 * extents and how many csums will be needed
 4294 * o add to the inode's ->delalloc_bytes
4295 * o add it to the fs_info's delalloc inodes list.
4296 *
4297 * This will return 0 for success and -ENOSPC if there is no space left.
4298 */
4057int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4299int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4058{ 4300{
4059 int ret; 4301 int ret;
@@ -4071,6 +4313,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4071 return 0; 4313 return 0;
4072} 4314}
4073 4315
4316/**
4317 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4318 * @inode: inode we're releasing space for
4319 * @num_bytes: the number of bytes we want to free up
4320 *
4321 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
4322 * called in the case that we don't need the metadata AND data reservations
 4323 * anymore, for example after an error or when we insert an inline extent.
4324 *
4325 * This function will release the metadata space that was not used and will
4326 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4327 * list if there are no delalloc bytes left.
4328 */
4074void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 4329void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4075{ 4330{
4076 btrfs_delalloc_release_metadata(inode, num_bytes); 4331 btrfs_delalloc_release_metadata(inode, num_bytes);
@@ -4090,12 +4345,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4090 4345
4091 /* block accounting for super block */ 4346 /* block accounting for super block */
4092 spin_lock(&info->delalloc_lock); 4347 spin_lock(&info->delalloc_lock);
4093 old_val = btrfs_super_bytes_used(&info->super_copy); 4348 old_val = btrfs_super_bytes_used(info->super_copy);
4094 if (alloc) 4349 if (alloc)
4095 old_val += num_bytes; 4350 old_val += num_bytes;
4096 else 4351 else
4097 old_val -= num_bytes; 4352 old_val -= num_bytes;
4098 btrfs_set_super_bytes_used(&info->super_copy, old_val); 4353 btrfs_set_super_bytes_used(info->super_copy, old_val);
4099 spin_unlock(&info->delalloc_lock); 4354 spin_unlock(&info->delalloc_lock);
4100 4355
4101 while (total) { 4356 while (total) {
@@ -4123,7 +4378,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4123 spin_lock(&cache->space_info->lock); 4378 spin_lock(&cache->space_info->lock);
4124 spin_lock(&cache->lock); 4379 spin_lock(&cache->lock);
4125 4380
4126 if (btrfs_super_cache_generation(&info->super_copy) != 0 && 4381 if (btrfs_test_opt(root, SPACE_CACHE) &&
4127 cache->disk_cache_state < BTRFS_DC_CLEAR) 4382 cache->disk_cache_state < BTRFS_DC_CLEAR)
4128 cache->disk_cache_state = BTRFS_DC_CLEAR; 4383 cache->disk_cache_state = BTRFS_DC_CLEAR;
4129 4384
@@ -4135,7 +4390,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4135 btrfs_set_block_group_used(&cache->item, old_val); 4390 btrfs_set_block_group_used(&cache->item, old_val);
4136 cache->reserved -= num_bytes; 4391 cache->reserved -= num_bytes;
4137 cache->space_info->bytes_reserved -= num_bytes; 4392 cache->space_info->bytes_reserved -= num_bytes;
4138 cache->space_info->reservation_progress++;
4139 cache->space_info->bytes_used += num_bytes; 4393 cache->space_info->bytes_used += num_bytes;
4140 cache->space_info->disk_used += num_bytes * factor; 4394 cache->space_info->disk_used += num_bytes * factor;
4141 spin_unlock(&cache->lock); 4395 spin_unlock(&cache->lock);
@@ -4187,7 +4441,6 @@ static int pin_down_extent(struct btrfs_root *root,
4187 if (reserved) { 4441 if (reserved) {
4188 cache->reserved -= num_bytes; 4442 cache->reserved -= num_bytes;
4189 cache->space_info->bytes_reserved -= num_bytes; 4443 cache->space_info->bytes_reserved -= num_bytes;
4190 cache->space_info->reservation_progress++;
4191 } 4444 }
4192 spin_unlock(&cache->lock); 4445 spin_unlock(&cache->lock);
4193 spin_unlock(&cache->space_info->lock); 4446 spin_unlock(&cache->space_info->lock);
@@ -4215,45 +4468,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
4215} 4468}
4216 4469
4217/* 4470/*
4218 * update size of reserved extents. this function may return -EAGAIN 4471 * this function must be called within transaction
4219 * if 'reserve' is true or 'sinfo' is false.
4220 */ 4472 */
4221int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 4473int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4222 u64 num_bytes, int reserve, int sinfo) 4474 struct btrfs_root *root,
4475 u64 bytenr, u64 num_bytes)
4223{ 4476{
4477 struct btrfs_block_group_cache *cache;
4478
4479 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4480 BUG_ON(!cache);
4481
4482 /*
4483 * pull in the free space cache (if any) so that our pin
4484 * removes the free space from the cache. We have load_only set
4485 * to one because the slow code to read in the free extents does check
4486 * the pinned extents.
4487 */
4488 cache_block_group(cache, trans, root, 1);
4489
4490 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4491
4492 /* remove us from the free space cache (if we're there at all) */
4493 btrfs_remove_free_space(cache, bytenr, num_bytes);
4494 btrfs_put_block_group(cache);
4495 return 0;
4496}
4497
4498/**
4499 * btrfs_update_reserved_bytes - update the block_group and space info counters
4500 * @cache: The cache we are manipulating
4501 * @num_bytes: The number of bytes in question
4502 * @reserve: One of the reservation enums
4503 *
4504 * This is called by the allocator when it reserves space, or by somebody who is
4505 * freeing space that was never actually used on disk. For example if you
4506 * reserve some space for a new leaf in transaction A and before transaction A
4507 * commits you free that leaf, you call this with reserve set to 0 in order to
4508 * clear the reservation.
4509 *
 4510 * Metadata reservations should use RESERVE_ALLOC so we do the proper
4511 * ENOSPC accounting. For data we handle the reservation through clearing the
4512 * delalloc bits in the io_tree. We have to do this since we could end up
4513 * allocating less disk space for the amount of data we have reserved in the
4514 * case of compression.
4515 *
4516 * If this is a reservation and the block group has become read only we cannot
4517 * make the reservation and return -EAGAIN, otherwise this function always
4518 * succeeds.
4519 */
4520static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4521 u64 num_bytes, int reserve)
4522{
4523 struct btrfs_space_info *space_info = cache->space_info;
4224 int ret = 0; 4524 int ret = 0;
4225 if (sinfo) { 4525 spin_lock(&space_info->lock);
4226 struct btrfs_space_info *space_info = cache->space_info; 4526 spin_lock(&cache->lock);
4227 spin_lock(&space_info->lock); 4527 if (reserve != RESERVE_FREE) {
4228 spin_lock(&cache->lock);
4229 if (reserve) {
4230 if (cache->ro) {
4231 ret = -EAGAIN;
4232 } else {
4233 cache->reserved += num_bytes;
4234 space_info->bytes_reserved += num_bytes;
4235 }
4236 } else {
4237 if (cache->ro)
4238 space_info->bytes_readonly += num_bytes;
4239 cache->reserved -= num_bytes;
4240 space_info->bytes_reserved -= num_bytes;
4241 space_info->reservation_progress++;
4242 }
4243 spin_unlock(&cache->lock);
4244 spin_unlock(&space_info->lock);
4245 } else {
4246 spin_lock(&cache->lock);
4247 if (cache->ro) { 4528 if (cache->ro) {
4248 ret = -EAGAIN; 4529 ret = -EAGAIN;
4249 } else { 4530 } else {
4250 if (reserve) 4531 cache->reserved += num_bytes;
4251 cache->reserved += num_bytes; 4532 space_info->bytes_reserved += num_bytes;
4252 else 4533 if (reserve == RESERVE_ALLOC) {
4253 cache->reserved -= num_bytes; 4534 BUG_ON(space_info->bytes_may_use < num_bytes);
4535 space_info->bytes_may_use -= num_bytes;
4536 }
4254 } 4537 }
4255 spin_unlock(&cache->lock); 4538 } else {
4539 if (cache->ro)
4540 space_info->bytes_readonly += num_bytes;
4541 cache->reserved -= num_bytes;
4542 space_info->bytes_reserved -= num_bytes;
4543 space_info->reservation_progress++;
4256 } 4544 }
4545 spin_unlock(&cache->lock);
4546 spin_unlock(&space_info->lock);
4257 return ret; 4547 return ret;
4258} 4548}
4259 4549
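
Ignoring the spinlocks, the BUG_ON and the reservation_progress counter, the bookkeeping done by btrfs_update_reserved_bytes() comes down to the following; the toy structs stand in for btrfs_block_group_cache and btrfs_space_info:

#include <stdio.h>
#include <stdint.h>

enum { RESERVE_FREE = 0, RESERVE_ALLOC = 1, RESERVE_ALLOC_NO_ACCOUNT = 2 };

struct toy_space_info { uint64_t bytes_reserved, bytes_may_use, bytes_readonly; };
struct toy_block_group { struct toy_space_info *space_info; uint64_t reserved; int ro; };

/*
 * RESERVE_ALLOC also drains bytes_may_use (the ENOSPC accounting made when
 * the space was reserved), RESERVE_ALLOC_NO_ACCOUNT does not, and
 * RESERVE_FREE undoes a reservation that never made it to disk.
 */
static int update_reserved(struct toy_block_group *bg, uint64_t bytes, int reserve)
{
	struct toy_space_info *si = bg->space_info;

	if (reserve != RESERVE_FREE) {
		if (bg->ro)
			return -1;		/* -EAGAIN in the kernel */
		bg->reserved += bytes;
		si->bytes_reserved += bytes;
		if (reserve == RESERVE_ALLOC)
			si->bytes_may_use -= bytes;
	} else {
		if (bg->ro)
			si->bytes_readonly += bytes;
		bg->reserved -= bytes;
		si->bytes_reserved -= bytes;
	}
	return 0;
}

int main(void)
{
	struct toy_space_info si = { .bytes_may_use = 1 << 20 };
	struct toy_block_group bg = { .space_info = &si };

	update_reserved(&bg, 1 << 20, RESERVE_ALLOC);
	printf("reserved=%llu may_use=%llu\n",
	       (unsigned long long)si.bytes_reserved,
	       (unsigned long long)si.bytes_may_use);

	update_reserved(&bg, 1 << 20, RESERVE_FREE);
	printf("reserved=%llu may_use=%llu\n",
	       (unsigned long long)si.bytes_reserved,
	       (unsigned long long)si.bytes_may_use);
	return 0;
}

RESERVE_ALLOC_NO_ACCOUNT exists because data allocations clear bytes_may_use through the delalloc accounting instead, as the kerneldoc above spells out.
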
@@ -4319,13 +4609,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4319 spin_lock(&cache->lock); 4609 spin_lock(&cache->lock);
4320 cache->pinned -= len; 4610 cache->pinned -= len;
4321 cache->space_info->bytes_pinned -= len; 4611 cache->space_info->bytes_pinned -= len;
4322 if (cache->ro) { 4612 if (cache->ro)
4323 cache->space_info->bytes_readonly += len; 4613 cache->space_info->bytes_readonly += len;
4324 } else if (cache->reserved_pinned > 0) {
4325 len = min(len, cache->reserved_pinned);
4326 cache->reserved_pinned -= len;
4327 cache->space_info->bytes_reserved += len;
4328 }
4329 spin_unlock(&cache->lock); 4614 spin_unlock(&cache->lock);
4330 spin_unlock(&cache->space_info->lock); 4615 spin_unlock(&cache->space_info->lock);
4331 } 4616 }
@@ -4340,11 +4625,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4340{ 4625{
4341 struct btrfs_fs_info *fs_info = root->fs_info; 4626 struct btrfs_fs_info *fs_info = root->fs_info;
4342 struct extent_io_tree *unpin; 4627 struct extent_io_tree *unpin;
4343 struct btrfs_block_rsv *block_rsv;
4344 struct btrfs_block_rsv *next_rsv;
4345 u64 start; 4628 u64 start;
4346 u64 end; 4629 u64 end;
4347 int idx;
4348 int ret; 4630 int ret;
4349 4631
4350 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4632 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4367,30 +4649,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4367 cond_resched(); 4649 cond_resched();
4368 } 4650 }
4369 4651
4370 mutex_lock(&fs_info->durable_block_rsv_mutex);
4371 list_for_each_entry_safe(block_rsv, next_rsv,
4372 &fs_info->durable_block_rsv_list, list) {
4373
4374 idx = trans->transid & 0x1;
4375 if (block_rsv->freed[idx] > 0) {
4376 block_rsv_add_bytes(block_rsv,
4377 block_rsv->freed[idx], 0);
4378 block_rsv->freed[idx] = 0;
4379 }
4380 if (atomic_read(&block_rsv->usage) == 0) {
4381 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4382
4383 if (block_rsv->freed[0] == 0 &&
4384 block_rsv->freed[1] == 0) {
4385 list_del_init(&block_rsv->list);
4386 kfree(block_rsv);
4387 }
4388 } else {
4389 btrfs_block_rsv_release(root, block_rsv, 0);
4390 }
4391 }
4392 mutex_unlock(&fs_info->durable_block_rsv_mutex);
4393
4394 return 0; 4652 return 0;
4395} 4653}
4396 4654
@@ -4668,7 +4926,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4668 struct extent_buffer *buf, 4926 struct extent_buffer *buf,
4669 u64 parent, int last_ref) 4927 u64 parent, int last_ref)
4670{ 4928{
4671 struct btrfs_block_rsv *block_rsv;
4672 struct btrfs_block_group_cache *cache = NULL; 4929 struct btrfs_block_group_cache *cache = NULL;
4673 int ret; 4930 int ret;
4674 4931
@@ -4683,64 +4940,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4683 if (!last_ref) 4940 if (!last_ref)
4684 return; 4941 return;
4685 4942
4686 block_rsv = get_block_rsv(trans, root);
4687 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 4943 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4688 if (block_rsv->space_info != cache->space_info)
4689 goto out;
4690 4944
4691 if (btrfs_header_generation(buf) == trans->transid) { 4945 if (btrfs_header_generation(buf) == trans->transid) {
4692 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4946 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4693 ret = check_ref_cleanup(trans, root, buf->start); 4947 ret = check_ref_cleanup(trans, root, buf->start);
4694 if (!ret) 4948 if (!ret)
4695 goto pin; 4949 goto out;
4696 } 4950 }
4697 4951
4698 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 4952 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4699 pin_down_extent(root, cache, buf->start, buf->len, 1); 4953 pin_down_extent(root, cache, buf->start, buf->len, 1);
4700 goto pin; 4954 goto out;
4701 } 4955 }
4702 4956
4703 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4957 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4704 4958
4705 btrfs_add_free_space(cache, buf->start, buf->len); 4959 btrfs_add_free_space(cache, buf->start, buf->len);
4706 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); 4960 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4707 if (ret == -EAGAIN) {
4708 /* block group became read-only */
4709 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4710 goto out;
4711 }
4712
4713 ret = 1;
4714 spin_lock(&block_rsv->lock);
4715 if (block_rsv->reserved < block_rsv->size) {
4716 block_rsv->reserved += buf->len;
4717 ret = 0;
4718 }
4719 spin_unlock(&block_rsv->lock);
4720
4721 if (ret) {
4722 spin_lock(&cache->space_info->lock);
4723 cache->space_info->bytes_reserved -= buf->len;
4724 cache->space_info->reservation_progress++;
4725 spin_unlock(&cache->space_info->lock);
4726 }
4727 goto out;
4728 }
4729pin:
4730 if (block_rsv->durable && !cache->ro) {
4731 ret = 0;
4732 spin_lock(&cache->lock);
4733 if (!cache->ro) {
4734 cache->reserved_pinned += buf->len;
4735 ret = 1;
4736 }
4737 spin_unlock(&cache->lock);
4738
4739 if (ret) {
4740 spin_lock(&block_rsv->lock);
4741 block_rsv->freed[trans->transid & 0x1] += buf->len;
4742 spin_unlock(&block_rsv->lock);
4743 }
4744 } 4961 }
4745out: 4962out:
4746 /* 4963 /*
@@ -4883,10 +5100,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4883 int last_ptr_loop = 0; 5100 int last_ptr_loop = 0;
4884 int loop = 0; 5101 int loop = 0;
4885 int index = 0; 5102 int index = 0;
5103 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5104 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
4886 bool found_uncached_bg = false; 5105 bool found_uncached_bg = false;
4887 bool failed_cluster_refill = false; 5106 bool failed_cluster_refill = false;
4888 bool failed_alloc = false; 5107 bool failed_alloc = false;
4889 bool use_cluster = true; 5108 bool use_cluster = true;
5109 bool have_caching_bg = false;
4890 u64 ideal_cache_percent = 0; 5110 u64 ideal_cache_percent = 0;
4891 u64 ideal_cache_offset = 0; 5111 u64 ideal_cache_offset = 0;
4892 5112
@@ -4969,6 +5189,7 @@ ideal_cache:
4969 } 5189 }
4970 } 5190 }
4971search: 5191search:
5192 have_caching_bg = false;
4972 down_read(&space_info->groups_sem); 5193 down_read(&space_info->groups_sem);
4973 list_for_each_entry(block_group, &space_info->block_groups[index], 5194 list_for_each_entry(block_group, &space_info->block_groups[index],
4974 list) { 5195 list) {
@@ -4998,13 +5219,15 @@ search:
4998 } 5219 }
4999 5220
5000have_block_group: 5221have_block_group:
5001 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 5222 cached = block_group_cache_done(block_group);
5223 if (unlikely(!cached)) {
5002 u64 free_percent; 5224 u64 free_percent;
5003 5225
5226 found_uncached_bg = true;
5004 ret = cache_block_group(block_group, trans, 5227 ret = cache_block_group(block_group, trans,
5005 orig_root, 1); 5228 orig_root, 1);
5006 if (block_group->cached == BTRFS_CACHE_FINISHED) 5229 if (block_group->cached == BTRFS_CACHE_FINISHED)
5007 goto have_block_group; 5230 goto alloc;
5008 5231
5009 free_percent = btrfs_block_group_used(&block_group->item); 5232 free_percent = btrfs_block_group_used(&block_group->item);
5010 free_percent *= 100; 5233 free_percent *= 100;
@@ -5026,7 +5249,6 @@ have_block_group:
5026 orig_root, 0); 5249 orig_root, 0);
5027 BUG_ON(ret); 5250 BUG_ON(ret);
5028 } 5251 }
5029 found_uncached_bg = true;
5030 5252
5031 /* 5253 /*
5032 * If loop is set for cached only, try the next block 5254 * If loop is set for cached only, try the next block
@@ -5036,10 +5258,7 @@ have_block_group:
5036 goto loop; 5258 goto loop;
5037 } 5259 }
5038 5260
5039 cached = block_group_cache_done(block_group); 5261alloc:
5040 if (unlikely(!cached))
5041 found_uncached_bg = true;
5042
5043 if (unlikely(block_group->ro)) 5262 if (unlikely(block_group->ro))
5044 goto loop; 5263 goto loop;
5045 5264
@@ -5177,6 +5396,8 @@ refill_cluster:
5177 failed_alloc = true; 5396 failed_alloc = true;
5178 goto have_block_group; 5397 goto have_block_group;
5179 } else if (!offset) { 5398 } else if (!offset) {
5399 if (!cached)
5400 have_caching_bg = true;
5180 goto loop; 5401 goto loop;
5181 } 5402 }
5182checks: 5403checks:
@@ -5202,8 +5423,8 @@ checks:
5202 search_start - offset); 5423 search_start - offset);
5203 BUG_ON(offset > search_start); 5424 BUG_ON(offset > search_start);
5204 5425
5205 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, 5426 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
5206 (data & BTRFS_BLOCK_GROUP_DATA)); 5427 alloc_type);
5207 if (ret == -EAGAIN) { 5428 if (ret == -EAGAIN) {
5208 btrfs_add_free_space(block_group, offset, num_bytes); 5429 btrfs_add_free_space(block_group, offset, num_bytes);
5209 goto loop; 5430 goto loop;
@@ -5227,6 +5448,9 @@ loop:
5227 } 5448 }
5228 up_read(&space_info->groups_sem); 5449 up_read(&space_info->groups_sem);
5229 5450
5451 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5452 goto search;
5453
5230 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 5454 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5231 goto search; 5455 goto search;
5232 5456
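Editor's note: the have_caching_bg flag added in this hunk changes the allocator's restart behaviour: if a pass saw a block group that was still caching but produced no extent, the same pass is retried (once the loop level permits waiting on caching) before the allocator moves on to the next RAID index. Below is a rough standalone sketch of that control flow; the block-group list, its fields, and the "caching finished" simulation are invented test data, not btrfs code.

/* Sketch of the allocator restart driven by have_caching_bg. */
#include <stdbool.h>
#include <stdio.h>

#define LOOP_CACHING_WAIT 2

struct bg { bool cached; bool has_free_extent; };

static int find_free_extent(struct bg *groups, int ngroups, int loop)
{
    bool have_caching_bg;
    int i;

search:
    have_caching_bg = false;
    for (i = 0; i < ngroups; i++) {
        if (groups[i].has_free_extent)
            return i;                     /* found an extent */
        if (!groups[i].cached)
            have_caching_bg = true;       /* worth another pass later */
    }
    /* No hit: retry while uncached groups may still produce free space,
     * once the loop level allows waiting for caching to make progress. */
    if (loop >= LOOP_CACHING_WAIT && have_caching_bg) {
        for (i = 0; i < ngroups; i++)
            groups[i].cached = true;      /* simulate caching completing */
        groups[ngroups - 1].has_free_extent = true;
        goto search;
    }
    return -1;
}

int main(void)
{
    struct bg groups[2] = { { true, false }, { false, false } };
    printf("got block group %d\n",
           find_free_extent(groups, 2, LOOP_CACHING_WAIT));
    return 0;
}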
@@ -5325,7 +5549,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5325 int index = 0; 5549 int index = 0;
5326 5550
5327 spin_lock(&info->lock); 5551 spin_lock(&info->lock);
5328 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5552 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5553 (unsigned long long)info->flags,
5329 (unsigned long long)(info->total_bytes - info->bytes_used - 5554 (unsigned long long)(info->total_bytes - info->bytes_used -
5330 info->bytes_pinned - info->bytes_reserved - 5555 info->bytes_pinned - info->bytes_reserved -
5331 info->bytes_readonly), 5556 info->bytes_readonly),
@@ -5411,7 +5636,8 @@ again:
5411 return ret; 5636 return ret;
5412} 5637}
5413 5638
5414int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 5639static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5640 u64 start, u64 len, int pin)
5415{ 5641{
5416 struct btrfs_block_group_cache *cache; 5642 struct btrfs_block_group_cache *cache;
5417 int ret = 0; 5643 int ret = 0;
@@ -5426,8 +5652,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5426 if (btrfs_test_opt(root, DISCARD)) 5652 if (btrfs_test_opt(root, DISCARD))
5427 ret = btrfs_discard_extent(root, start, len, NULL); 5653 ret = btrfs_discard_extent(root, start, len, NULL);
5428 5654
5429 btrfs_add_free_space(cache, start, len); 5655 if (pin)
5430 btrfs_update_reserved_bytes(cache, len, 0, 1); 5656 pin_down_extent(root, cache, start, len, 1);
5657 else {
5658 btrfs_add_free_space(cache, start, len);
5659 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5660 }
5431 btrfs_put_block_group(cache); 5661 btrfs_put_block_group(cache);
5432 5662
5433 trace_btrfs_reserved_extent_free(root, start, len); 5663 trace_btrfs_reserved_extent_free(root, start, len);
@@ -5435,6 +5665,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5435 return ret; 5665 return ret;
5436} 5666}
5437 5667
5668int btrfs_free_reserved_extent(struct btrfs_root *root,
5669 u64 start, u64 len)
5670{
5671 return __btrfs_free_reserved_extent(root, start, len, 0);
5672}
5673
5674int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5675 u64 start, u64 len)
5676{
5677 return __btrfs_free_reserved_extent(root, start, len, 1);
5678}
5679
5438static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5680static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5439 struct btrfs_root *root, 5681 struct btrfs_root *root,
5440 u64 parent, u64 root_objectid, 5682 u64 parent, u64 root_objectid,
@@ -5630,7 +5872,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5630 put_caching_control(caching_ctl); 5872 put_caching_control(caching_ctl);
5631 } 5873 }
5632 5874
5633 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); 5875 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5876 RESERVE_ALLOC_NO_ACCOUNT);
5634 BUG_ON(ret); 5877 BUG_ON(ret);
5635 btrfs_put_block_group(block_group); 5878 btrfs_put_block_group(block_group);
5636 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5879 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5687,8 +5930,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5687 block_rsv = get_block_rsv(trans, root); 5930 block_rsv = get_block_rsv(trans, root);
5688 5931
5689 if (block_rsv->size == 0) { 5932 if (block_rsv->size == 0) {
5690 ret = reserve_metadata_bytes(trans, root, block_rsv, 5933 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5691 blocksize, 0);
5692 /* 5934 /*
5693 * If we couldn't reserve metadata bytes try and use some from 5935 * If we couldn't reserve metadata bytes try and use some from
5694 * the global reserve. 5936 * the global reserve.
@@ -5708,13 +5950,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5708 if (!ret) 5950 if (!ret)
5709 return block_rsv; 5951 return block_rsv;
5710 if (ret) { 5952 if (ret) {
5711 WARN_ON(1); 5953 static DEFINE_RATELIMIT_STATE(_rs,
5712 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 5954 DEFAULT_RATELIMIT_INTERVAL,
5713 0); 5955 /*DEFAULT_RATELIMIT_BURST*/ 2);
5956 if (__ratelimit(&_rs)) {
5957 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5958 WARN_ON(1);
5959 }
5960 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5714 if (!ret) { 5961 if (!ret) {
5715 spin_lock(&block_rsv->lock);
5716 block_rsv->size += blocksize;
5717 spin_unlock(&block_rsv->lock);
5718 return block_rsv; 5962 return block_rsv;
5719 } else if (ret && block_rsv != global_rsv) { 5963 } else if (ret && block_rsv != global_rsv) {
5720 ret = block_rsv_use_bytes(global_rsv, blocksize); 5964 ret = block_rsv_use_bytes(global_rsv, blocksize);
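Editor's note: use_block_rsv() now rate-limits the "block rsv returned %d" message instead of warning on every failure. DEFINE_RATELIMIT_STATE and __ratelimit are the kernel helpers used in the hunk; the sketch below is an invented userspace model of the same interval/burst idea, not the kernel implementation.

/* Userspace model of an interval-based ratelimit like the one used above. */
#include <stdio.h>
#include <time.h>

struct ratelimit_state {
    time_t begin;    /* start of the current interval */
    int interval;    /* seconds per interval */
    int burst;       /* messages allowed per interval */
    int printed;     /* messages emitted this interval */
};

/* Returns 1 if the caller may print, 0 if the message is suppressed. */
static int ratelimit(struct ratelimit_state *rs)
{
    time_t now = time(NULL);

    if (now - rs->begin >= rs->interval) {   /* new interval: reset */
        rs->begin = now;
        rs->printed = 0;
    }
    if (rs->printed >= rs->burst)
        return 0;
    rs->printed++;
    return 1;
}

int main(void)
{
    struct ratelimit_state rs = { .interval = 5, .burst = 2 };

    for (int i = 0; i < 10; i++)
        if (ratelimit(&rs))
            printf("btrfs: block rsv returned %d\n", -28 /* ENOSPC */);
    return 0;
}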
@@ -6592,12 +6836,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6592 cache->bytes_super - btrfs_block_group_used(&cache->item); 6836 cache->bytes_super - btrfs_block_group_used(&cache->item);
6593 6837
6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6838 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6595 sinfo->bytes_may_use + sinfo->bytes_readonly + 6839 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <= 6840 min_allocable_bytes <= sinfo->total_bytes) {
6597 sinfo->total_bytes) {
6598 sinfo->bytes_readonly += num_bytes; 6841 sinfo->bytes_readonly += num_bytes;
6599 sinfo->bytes_reserved += cache->reserved_pinned;
6600 cache->reserved_pinned = 0;
6601 cache->ro = 1; 6842 cache->ro = 1;
6602 ret = 0; 6843 ret = 0;
6603 } 6844 }
@@ -6964,7 +7205,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6964 struct btrfs_space_info, 7205 struct btrfs_space_info,
6965 list); 7206 list);
6966 if (space_info->bytes_pinned > 0 || 7207 if (space_info->bytes_pinned > 0 ||
6967 space_info->bytes_reserved > 0) { 7208 space_info->bytes_reserved > 0 ||
7209 space_info->bytes_may_use > 0) {
6968 WARN_ON(1); 7210 WARN_ON(1);
6969 dump_space_info(space_info, 0, 0); 7211 dump_space_info(space_info, 0, 0);
6970 } 7212 }
@@ -7006,14 +7248,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7006 return -ENOMEM; 7248 return -ENOMEM;
7007 path->reada = 1; 7249 path->reada = 1;
7008 7250
7009 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 7251 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7010 if (cache_gen != 0 && 7252 if (btrfs_test_opt(root, SPACE_CACHE) &&
7011 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) 7253 btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7012 need_clear = 1; 7254 need_clear = 1;
7013 if (btrfs_test_opt(root, CLEAR_CACHE)) 7255 if (btrfs_test_opt(root, CLEAR_CACHE))
7014 need_clear = 1; 7256 need_clear = 1;
7015 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
7016 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7017 7257
7018 while (1) { 7258 while (1) {
7019 ret = find_first_block_group(root, path, &key); 7259 ret = find_first_block_group(root, path, &key);
@@ -7252,7 +7492,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7252 goto out; 7492 goto out;
7253 } 7493 }
7254 7494
7255 inode = lookup_free_space_inode(root, block_group, path); 7495 inode = lookup_free_space_inode(tree_root, block_group, path);
7256 if (!IS_ERR(inode)) { 7496 if (!IS_ERR(inode)) {
7257 ret = btrfs_orphan_add(trans, inode); 7497 ret = btrfs_orphan_add(trans, inode);
7258 BUG_ON(ret); 7498 BUG_ON(ret);
@@ -7268,7 +7508,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7268 spin_unlock(&block_group->lock); 7508 spin_unlock(&block_group->lock);
7269 } 7509 }
7270 /* One for our lookup ref */ 7510 /* One for our lookup ref */
7271 iput(inode); 7511 btrfs_add_delayed_iput(inode);
7272 } 7512 }
7273 7513
7274 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 7514 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7339,7 +7579,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7339 int mixed = 0; 7579 int mixed = 0;
7340 int ret; 7580 int ret;
7341 7581
7342 disk_super = &fs_info->super_copy; 7582 disk_super = fs_info->super_copy;
7343 if (!btrfs_super_root(disk_super)) 7583 if (!btrfs_super_root(disk_super))
7344 return 1; 7584 return 1;
7345 7585
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f1..9472d3de5e52 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
17#include "compat.h" 17#include "compat.h"
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h"
20 21
21static struct kmem_cache *extent_state_cache; 22static struct kmem_cache *extent_state_cache;
22static struct kmem_cache *extent_buffer_cache; 23static struct kmem_cache *extent_buffer_cache;
@@ -894,6 +895,194 @@ search_again:
894 goto again; 895 goto again;
895} 896}
896 897
898/**
 899 * convert_extent_bit - convert all bits in a given range from one bit to another
900 * @tree: the io tree to search
901 * @start: the start offset in bytes
902 * @end: the end offset in bytes (inclusive)
903 * @bits: the bits to set in this range
904 * @clear_bits: the bits to clear in this range
905 * @mask: the allocation mask
906 *
907 * This will go through and set bits for the given range. If any states exist
908 * already in this range they are set with the given bit and cleared of the
909 * clear_bits. This is only meant to be used by things that are mergeable, ie
910 * converting from say DELALLOC to DIRTY. This is not meant to be used with
911 * boundary bits like LOCK.
912 */
913int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914 int bits, int clear_bits, gfp_t mask)
915{
916 struct extent_state *state;
917 struct extent_state *prealloc = NULL;
918 struct rb_node *node;
919 int err = 0;
920 u64 last_start;
921 u64 last_end;
922
923again:
924 if (!prealloc && (mask & __GFP_WAIT)) {
925 prealloc = alloc_extent_state(mask);
926 if (!prealloc)
927 return -ENOMEM;
928 }
929
930 spin_lock(&tree->lock);
931 /*
932 * this search will find all the extents that end after
933 * our range starts.
934 */
935 node = tree_search(tree, start);
936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc)
939 return -ENOMEM;
940 err = insert_state(tree, prealloc, start, end, &bits);
941 prealloc = NULL;
942 BUG_ON(err == -EEXIST);
943 goto out;
944 }
945 state = rb_entry(node, struct extent_state, rb_node);
946hit_next:
947 last_start = state->start;
948 last_end = state->end;
949
950 /*
951 * | ---- desired range ---- |
952 * | state |
953 *
954 * Just lock what we found and keep going
955 */
956 if (state->start == start && state->end <= end) {
957 struct rb_node *next_node;
958
959 set_state_bits(tree, state, &bits);
960 clear_state_bit(tree, state, &clear_bits, 0);
961
962 merge_state(tree, state);
963 if (last_end == (u64)-1)
964 goto out;
965
966 start = last_end + 1;
967 next_node = rb_next(&state->rb_node);
968 if (next_node && start < end && prealloc && !need_resched()) {
969 state = rb_entry(next_node, struct extent_state,
970 rb_node);
971 if (state->start == start)
972 goto hit_next;
973 }
974 goto search_again;
975 }
976
977 /*
978 * | ---- desired range ---- |
979 * | state |
980 * or
981 * | ------------- state -------------- |
982 *
983 * We need to split the extent we found, and may flip bits on
984 * second half.
985 *
986 * If the extent we found extends past our
987 * range, we just split and search again. It'll get split
988 * again the next time though.
989 *
990 * If the extent we found is inside our range, we set the
991 * desired bit on it.
992 */
993 if (state->start < start) {
994 prealloc = alloc_extent_state_atomic(prealloc);
995 if (!prealloc)
996 return -ENOMEM;
997 err = split_state(tree, state, prealloc, start);
998 BUG_ON(err == -EEXIST);
999 prealloc = NULL;
1000 if (err)
1001 goto out;
1002 if (state->end <= end) {
1003 set_state_bits(tree, state, &bits);
1004 clear_state_bit(tree, state, &clear_bits, 0);
1005 merge_state(tree, state);
1006 if (last_end == (u64)-1)
1007 goto out;
1008 start = last_end + 1;
1009 }
1010 goto search_again;
1011 }
1012 /*
1013 * | ---- desired range ---- |
1014 * | state | or | state |
1015 *
1016 * There's a hole, we need to insert something in it and
1017 * ignore the extent we found.
1018 */
1019 if (state->start > start) {
1020 u64 this_end;
1021 if (end < last_start)
1022 this_end = end;
1023 else
1024 this_end = last_start - 1;
1025
1026 prealloc = alloc_extent_state_atomic(prealloc);
1027 if (!prealloc)
1028 return -ENOMEM;
1029
1030 /*
 1031 * Avoid freeing 'prealloc' if it can be merged with
1032 * the later extent.
1033 */
1034 err = insert_state(tree, prealloc, start, this_end,
1035 &bits);
1036 BUG_ON(err == -EEXIST);
1037 if (err) {
1038 free_extent_state(prealloc);
1039 prealloc = NULL;
1040 goto out;
1041 }
1042 prealloc = NULL;
1043 start = this_end + 1;
1044 goto search_again;
1045 }
1046 /*
1047 * | ---- desired range ---- |
1048 * | state |
1049 * We need to split the extent, and set the bit
1050 * on the first half
1051 */
1052 if (state->start <= end && state->end > end) {
1053 prealloc = alloc_extent_state_atomic(prealloc);
1054 if (!prealloc)
1055 return -ENOMEM;
1056
1057 err = split_state(tree, state, prealloc, end + 1);
1058 BUG_ON(err == -EEXIST);
1059
1060 set_state_bits(tree, prealloc, &bits);
1061 clear_state_bit(tree, prealloc, &clear_bits, 0);
1062
1063 merge_state(tree, prealloc);
1064 prealloc = NULL;
1065 goto out;
1066 }
1067
1068 goto search_again;
1069
1070out:
1071 spin_unlock(&tree->lock);
1072 if (prealloc)
1073 free_extent_state(prealloc);
1074
1075 return err;
1076
1077search_again:
1078 if (start > end)
1079 goto out;
1080 spin_unlock(&tree->lock);
1081 if (mask & __GFP_WAIT)
1082 cond_resched();
1083 goto again;
1084}
1085
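Editor's note: convert_extent_bit() above walks the extent state tree, splitting states at the range boundaries and flipping bits on everything inside. The sketch below applies the same "set bits, clear clear_bits over [start, end]" idea to a flat array of per-block flags; it deliberately ignores the rbtree splitting, merging, and preallocation handling, and the bit values and block count are invented.

/* Simplified model of convert_extent_bit: set `bits` and clear `clear_bits`
 * on every block in [start, end]. The real code operates on an rbtree of
 * variable-length extent_state ranges and must split/merge them. */
#include <stdio.h>

#define EXTENT_DIRTY    (1 << 0)
#define EXTENT_DELALLOC (1 << 1)
#define NBLOCKS         8

static void convert_bits(unsigned int *flags, int start, int end,
                         unsigned int bits, unsigned int clear_bits)
{
    for (int i = start; i <= end; i++) {
        flags[i] |= bits;
        flags[i] &= ~clear_bits;
    }
}

int main(void)
{
    unsigned int flags[NBLOCKS] = { 0 };

    convert_bits(flags, 0, 7, EXTENT_DELALLOC, 0);
    /* e.g. converting DELALLOC to DIRTY, as the comment above suggests */
    convert_bits(flags, 2, 5, EXTENT_DIRTY, EXTENT_DELALLOC);

    for (int i = 0; i < NBLOCKS; i++)
        printf("block %d: %s%s\n", i,
               (flags[i] & EXTENT_DIRTY) ? "DIRTY " : "",
               (flags[i] & EXTENT_DELALLOC) ? "DELALLOC" : "");
    return 0;
}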
897/* wrappers around set/clear extent bit */ 1086/* wrappers around set/clear extent bit */
898int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1087int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
899 gfp_t mask) 1088 gfp_t mask)
@@ -919,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
919 struct extent_state **cached_state, gfp_t mask) 1108 struct extent_state **cached_state, gfp_t mask)
920{ 1109{
921 return set_extent_bit(tree, start, end, 1110 return set_extent_bit(tree, start, end,
922 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 1111 EXTENT_DELALLOC | EXTENT_UPTODATE,
923 0, NULL, cached_state, mask); 1112 0, NULL, cached_state, mask);
924} 1113}
925 1114
@@ -1599,6 +1788,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
1599 return 0; 1788 return 0;
1600} 1789}
1601 1790
1791/*
1792 * When IO fails, either with EIO or csum verification fails, we
1793 * try other mirrors that might have a good copy of the data. This
1794 * io_failure_record is used to record state as we go through all the
1795 * mirrors. If another mirror has good data, the page is set up to date
1796 * and things continue. If a good mirror can't be found, the original
1797 * bio end_io callback is called to indicate things have failed.
1798 */
1799struct io_failure_record {
1800 struct page *page;
1801 u64 start;
1802 u64 len;
1803 u64 logical;
1804 unsigned long bio_flags;
1805 int this_mirror;
1806 int failed_mirror;
1807 int in_validation;
1808};
1809
1810static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1811 int did_repair)
1812{
1813 int ret;
1814 int err = 0;
1815 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1816
1817 set_state_private(failure_tree, rec->start, 0);
1818 ret = clear_extent_bits(failure_tree, rec->start,
1819 rec->start + rec->len - 1,
1820 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1821 if (ret)
1822 err = ret;
1823
1824 if (did_repair) {
1825 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1826 rec->start + rec->len - 1,
1827 EXTENT_DAMAGED, GFP_NOFS);
1828 if (ret && !err)
1829 err = ret;
1830 }
1831
1832 kfree(rec);
1833 return err;
1834}
1835
1836static void repair_io_failure_callback(struct bio *bio, int err)
1837{
1838 complete(bio->bi_private);
1839}
1840
1841/*
1842 * this bypasses the standard btrfs submit functions deliberately, as
1843 * the standard behavior is to write all copies in a raid setup. here we only
1844 * want to write the one bad copy. so we do the mapping for ourselves and issue
1845 * submit_bio directly.
 1846 * to avoid any synchronization issues, wait for the data after writing, which
1847 * actually prevents the read that triggered the error from finishing.
1848 * currently, there can be no more than two copies of every data bit. thus,
1849 * exactly one rewrite is required.
1850 */
1851int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1852 u64 length, u64 logical, struct page *page,
1853 int mirror_num)
1854{
1855 struct bio *bio;
1856 struct btrfs_device *dev;
1857 DECLARE_COMPLETION_ONSTACK(compl);
1858 u64 map_length = 0;
1859 u64 sector;
1860 struct btrfs_bio *bbio = NULL;
1861 int ret;
1862
1863 BUG_ON(!mirror_num);
1864
1865 bio = bio_alloc(GFP_NOFS, 1);
1866 if (!bio)
1867 return -EIO;
1868 bio->bi_private = &compl;
1869 bio->bi_end_io = repair_io_failure_callback;
1870 bio->bi_size = 0;
1871 map_length = length;
1872
1873 ret = btrfs_map_block(map_tree, WRITE, logical,
1874 &map_length, &bbio, mirror_num);
1875 if (ret) {
1876 bio_put(bio);
1877 return -EIO;
1878 }
1879 BUG_ON(mirror_num != bbio->mirror_num);
1880 sector = bbio->stripes[mirror_num-1].physical >> 9;
1881 bio->bi_sector = sector;
1882 dev = bbio->stripes[mirror_num-1].dev;
1883 kfree(bbio);
1884 if (!dev || !dev->bdev || !dev->writeable) {
1885 bio_put(bio);
1886 return -EIO;
1887 }
1888 bio->bi_bdev = dev->bdev;
1889 bio_add_page(bio, page, length, start-page_offset(page));
1890 submit_bio(WRITE_SYNC, bio);
1891 wait_for_completion(&compl);
1892
1893 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1894 /* try to remap that extent elsewhere? */
1895 bio_put(bio);
1896 return -EIO;
1897 }
1898
1899 printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1900 "sector %llu)\n", page->mapping->host->i_ino, start,
1901 dev->name, sector);
1902
1903 bio_put(bio);
1904 return 0;
1905}
1906
1907/*
1908 * each time an IO finishes, we do a fast check in the IO failure tree
1909 * to see if we need to process or clean up an io_failure_record
1910 */
1911static int clean_io_failure(u64 start, struct page *page)
1912{
1913 u64 private;
1914 u64 private_failure;
1915 struct io_failure_record *failrec;
1916 struct btrfs_mapping_tree *map_tree;
1917 struct extent_state *state;
1918 int num_copies;
1919 int did_repair = 0;
1920 int ret;
1921 struct inode *inode = page->mapping->host;
1922
1923 private = 0;
1924 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1925 (u64)-1, 1, EXTENT_DIRTY, 0);
1926 if (!ret)
1927 return 0;
1928
1929 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1930 &private_failure);
1931 if (ret)
1932 return 0;
1933
1934 failrec = (struct io_failure_record *)(unsigned long) private_failure;
1935 BUG_ON(!failrec->this_mirror);
1936
1937 if (failrec->in_validation) {
1938 /* there was no real error, just free the record */
1939 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1940 failrec->start);
1941 did_repair = 1;
1942 goto out;
1943 }
1944
1945 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1946 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1947 failrec->start,
1948 EXTENT_LOCKED);
1949 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1950
1951 if (state && state->start == failrec->start) {
1952 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1953 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1954 failrec->len);
1955 if (num_copies > 1) {
1956 ret = repair_io_failure(map_tree, start, failrec->len,
1957 failrec->logical, page,
1958 failrec->failed_mirror);
1959 did_repair = !ret;
1960 }
1961 }
1962
1963out:
1964 if (!ret)
1965 ret = free_io_failure(inode, failrec, did_repair);
1966
1967 return ret;
1968}
1969
1970/*
1971 * this is a generic handler for readpage errors (default
1972 * readpage_io_failed_hook). if other copies exist, read those and write back
 1973 * good data to the failed position. does not attempt to remap the failed
 1974 * extent elsewhere, hoping the device will be smart enough to do this as
1975 * needed
1976 */
1977
1978static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1979 u64 start, u64 end, int failed_mirror,
1980 struct extent_state *state)
1981{
1982 struct io_failure_record *failrec = NULL;
1983 u64 private;
1984 struct extent_map *em;
1985 struct inode *inode = page->mapping->host;
1986 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1987 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1988 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1989 struct bio *bio;
1990 int num_copies;
1991 int ret;
1992 int read_mode;
1993 u64 logical;
1994
1995 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
1996
1997 ret = get_state_private(failure_tree, start, &private);
1998 if (ret) {
1999 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2000 if (!failrec)
2001 return -ENOMEM;
2002 failrec->start = start;
2003 failrec->len = end - start + 1;
2004 failrec->this_mirror = 0;
2005 failrec->bio_flags = 0;
2006 failrec->in_validation = 0;
2007
2008 read_lock(&em_tree->lock);
2009 em = lookup_extent_mapping(em_tree, start, failrec->len);
2010 if (!em) {
2011 read_unlock(&em_tree->lock);
2012 kfree(failrec);
2013 return -EIO;
2014 }
2015
2016 if (em->start > start || em->start + em->len < start) {
2017 free_extent_map(em);
2018 em = NULL;
2019 }
2020 read_unlock(&em_tree->lock);
2021
2022 if (!em || IS_ERR(em)) {
2023 kfree(failrec);
2024 return -EIO;
2025 }
2026 logical = start - em->start;
2027 logical = em->block_start + logical;
2028 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2029 logical = em->block_start;
2030 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2031 extent_set_compress_type(&failrec->bio_flags,
2032 em->compress_type);
2033 }
2034 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2035 "len=%llu\n", logical, start, failrec->len);
2036 failrec->logical = logical;
2037 free_extent_map(em);
2038
2039 /* set the bits in the private failure tree */
2040 ret = set_extent_bits(failure_tree, start, end,
2041 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2042 if (ret >= 0)
2043 ret = set_state_private(failure_tree, start,
2044 (u64)(unsigned long)failrec);
2045 /* set the bits in the inode's tree */
2046 if (ret >= 0)
2047 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2048 GFP_NOFS);
2049 if (ret < 0) {
2050 kfree(failrec);
2051 return ret;
2052 }
2053 } else {
2054 failrec = (struct io_failure_record *)(unsigned long)private;
2055 pr_debug("bio_readpage_error: (found) logical=%llu, "
2056 "start=%llu, len=%llu, validation=%d\n",
2057 failrec->logical, failrec->start, failrec->len,
2058 failrec->in_validation);
2059 /*
2060 * when data can be on disk more than twice, add to failrec here
2061 * (e.g. with a list for failed_mirror) to make
2062 * clean_io_failure() clean all those errors at once.
2063 */
2064 }
2065 num_copies = btrfs_num_copies(
2066 &BTRFS_I(inode)->root->fs_info->mapping_tree,
2067 failrec->logical, failrec->len);
2068 if (num_copies == 1) {
2069 /*
2070 * we only have a single copy of the data, so don't bother with
2071 * all the retry and error correction code that follows. no
2072 * matter what the error is, it is very likely to persist.
2073 */
2074 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2075 "state=%p, num_copies=%d, next_mirror %d, "
2076 "failed_mirror %d\n", state, num_copies,
2077 failrec->this_mirror, failed_mirror);
2078 free_io_failure(inode, failrec, 0);
2079 return -EIO;
2080 }
2081
2082 if (!state) {
2083 spin_lock(&tree->lock);
2084 state = find_first_extent_bit_state(tree, failrec->start,
2085 EXTENT_LOCKED);
2086 if (state && state->start != failrec->start)
2087 state = NULL;
2088 spin_unlock(&tree->lock);
2089 }
2090
2091 /*
2092 * there are two premises:
2093 * a) deliver good data to the caller
2094 * b) correct the bad sectors on disk
2095 */
2096 if (failed_bio->bi_vcnt > 1) {
2097 /*
2098 * to fulfill b), we need to know the exact failing sectors, as
2099 * we don't want to rewrite any more than the failed ones. thus,
2100 * we need separate read requests for the failed bio
2101 *
2102 * if the following BUG_ON triggers, our validation request got
2103 * merged. we need separate requests for our algorithm to work.
2104 */
2105 BUG_ON(failrec->in_validation);
2106 failrec->in_validation = 1;
2107 failrec->this_mirror = failed_mirror;
2108 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2109 } else {
2110 /*
 2111 * we're ready to fulfill a) and b) at the same time. get a good copy
2112 * of the failed sector and if we succeed, we have setup
2113 * everything for repair_io_failure to do the rest for us.
2114 */
2115 if (failrec->in_validation) {
2116 BUG_ON(failrec->this_mirror != failed_mirror);
2117 failrec->in_validation = 0;
2118 failrec->this_mirror = 0;
2119 }
2120 failrec->failed_mirror = failed_mirror;
2121 failrec->this_mirror++;
2122 if (failrec->this_mirror == failed_mirror)
2123 failrec->this_mirror++;
2124 read_mode = READ_SYNC;
2125 }
2126
2127 if (!state || failrec->this_mirror > num_copies) {
2128 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2129 "next_mirror %d, failed_mirror %d\n", state,
2130 num_copies, failrec->this_mirror, failed_mirror);
2131 free_io_failure(inode, failrec, 0);
2132 return -EIO;
2133 }
2134
2135 bio = bio_alloc(GFP_NOFS, 1);
2136 bio->bi_private = state;
2137 bio->bi_end_io = failed_bio->bi_end_io;
2138 bio->bi_sector = failrec->logical >> 9;
2139 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2140 bio->bi_size = 0;
2141
2142 bio_add_page(bio, page, failrec->len, start - page_offset(page));
2143
2144 pr_debug("bio_readpage_error: submitting new read[%#x] to "
2145 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2146 failrec->this_mirror, num_copies, failrec->in_validation);
2147
2148 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2149 failrec->bio_flags, 0);
2150 return 0;
2151}
2152
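Editor's note: the retry logic in bio_readpage_error() above reduces to picking the next mirror number, skipping the one that just failed, and giving up once every copy has been tried. The standalone model below reproduces only that selection step; the struct, helper name, and test values are invented.

/* Standalone model of the mirror selection in bio_readpage_error:
 * advance this_mirror, skip failed_mirror, stop after num_copies tries. */
#include <stdio.h>

struct failrec { int this_mirror; int failed_mirror; };

/* Returns the next mirror to read from, or -1 when all copies were tried. */
static int next_mirror(struct failrec *rec, int num_copies)
{
    rec->this_mirror++;
    if (rec->this_mirror == rec->failed_mirror)
        rec->this_mirror++;
    if (rec->this_mirror > num_copies)
        return -1;
    return rec->this_mirror;
}

int main(void)
{
    struct failrec rec = { .this_mirror = 0, .failed_mirror = 2 };
    int m;

    /* with 3 copies and mirror 2 failed, we expect to try 1, then 3 */
    while ((m = next_mirror(&rec, 3)) != -1)
        printf("retrying read from mirror %d\n", m);
    return 0;
}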
1602/* lots and lots of room for performance fixes in the end_bio funcs */ 2153/* lots and lots of room for performance fixes in the end_bio funcs */
1603 2154
1604/* 2155/*
@@ -1697,6 +2248,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1697 struct extent_state *cached = NULL; 2248 struct extent_state *cached = NULL;
1698 struct extent_state *state; 2249 struct extent_state *state;
1699 2250
2251 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2252 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2253 (long int)bio->bi_bdev);
1700 tree = &BTRFS_I(page->mapping->host)->io_tree; 2254 tree = &BTRFS_I(page->mapping->host)->io_tree;
1701 2255
1702 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2256 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,11 +2281,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1727 state); 2281 state);
1728 if (ret) 2282 if (ret)
1729 uptodate = 0; 2283 uptodate = 0;
2284 else
2285 clean_io_failure(start, page);
1730 } 2286 }
1731 if (!uptodate && tree->ops && 2287 if (!uptodate) {
1732 tree->ops->readpage_io_failed_hook) { 2288 int failed_mirror;
1733 ret = tree->ops->readpage_io_failed_hook(bio, page, 2289 failed_mirror = (int)(unsigned long)bio->bi_bdev;
1734 start, end, NULL); 2290 if (tree->ops && tree->ops->readpage_io_failed_hook)
2291 ret = tree->ops->readpage_io_failed_hook(
2292 bio, page, start, end,
2293 failed_mirror, state);
2294 else
2295 ret = bio_readpage_error(bio, page, start, end,
2296 failed_mirror, NULL);
1735 if (ret == 0) { 2297 if (ret == 0) {
1736 uptodate = 2298 uptodate =
1737 test_bit(BIO_UPTODATE, &bio->bi_flags); 2299 test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1811,6 +2373,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1811 mirror_num, bio_flags, start); 2373 mirror_num, bio_flags, start);
1812 else 2374 else
1813 submit_bio(rw, bio); 2375 submit_bio(rw, bio);
2376
1814 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2377 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1815 ret = -EOPNOTSUPP; 2378 ret = -EOPNOTSUPP;
1816 bio_put(bio); 2379 bio_put(bio);
@@ -2076,16 +2639,16 @@ out:
2076} 2639}
2077 2640
2078int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2641int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2079 get_extent_t *get_extent) 2642 get_extent_t *get_extent, int mirror_num)
2080{ 2643{
2081 struct bio *bio = NULL; 2644 struct bio *bio = NULL;
2082 unsigned long bio_flags = 0; 2645 unsigned long bio_flags = 0;
2083 int ret; 2646 int ret;
2084 2647
2085 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2648 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2086 &bio_flags); 2649 &bio_flags);
2087 if (bio) 2650 if (bio)
2088 ret = submit_one_bio(READ, bio, 0, bio_flags); 2651 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2089 return ret; 2652 return ret;
2090} 2653}
2091 2654
@@ -2136,6 +2699,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 int compressed; 2699 int compressed;
2137 int write_flags; 2700 int write_flags;
2138 unsigned long nr_written = 0; 2701 unsigned long nr_written = 0;
2702 bool fill_delalloc = true;
2139 2703
2140 if (wbc->sync_mode == WB_SYNC_ALL) 2704 if (wbc->sync_mode == WB_SYNC_ALL)
2141 write_flags = WRITE_SYNC; 2705 write_flags = WRITE_SYNC;
@@ -2145,6 +2709,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2145 trace___extent_writepage(page, inode, wbc); 2709 trace___extent_writepage(page, inode, wbc);
2146 2710
2147 WARN_ON(!PageLocked(page)); 2711 WARN_ON(!PageLocked(page));
2712
2713 ClearPageError(page);
2714
2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2715 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149 if (page->index > end_index || 2716 if (page->index > end_index ||
2150 (page->index == end_index && !pg_offset)) { 2717 (page->index == end_index && !pg_offset)) {
@@ -2166,10 +2733,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2166 2733
2167 set_page_extent_mapped(page); 2734 set_page_extent_mapped(page);
2168 2735
2736 if (!tree->ops || !tree->ops->fill_delalloc)
2737 fill_delalloc = false;
2738
2169 delalloc_start = start; 2739 delalloc_start = start;
2170 delalloc_end = 0; 2740 delalloc_end = 0;
2171 page_started = 0; 2741 page_started = 0;
2172 if (!epd->extent_locked) { 2742 if (!epd->extent_locked && fill_delalloc) {
2173 u64 delalloc_to_write = 0; 2743 u64 delalloc_to_write = 0;
2174 /* 2744 /*
2175 * make sure the wbc mapping index is at least updated 2745 * make sure the wbc mapping index is at least updated
@@ -2421,10 +2991,16 @@ retry:
2421 * swizzled back from swapper_space to tmpfs file 2991 * swizzled back from swapper_space to tmpfs file
2422 * mapping 2992 * mapping
2423 */ 2993 */
2424 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2994 if (tree->ops &&
2425 tree->ops->write_cache_pages_lock_hook(page); 2995 tree->ops->write_cache_pages_lock_hook) {
2426 else 2996 tree->ops->write_cache_pages_lock_hook(page,
2427 lock_page(page); 2997 data, flush_fn);
2998 } else {
2999 if (!trylock_page(page)) {
3000 flush_fn(data);
3001 lock_page(page);
3002 }
3003 }
2428 3004
2429 if (unlikely(page->mapping != mapping)) { 3005 if (unlikely(page->mapping != mapping)) {
2430 unlock_page(page); 3006 unlock_page(page);
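Editor's note: the hunk above replaces an unconditional lock_page() with a try-lock: if the page is already locked, the batched bios are flushed first (so the lock holder is not left waiting on I/O we are holding back) and only then does the caller block. The pthread sketch below models that generic "trylock, flush, then lock" pattern; it is not the kernel page-lock API, and flush_pending_work/lock_page_or_flush are invented names.

/* Generic model of the trylock-then-flush-then-lock pattern used above.
 * Build with -lpthread. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

static void flush_pending_work(void *data)
{
    /* stand-in for flush_fn(data): submit whatever work has been batched */
    printf("flushing pending work (%p)\n", data);
}

static void lock_page_or_flush(void *data)
{
    if (pthread_mutex_trylock(&page_lock) != 0) {
        /* Someone holds the lock; push our batched work out first so we
         * never sleep on a lock while holding back I/O its owner needs. */
        flush_pending_work(data);
        pthread_mutex_lock(&page_lock);
    }
    printf("page locked\n");
    pthread_mutex_unlock(&page_lock);
}

int main(void)
{
    lock_page_or_flush(NULL);
    return 0;
}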
@@ -2790,6 +3366,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2790 return -ENOMEM; 3366 return -ENOMEM;
2791 path->leave_spinning = 1; 3367 path->leave_spinning = 1;
2792 3368
3369 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3370 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3371
2793 /* 3372 /*
2794 * lookup the last file extent. We're not using i_size here 3373 * lookup the last file extent. We're not using i_size here
2795 * because there might be preallocation past i_size 3374 * because there might be preallocation past i_size
@@ -2837,7 +3416,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2837 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3416 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2838 &cached_state, GFP_NOFS); 3417 &cached_state, GFP_NOFS);
2839 3418
2840 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3419 em = get_extent_skip_holes(inode, start, last_for_get_extent,
2841 get_extent); 3420 get_extent);
2842 if (!em) 3421 if (!em)
2843 goto out; 3422 goto out;
@@ -2926,7 +3505,7 @@ out:
2926 return ret; 3505 return ret;
2927} 3506}
2928 3507
2929static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3508inline struct page *extent_buffer_page(struct extent_buffer *eb,
2930 unsigned long i) 3509 unsigned long i)
2931{ 3510{
2932 struct page *p; 3511 struct page *p;
@@ -2951,7 +3530,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2951 return p; 3530 return p;
2952} 3531}
2953 3532
2954static inline unsigned long num_extent_pages(u64 start, u64 len) 3533inline unsigned long num_extent_pages(u64 start, u64 len)
2955{ 3534{
2956 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3535 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2957 (start >> PAGE_CACHE_SHIFT); 3536 (start >> PAGE_CACHE_SHIFT);
@@ -3204,6 +3783,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3204 PAGECACHE_TAG_DIRTY); 3783 PAGECACHE_TAG_DIRTY);
3205 } 3784 }
3206 spin_unlock_irq(&page->mapping->tree_lock); 3785 spin_unlock_irq(&page->mapping->tree_lock);
3786 ClearPageError(page);
3207 unlock_page(page); 3787 unlock_page(page);
3208 } 3788 }
3209 return 0; 3789 return 0;
@@ -3349,8 +3929,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3349} 3929}
3350 3930
3351int read_extent_buffer_pages(struct extent_io_tree *tree, 3931int read_extent_buffer_pages(struct extent_io_tree *tree,
3352 struct extent_buffer *eb, 3932 struct extent_buffer *eb, u64 start, int wait,
3353 u64 start, int wait,
3354 get_extent_t *get_extent, int mirror_num) 3933 get_extent_t *get_extent, int mirror_num)
3355{ 3934{
3356 unsigned long i; 3935 unsigned long i;
@@ -3386,7 +3965,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3386 num_pages = num_extent_pages(eb->start, eb->len); 3965 num_pages = num_extent_pages(eb->start, eb->len);
3387 for (i = start_i; i < num_pages; i++) { 3966 for (i = start_i; i < num_pages; i++) {
3388 page = extent_buffer_page(eb, i); 3967 page = extent_buffer_page(eb, i);
3389 if (!wait) { 3968 if (wait == WAIT_NONE) {
3390 if (!trylock_page(page)) 3969 if (!trylock_page(page))
3391 goto unlock_exit; 3970 goto unlock_exit;
3392 } else { 3971 } else {
@@ -3430,7 +4009,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3430 if (bio) 4009 if (bio)
3431 submit_one_bio(READ, bio, mirror_num, bio_flags); 4010 submit_one_bio(READ, bio, mirror_num, bio_flags);
3432 4011
3433 if (ret || !wait) 4012 if (ret || wait != WAIT_COMPLETE)
3434 return ret; 4013 return ret;
3435 4014
3436 for (i = start_i; i < num_pages; i++) { 4015 for (i = start_i; i < num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e7929..7604c3001322 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,8 @@
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 24
@@ -32,6 +34,7 @@
32#define EXTENT_BUFFER_BLOCKING 1 34#define EXTENT_BUFFER_BLOCKING 1
33#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
35 38
36/* these are flags for extent_clear_unlock_delalloc */ 39/* these are flags for extent_clear_unlock_delalloc */
37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 40#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -67,7 +70,7 @@ struct extent_io_ops {
67 unsigned long bio_flags); 70 unsigned long bio_flags);
68 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
69 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
70 u64 start, u64 end, 73 u64 start, u64 end, int failed_mirror,
71 struct extent_state *state); 74 struct extent_state *state);
72 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
73 u64 start, u64 end, 76 u64 start, u64 end,
@@ -85,7 +88,8 @@ struct extent_io_ops {
85 struct extent_state *other); 88 struct extent_state *other);
86 void (*split_extent_hook)(struct inode *inode, 89 void (*split_extent_hook)(struct inode *inode,
87 struct extent_state *orig, u64 split); 90 struct extent_state *orig, u64 split);
88 int (*write_cache_pages_lock_hook)(struct page *page); 91 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
92 void (*flush_fn)(void *));
89}; 93};
90 94
91struct extent_io_tree { 95struct extent_io_tree {
@@ -185,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
185int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 189int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
186 gfp_t mask); 190 gfp_t mask);
187int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 191int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
188 get_extent_t *get_extent); 192 get_extent_t *get_extent, int mirror_num);
189int __init extent_io_init(void); 193int __init extent_io_init(void);
190void extent_io_exit(void); 194void extent_io_exit(void);
191 195
@@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
214 gfp_t mask); 218 gfp_t mask);
215int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 219int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
216 gfp_t mask); 220 gfp_t mask);
221int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
222 int bits, int clear_bits, gfp_t mask);
217int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 223int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
218 struct extent_state **cached_state, gfp_t mask); 224 struct extent_state **cached_state, gfp_t mask);
219int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 225int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
248struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 254struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
249 u64 start, unsigned long len); 255 u64 start, unsigned long len);
250void free_extent_buffer(struct extent_buffer *eb); 256void free_extent_buffer(struct extent_buffer *eb);
257#define WAIT_NONE 0
258#define WAIT_COMPLETE 1
259#define WAIT_PAGE_LOCK 2
251int read_extent_buffer_pages(struct extent_io_tree *tree, 260int read_extent_buffer_pages(struct extent_io_tree *tree,
252 struct extent_buffer *eb, u64 start, int wait, 261 struct extent_buffer *eb, u64 start, int wait,
253 get_extent_t *get_extent, int mirror_num); 262 get_extent_t *get_extent, int mirror_num);
263unsigned long num_extent_pages(u64 start, u64 len);
264struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
254 265
255static inline void extent_buffer_get(struct extent_buffer *eb) 266static inline void extent_buffer_get(struct extent_buffer *eb)
256{ 267{
@@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
300struct bio * 311struct bio *
301btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 312btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
302 gfp_t gfp_flags); 313 gfp_t gfp_flags);
314
315struct btrfs_mapping_tree;
316
317int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
318 u64 length, u64 logical, struct page *page,
319 int mirror_num);
303#endif 320#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821becd..c7fb3a4247d3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
91 struct btrfs_csum_item *item; 91 struct btrfs_csum_item *item;
92 struct extent_buffer *leaf; 92 struct extent_buffer *leaf;
93 u64 csum_offset = 0; 93 u64 csum_offset = 0;
94 u16 csum_size = 94 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
95 btrfs_super_csum_size(&root->fs_info->super_copy);
96 int csums_in_item; 95 int csums_in_item;
97 96
98 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 97 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
162 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
163 u64 disk_bytenr; 162 u64 disk_bytenr;
164 u32 diff; 163 u32 diff;
165 u16 csum_size = 164 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
166 btrfs_super_csum_size(&root->fs_info->super_copy);
167 int ret; 165 int ret;
168 struct btrfs_path *path; 166 struct btrfs_path *path;
169 struct btrfs_csum_item *item = NULL; 167 struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
290 int ret; 288 int ret;
291 size_t size; 289 size_t size;
292 u64 csum_end; 290 u64 csum_end;
293 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 291 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
294 292
295 path = btrfs_alloc_path(); 293 path = btrfs_alloc_path();
296 if (!path) 294 if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
492 u64 bytenr, u64 len) 490 u64 bytenr, u64 len)
493{ 491{
494 struct extent_buffer *leaf; 492 struct extent_buffer *leaf;
495 u16 csum_size = 493 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
496 btrfs_super_csum_size(&root->fs_info->super_copy);
497 u64 csum_end; 494 u64 csum_end;
498 u64 end_byte = bytenr + len; 495 u64 end_byte = bytenr + len;
499 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; 496 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
549 u64 csum_end; 546 u64 csum_end;
550 struct extent_buffer *leaf; 547 struct extent_buffer *leaf;
551 int ret; 548 int ret;
552 u16 csum_size = 549 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
553 btrfs_super_csum_size(&root->fs_info->super_copy);
554 int blocksize_bits = root->fs_info->sb->s_blocksize_bits; 550 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
555 551
556 root = root->fs_info->csum_root; 552 root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
676 struct btrfs_sector_sum *sector_sum; 672 struct btrfs_sector_sum *sector_sum;
677 u32 nritems; 673 u32 nritems;
678 u32 ins_size; 674 u32 ins_size;
679 u16 csum_size = 675 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
680 btrfs_super_csum_size(&root->fs_info->super_copy);
681 676
682 path = btrfs_alloc_path(); 677 path = btrfs_alloc_path();
683 if (!path) 678 if (!path)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1266f6e9cdb2..dafdfa059bf6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1069 int i; 1069 int i;
1070 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1070 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1071 struct inode *inode = fdentry(file)->d_inode; 1071 struct inode *inode = fdentry(file)->d_inode;
1072 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1072 int err = 0; 1073 int err = 0;
1073 int faili = 0; 1074 int faili = 0;
1074 u64 start_pos; 1075 u64 start_pos;
@@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1080again: 1081again:
1081 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1082 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1083 GFP_NOFS); 1084 mask);
1084 if (!pages[i]) { 1085 if (!pages[i]) {
1085 faili = i - 1; 1086 faili = i - 1;
1086 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1615,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1615 goto out; 1616 goto out;
1616 } 1617 }
1617 1618
1618 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1619 if (ret)
1620 goto out;
1621
1622 locked_end = alloc_end - 1; 1619 locked_end = alloc_end - 1;
1623 while (1) { 1620 while (1) {
1624 struct btrfs_ordered_extent *ordered; 1621 struct btrfs_ordered_extent *ordered;
@@ -1664,11 +1661,27 @@ static long btrfs_fallocate(struct file *file, int mode,
1664 if (em->block_start == EXTENT_MAP_HOLE || 1661 if (em->block_start == EXTENT_MAP_HOLE ||
1665 (cur_offset >= inode->i_size && 1662 (cur_offset >= inode->i_size &&
1666 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1663 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1664
1665 /*
1666 * Make sure we have enough space before we do the
1667 * allocation.
1668 */
1669 ret = btrfs_check_data_free_space(inode, last_byte -
1670 cur_offset);
1671 if (ret) {
1672 free_extent_map(em);
1673 break;
1674 }
1675
1667 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 1676 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1668 last_byte - cur_offset, 1677 last_byte - cur_offset,
1669 1 << inode->i_blkbits, 1678 1 << inode->i_blkbits,
1670 offset + len, 1679 offset + len,
1671 &alloc_hint); 1680 &alloc_hint);
1681
1682 /* Let go of our reservation. */
1683 btrfs_free_reserved_data_space(inode, last_byte -
1684 cur_offset);
1672 if (ret < 0) { 1685 if (ret < 0) {
1673 free_extent_map(em); 1686 free_extent_map(em);
1674 break; 1687 break;
@@ -1694,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1694 } 1707 }
1695 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 1708 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1696 &cached_state, GFP_NOFS); 1709 &cached_state, GFP_NOFS);
1697
1698 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1699out: 1710out:
1700 mutex_unlock(&inode->i_mutex); 1711 mutex_unlock(&inode->i_mutex);
1701 return ret; 1712 return ret;
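Editor's note: with this change btrfs_fallocate() reserves data space per hole it actually preallocates and releases the reservation right afterwards, instead of reserving the whole requested range up front. The sketch below models that reserve -> preallocate -> release-per-range flow against an invented quota pool; the function names echo the diff but the implementation is purely illustrative.

/* Model of the per-range reserve -> preallocate -> release flow above. */
#include <stdio.h>
#include <stdint.h>
#include <errno.h>

static uint64_t free_bytes = 1 << 20;   /* invented 1 MiB data pool */

static int check_data_free_space(uint64_t len)
{
    if (len > free_bytes)
        return -ENOSPC;
    free_bytes -= len;                  /* reserve */
    return 0;
}

static void free_reserved_data_space(uint64_t len)
{
    free_bytes += len;                  /* let go of our reservation */
}

static int prealloc_range(uint64_t start, uint64_t len)
{
    printf("preallocating [%llu, %llu)\n",
           (unsigned long long)start, (unsigned long long)(start + len));
    return 0;
}

int main(void)
{
    /* two invented holes inside the fallocate range */
    uint64_t holes[2][2] = { { 0, 65536 }, { 131072, 65536 } };

    for (int i = 0; i < 2; i++) {
        int ret = check_data_free_space(holes[i][1]);
        if (ret)
            return ret;
        ret = prealloc_range(holes[i][0], holes[i][1]);
        free_reserved_data_space(holes[i][1]);   /* release right after */
        if (ret)
            return ret;
    }
    return 0;
}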
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d0..6e5b7e463698 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/math64.h> 22#include <linux/math64.h>
23#include <linux/ratelimit.h>
23#include "ctree.h" 24#include "ctree.h"
24#include "free-space-cache.h" 25#include "free-space-cache.h"
25#include "transaction.h" 26#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
84 *block_group, struct btrfs_path *path) 85 *block_group, struct btrfs_path *path)
85{ 86{
86 struct inode *inode = NULL; 87 struct inode *inode = NULL;
88 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
87 89
88 spin_lock(&block_group->lock); 90 spin_lock(&block_group->lock);
89 if (block_group->inode) 91 if (block_group->inode)
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 100 return inode;
99 101
100 spin_lock(&block_group->lock); 102 spin_lock(&block_group->lock);
101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { 103 if (!((BTRFS_I(inode)->flags & flags) == flags)) {
102 printk(KERN_INFO "Old style space inode found, converting.\n"); 104 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; 105 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
106 BTRFS_INODE_NODATACOW;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR; 107 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 } 108 }
106 109
107 if (!btrfs_fs_closing(root->fs_info)) { 110 if (!block_group->iref) {
108 block_group->inode = igrab(inode); 111 block_group->inode = igrab(inode);
109 block_group->iref = 1; 112 block_group->iref = 1;
110 } 113 }
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
122 struct btrfs_free_space_header *header; 125 struct btrfs_free_space_header *header;
123 struct btrfs_inode_item *inode_item; 126 struct btrfs_inode_item *inode_item;
124 struct extent_buffer *leaf; 127 struct extent_buffer *leaf;
128 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
125 int ret; 129 int ret;
126 130
127 ret = btrfs_insert_empty_inode(trans, root, path, ino); 131 ret = btrfs_insert_empty_inode(trans, root, path, ino);
128 if (ret) 132 if (ret)
129 return ret; 133 return ret;
130 134
135 /* We inline crc's for the free disk space cache */
136 if (ino != BTRFS_FREE_INO_OBJECTID)
137 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
138
131 leaf = path->nodes[0]; 139 leaf = path->nodes[0];
132 inode_item = btrfs_item_ptr(leaf, path->slots[0], 140 inode_item = btrfs_item_ptr(leaf, path->slots[0],
133 struct btrfs_inode_item); 141 struct btrfs_inode_item);
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
140 btrfs_set_inode_uid(leaf, inode_item, 0); 148 btrfs_set_inode_uid(leaf, inode_item, 0);
141 btrfs_set_inode_gid(leaf, inode_item, 0); 149 btrfs_set_inode_gid(leaf, inode_item, 0);
142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 150 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 151 btrfs_set_inode_flags(leaf, inode_item, flags);
144 BTRFS_INODE_PREALLOC);
145 btrfs_set_inode_nlink(leaf, inode_item, 1); 152 btrfs_set_inode_nlink(leaf, inode_item, 1);
146 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 153 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
147 btrfs_set_inode_block_group(leaf, inode_item, offset); 154 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
191 struct inode *inode) 198 struct inode *inode)
192{ 199{
193 struct btrfs_block_rsv *rsv; 200 struct btrfs_block_rsv *rsv;
201 u64 needed_bytes;
194 loff_t oldsize; 202 loff_t oldsize;
195 int ret = 0; 203 int ret = 0;
196 204
197 rsv = trans->block_rsv; 205 rsv = trans->block_rsv;
198 trans->block_rsv = root->orphan_block_rsv; 206 trans->block_rsv = &root->fs_info->global_block_rsv;
199 ret = btrfs_block_rsv_check(trans, root, 207
200 root->orphan_block_rsv, 208 /* 1 for slack space, 1 for updating the inode */
201 0, 5); 209 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
202 if (ret) 210 btrfs_calc_trans_metadata_size(root, 1);
203 return ret; 211
212 spin_lock(&trans->block_rsv->lock);
213 if (trans->block_rsv->reserved < needed_bytes) {
214 spin_unlock(&trans->block_rsv->lock);
215 trans->block_rsv = rsv;
216 return -ENOSPC;
217 }
218 spin_unlock(&trans->block_rsv->lock);
204 219
205 oldsize = i_size_read(inode); 220 oldsize = i_size_read(inode);
206 btrfs_i_size_write(inode, 0); 221 btrfs_i_size_write(inode, 0);
@@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
213 ret = btrfs_truncate_inode_items(trans, root, inode, 228 ret = btrfs_truncate_inode_items(trans, root, inode,
214 0, BTRFS_EXTENT_DATA_KEY); 229 0, BTRFS_EXTENT_DATA_KEY);
215 230
216 trans->block_rsv = rsv;
217 if (ret) { 231 if (ret) {
232 trans->block_rsv = rsv;
218 WARN_ON(1); 233 WARN_ON(1);
219 return ret; 234 return ret;
220 } 235 }
221 236
222 ret = btrfs_update_inode(trans, root, inode); 237 ret = btrfs_update_inode(trans, root, inode);
238 trans->block_rsv = rsv;
239
223 return ret; 240 return ret;
224} 241}
225 242
@@ -242,26 +259,348 @@ static int readahead_cache(struct inode *inode)
242 return 0; 259 return 0;
243} 260}
244 261
262struct io_ctl {
263 void *cur, *orig;
264 struct page *page;
265 struct page **pages;
266 struct btrfs_root *root;
267 unsigned long size;
268 int index;
269 int num_pages;
270 unsigned check_crcs:1;
271};
272
273static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
274 struct btrfs_root *root)
275{
276 memset(io_ctl, 0, sizeof(struct io_ctl));
277 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
278 PAGE_CACHE_SHIFT;
279 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
280 GFP_NOFS);
281 if (!io_ctl->pages)
282 return -ENOMEM;
283 io_ctl->root = root;
284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
285 io_ctl->check_crcs = 1;
286 return 0;
287}
288
289static void io_ctl_free(struct io_ctl *io_ctl)
290{
291 kfree(io_ctl->pages);
292}
293
294static void io_ctl_unmap_page(struct io_ctl *io_ctl)
295{
296 if (io_ctl->cur) {
297 kunmap(io_ctl->page);
298 io_ctl->cur = NULL;
299 io_ctl->orig = NULL;
300 }
301}
302
303static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
304{
305 WARN_ON(io_ctl->cur);
306 BUG_ON(io_ctl->index >= io_ctl->num_pages);
307 io_ctl->page = io_ctl->pages[io_ctl->index++];
308 io_ctl->cur = kmap(io_ctl->page);
309 io_ctl->orig = io_ctl->cur;
310 io_ctl->size = PAGE_CACHE_SIZE;
311 if (clear)
312 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
313}
314
315static void io_ctl_drop_pages(struct io_ctl *io_ctl)
316{
317 int i;
318
319 io_ctl_unmap_page(io_ctl);
320
321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]);
323 unlock_page(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]);
325 }
326}
327
328static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
329 int uptodate)
330{
331 struct page *page;
332 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
333 int i;
334
335 for (i = 0; i < io_ctl->num_pages; i++) {
336 page = find_or_create_page(inode->i_mapping, i, mask);
337 if (!page) {
338 io_ctl_drop_pages(io_ctl);
339 return -ENOMEM;
340 }
341 io_ctl->pages[i] = page;
342 if (uptodate && !PageUptodate(page)) {
343 btrfs_readpage(NULL, page);
344 lock_page(page);
345 if (!PageUptodate(page)) {
346 printk(KERN_ERR "btrfs: error reading free "
347 "space cache\n");
348 io_ctl_drop_pages(io_ctl);
349 return -EIO;
350 }
351 }
352 }
353
354 for (i = 0; i < io_ctl->num_pages; i++) {
355 clear_page_dirty_for_io(io_ctl->pages[i]);
356 set_page_extent_mapped(io_ctl->pages[i]);
357 }
358
359 return 0;
360}
361
362static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
363{
364 u64 *val;
365
366 io_ctl_map_page(io_ctl, 1);
367
368 /*
369 * Skip the csum areas. If we don't check crcs then we just have a
370 * 64bit chunk at the front of the first page.
371 */
372 if (io_ctl->check_crcs) {
373 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
374 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
375 } else {
376 io_ctl->cur += sizeof(u64);
377 io_ctl->size -= sizeof(u64) * 2;
378 }
379
380 val = io_ctl->cur;
381 *val = cpu_to_le64(generation);
382 io_ctl->cur += sizeof(u64);
383}
384
385static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
386{
387 u64 *gen;
388
389 /*
390 * Skip the crc area. If we don't check crcs then we just have a 64bit
391 * chunk at the front of the first page.
392 */
393 if (io_ctl->check_crcs) {
394 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
395 io_ctl->size -= sizeof(u64) +
396 (sizeof(u32) * io_ctl->num_pages);
397 } else {
398 io_ctl->cur += sizeof(u64);
399 io_ctl->size -= sizeof(u64) * 2;
400 }
401
402 gen = io_ctl->cur;
403 if (le64_to_cpu(*gen) != generation) {
404 printk_ratelimited(KERN_ERR "btrfs: space cache generation "
405 "(%Lu) does not match inode (%Lu)\n", *gen,
406 generation);
407 io_ctl_unmap_page(io_ctl);
408 return -EIO;
409 }
410 io_ctl->cur += sizeof(u64);
411 return 0;
412}
413
414static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
415{
416 u32 *tmp;
417 u32 crc = ~(u32)0;
418 unsigned offset = 0;
419
420 if (!io_ctl->check_crcs) {
421 io_ctl_unmap_page(io_ctl);
422 return;
423 }
424
425 if (index == 0)
426 offset = sizeof(u32) * io_ctl->num_pages;
427
428 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
429 PAGE_CACHE_SIZE - offset);
430 btrfs_csum_final(crc, (char *)&crc);
431 io_ctl_unmap_page(io_ctl);
432 tmp = kmap(io_ctl->pages[0]);
433 tmp += index;
434 *tmp = crc;
435 kunmap(io_ctl->pages[0]);
436}
437
438static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
439{
440 u32 *tmp, val;
441 u32 crc = ~(u32)0;
442 unsigned offset = 0;
443
444 if (!io_ctl->check_crcs) {
445 io_ctl_map_page(io_ctl, 0);
446 return 0;
447 }
448
449 if (index == 0)
450 offset = sizeof(u32) * io_ctl->num_pages;
451
452 tmp = kmap(io_ctl->pages[0]);
453 tmp += index;
454 val = *tmp;
455 kunmap(io_ctl->pages[0]);
456
457 io_ctl_map_page(io_ctl, 0);
458 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
459 PAGE_CACHE_SIZE - offset);
460 btrfs_csum_final(crc, (char *)&crc);
461 if (val != crc) {
462 printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
463 "space cache\n");
464 io_ctl_unmap_page(io_ctl);
465 return -EIO;
466 }
467
468 return 0;
469}
470
471static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
472 void *bitmap)
473{
474 struct btrfs_free_space_entry *entry;
475
476 if (!io_ctl->cur)
477 return -ENOSPC;
478
479 entry = io_ctl->cur;
480 entry->offset = cpu_to_le64(offset);
481 entry->bytes = cpu_to_le64(bytes);
482 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
483 BTRFS_FREE_SPACE_EXTENT;
484 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
485 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
486
487 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
488 return 0;
489
490 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
491
492 /* No more pages to map */
493 if (io_ctl->index >= io_ctl->num_pages)
494 return 0;
495
496 /* map the next page */
497 io_ctl_map_page(io_ctl, 1);
498 return 0;
499}
500
501static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
502{
503 if (!io_ctl->cur)
504 return -ENOSPC;
505
506 /*
507 * If we aren't at the start of the current page, unmap this one and
508 * map the next one if there is any left.
509 */
510 if (io_ctl->cur != io_ctl->orig) {
511 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
512 if (io_ctl->index >= io_ctl->num_pages)
513 return -ENOSPC;
514 io_ctl_map_page(io_ctl, 0);
515 }
516
517 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
518 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
519 if (io_ctl->index < io_ctl->num_pages)
520 io_ctl_map_page(io_ctl, 0);
521 return 0;
522}
523
524static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
525{
526 /*
527 * If we're not on the boundary we know we've modified the page and we
528 * need to crc the page.
529 */
530 if (io_ctl->cur != io_ctl->orig)
531 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
532 else
533 io_ctl_unmap_page(io_ctl);
534
535 while (io_ctl->index < io_ctl->num_pages) {
536 io_ctl_map_page(io_ctl, 1);
537 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
538 }
539}
540
541static int io_ctl_read_entry(struct io_ctl *io_ctl,
542 struct btrfs_free_space *entry, u8 *type)
543{
544 struct btrfs_free_space_entry *e;
545 int ret;
546
547 if (!io_ctl->cur) {
548 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
549 if (ret)
550 return ret;
551 }
552
553 e = io_ctl->cur;
554 entry->offset = le64_to_cpu(e->offset);
555 entry->bytes = le64_to_cpu(e->bytes);
556 *type = e->type;
557 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
558 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
559
560 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
561 return 0;
562
563 io_ctl_unmap_page(io_ctl);
564
565 return 0;
566}
567
568static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
569 struct btrfs_free_space *entry)
570{
571 int ret;
572
573 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
574 if (ret)
575 return ret;
576
577 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
578 io_ctl_unmap_page(io_ctl);
579
580 return 0;
581}
582
245int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 583int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
246 struct btrfs_free_space_ctl *ctl, 584 struct btrfs_free_space_ctl *ctl,
247 struct btrfs_path *path, u64 offset) 585 struct btrfs_path *path, u64 offset)
248{ 586{
249 struct btrfs_free_space_header *header; 587 struct btrfs_free_space_header *header;
250 struct extent_buffer *leaf; 588 struct extent_buffer *leaf;
251 struct page *page; 589 struct io_ctl io_ctl;
252 struct btrfs_key key; 590 struct btrfs_key key;
591 struct btrfs_free_space *e, *n;
253 struct list_head bitmaps; 592 struct list_head bitmaps;
254 u64 num_entries; 593 u64 num_entries;
255 u64 num_bitmaps; 594 u64 num_bitmaps;
256 u64 generation; 595 u64 generation;
257 pgoff_t index = 0; 596 u8 type;
258 int ret = 0; 597 int ret = 0;
259 598
260 INIT_LIST_HEAD(&bitmaps); 599 INIT_LIST_HEAD(&bitmaps);
261 600
262 /* Nothing in the space cache, goodbye */ 601 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) 602 if (!i_size_read(inode))
264 goto out; 603 return 0;
265 604
266 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 605 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
267 key.offset = offset; 606 key.offset = offset;
@@ -269,11 +608,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
269 608
270 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 609 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
271 if (ret < 0) 610 if (ret < 0)
272 goto out; 611 return 0;
273 else if (ret > 0) { 612 else if (ret > 0) {
274 btrfs_release_path(path); 613 btrfs_release_path(path);
275 ret = 0; 614 return 0;
276 goto out;
277 } 615 }
278 616
279 ret = -1; 617 ret = -1;
@@ -291,169 +629,102 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
291 " not match free space cache generation (%llu)\n", 629 " not match free space cache generation (%llu)\n",
292 (unsigned long long)BTRFS_I(inode)->generation, 630 (unsigned long long)BTRFS_I(inode)->generation,
293 (unsigned long long)generation); 631 (unsigned long long)generation);
294 goto out; 632 return 0;
295 } 633 }
296 634
297 if (!num_entries) 635 if (!num_entries)
298 goto out; 636 return 0;
299 637
638 io_ctl_init(&io_ctl, inode, root);
300 ret = readahead_cache(inode); 639 ret = readahead_cache(inode);
301 if (ret) 640 if (ret)
302 goto out; 641 goto out;
303 642
304 while (1) { 643 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
305 struct btrfs_free_space_entry *entry; 644 if (ret)
306 struct btrfs_free_space *e; 645 goto out;
307 void *addr;
308 unsigned long offset = 0;
309 int need_loop = 0;
310 646
311 if (!num_entries && !num_bitmaps) 647 ret = io_ctl_check_crc(&io_ctl, 0);
312 break; 648 if (ret)
649 goto free_cache;
650
651 ret = io_ctl_check_generation(&io_ctl, generation);
652 if (ret)
653 goto free_cache;
313 654
314 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 655 while (num_entries) {
315 if (!page) 656 e = kmem_cache_zalloc(btrfs_free_space_cachep,
657 GFP_NOFS);
658 if (!e)
316 goto free_cache; 659 goto free_cache;
317 660
318 if (!PageUptodate(page)) { 661 ret = io_ctl_read_entry(&io_ctl, e, &type);
319 btrfs_readpage(NULL, page); 662 if (ret) {
320 lock_page(page); 663 kmem_cache_free(btrfs_free_space_cachep, e);
321 if (!PageUptodate(page)) { 664 goto free_cache;
322 unlock_page(page);
323 page_cache_release(page);
324 printk(KERN_ERR "btrfs: error reading free "
325 "space cache\n");
326 goto free_cache;
327 }
328 } 665 }
329 addr = kmap(page);
330 666
331 if (index == 0) { 667 if (!e->bytes) {
332 u64 *gen; 668 kmem_cache_free(btrfs_free_space_cachep, e);
669 goto free_cache;
670 }
333 671
334 /* 672 if (type == BTRFS_FREE_SPACE_EXTENT) {
335 * We put a bogus crc in the front of the first page in 673 spin_lock(&ctl->tree_lock);
336 * case old kernels try to mount a fs with the new 674 ret = link_free_space(ctl, e);
337 * format to make sure they discard the cache. 675 spin_unlock(&ctl->tree_lock);
338 */ 676 if (ret) {
339 addr += sizeof(u64); 677 printk(KERN_ERR "Duplicate entries in "
340 offset += sizeof(u64); 678 "free space cache, dumping\n");
341 679 kmem_cache_free(btrfs_free_space_cachep, e);
342 gen = addr;
343 if (*gen != BTRFS_I(inode)->generation) {
344 printk(KERN_ERR "btrfs: space cache generation"
345 " (%llu) does not match inode (%llu)\n",
346 (unsigned long long)*gen,
347 (unsigned long long)
348 BTRFS_I(inode)->generation);
349 kunmap(page);
350 unlock_page(page);
351 page_cache_release(page);
352 goto free_cache; 680 goto free_cache;
353 } 681 }
354 addr += sizeof(u64); 682 } else {
355 offset += sizeof(u64); 683 BUG_ON(!num_bitmaps);
356 } 684 num_bitmaps--;
357 entry = addr; 685 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
358 686 if (!e->bitmap) {
359 while (1) { 687 kmem_cache_free(
360 if (!num_entries) 688 btrfs_free_space_cachep, e);
361 break;
362
363 need_loop = 1;
364 e = kmem_cache_zalloc(btrfs_free_space_cachep,
365 GFP_NOFS);
366 if (!e) {
367 kunmap(page);
368 unlock_page(page);
369 page_cache_release(page);
370 goto free_cache; 689 goto free_cache;
371 } 690 }
372 691 spin_lock(&ctl->tree_lock);
373 e->offset = le64_to_cpu(entry->offset); 692 ret = link_free_space(ctl, e);
374 e->bytes = le64_to_cpu(entry->bytes); 693 ctl->total_bitmaps++;
375 if (!e->bytes) { 694 ctl->op->recalc_thresholds(ctl);
376 kunmap(page); 695 spin_unlock(&ctl->tree_lock);
696 if (ret) {
697 printk(KERN_ERR "Duplicate entries in "
698 "free space cache, dumping\n");
377 kmem_cache_free(btrfs_free_space_cachep, e); 699 kmem_cache_free(btrfs_free_space_cachep, e);
378 unlock_page(page);
379 page_cache_release(page);
380 goto free_cache; 700 goto free_cache;
381 } 701 }
382 702 list_add_tail(&e->list, &bitmaps);
383 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
384 spin_lock(&ctl->tree_lock);
385 ret = link_free_space(ctl, e);
386 spin_unlock(&ctl->tree_lock);
387 if (ret) {
388 printk(KERN_ERR "Duplicate entries in "
389 "free space cache, dumping\n");
390 kunmap(page);
391 unlock_page(page);
392 page_cache_release(page);
393 goto free_cache;
394 }
395 } else {
396 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
397 if (!e->bitmap) {
398 kunmap(page);
399 kmem_cache_free(
400 btrfs_free_space_cachep, e);
401 unlock_page(page);
402 page_cache_release(page);
403 goto free_cache;
404 }
405 spin_lock(&ctl->tree_lock);
406 ret = link_free_space(ctl, e);
407 ctl->total_bitmaps++;
408 ctl->op->recalc_thresholds(ctl);
409 spin_unlock(&ctl->tree_lock);
410 if (ret) {
411 printk(KERN_ERR "Duplicate entries in "
412 "free space cache, dumping\n");
413 kunmap(page);
414 unlock_page(page);
415 page_cache_release(page);
416 goto free_cache;
417 }
418 list_add_tail(&e->list, &bitmaps);
419 }
420
421 num_entries--;
422 offset += sizeof(struct btrfs_free_space_entry);
423 if (offset + sizeof(struct btrfs_free_space_entry) >=
424 PAGE_CACHE_SIZE)
425 break;
426 entry++;
427 } 703 }
428 704
429 /* 705 num_entries--;
430 * We read an entry out of this page, we need to move on to the 706 }
431 * next page.
432 */
433 if (need_loop) {
434 kunmap(page);
435 goto next;
436 }
437 707
438 /* 708 io_ctl_unmap_page(&io_ctl);
439 * We add the bitmaps at the end of the entries in order that 709
440 * the bitmap entries are added to the cache. 710 /*
441 */ 711 * We add the bitmaps at the end of the entries in order that
442 e = list_entry(bitmaps.next, struct btrfs_free_space, list); 712 * the bitmap entries are added to the cache.
713 */
714 list_for_each_entry_safe(e, n, &bitmaps, list) {
443 list_del_init(&e->list); 715 list_del_init(&e->list);
444 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 716 ret = io_ctl_read_bitmap(&io_ctl, e);
445 kunmap(page); 717 if (ret)
446 num_bitmaps--; 718 goto free_cache;
447next:
448 unlock_page(page);
449 page_cache_release(page);
450 index++;
451 } 719 }
452 720
721 io_ctl_drop_pages(&io_ctl);
453 ret = 1; 722 ret = 1;
454out: 723out:
724 io_ctl_free(&io_ctl);
455 return ret; 725 return ret;
456free_cache: 726free_cache:
727 io_ctl_drop_pages(&io_ctl);
457 __btrfs_remove_free_space_cache(ctl); 728 __btrfs_remove_free_space_cache(ctl);
458 goto out; 729 goto out;
459} 730}
@@ -465,7 +736,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
465 struct btrfs_root *root = fs_info->tree_root; 736 struct btrfs_root *root = fs_info->tree_root;
466 struct inode *inode; 737 struct inode *inode;
467 struct btrfs_path *path; 738 struct btrfs_path *path;
468 int ret; 739 int ret = 0;
469 bool matched; 740 bool matched;
470 u64 used = btrfs_block_group_used(&block_group->item); 741 u64 used = btrfs_block_group_used(&block_group->item);
471 742
@@ -497,6 +768,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
497 return 0; 768 return 0;
498 } 769 }
499 770
771 /* We may have converted the inode and made the cache invalid. */
772 spin_lock(&block_group->lock);
773 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
774 spin_unlock(&block_group->lock);
775 goto out;
776 }
777 spin_unlock(&block_group->lock);
778
500 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 779 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
501 path, block_group->key.objectid); 780 path, block_group->key.objectid);
502 btrfs_free_path(path); 781 btrfs_free_path(path);
@@ -530,6 +809,19 @@ out:
530 return ret; 809 return ret;
531} 810}
532 811
812/**
813 * __btrfs_write_out_cache - write out cached info to an inode
814 * @root - the root the inode belongs to
815 * @ctl - the free space cache we are going to write out
816 * @block_group - the block_group for this cache if it belongs to a block_group
817 * @trans - the trans handle
818 * @path - the path to use
819 * @offset - the offset for the key we'll insert
820 *
821 * This function writes out a free space cache struct to disk for quick recovery
822 * on mount. This will return 0 if it was successful in writing the cache out,
823 * and -1 if it was not.
824 */
533int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 825int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
534 struct btrfs_free_space_ctl *ctl, 826 struct btrfs_free_space_ctl *ctl,
535 struct btrfs_block_group_cache *block_group, 827 struct btrfs_block_group_cache *block_group,
@@ -540,42 +832,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
540 struct extent_buffer *leaf; 832 struct extent_buffer *leaf;
541 struct rb_node *node; 833 struct rb_node *node;
542 struct list_head *pos, *n; 834 struct list_head *pos, *n;
543 struct page **pages;
544 struct page *page;
545 struct extent_state *cached_state = NULL; 835 struct extent_state *cached_state = NULL;
546 struct btrfs_free_cluster *cluster = NULL; 836 struct btrfs_free_cluster *cluster = NULL;
547 struct extent_io_tree *unpin = NULL; 837 struct extent_io_tree *unpin = NULL;
838 struct io_ctl io_ctl;
548 struct list_head bitmap_list; 839 struct list_head bitmap_list;
549 struct btrfs_key key; 840 struct btrfs_key key;
550 u64 start, end, len; 841 u64 start, end, len;
551 u64 bytes = 0;
552 u32 crc = ~(u32)0;
553 int index = 0, num_pages = 0;
554 int entries = 0; 842 int entries = 0;
555 int bitmaps = 0; 843 int bitmaps = 0;
556 int ret = -1; 844 int ret;
557 bool next_page = false; 845 int err = -1;
558 bool out_of_space = false;
559 846
560 INIT_LIST_HEAD(&bitmap_list); 847 INIT_LIST_HEAD(&bitmap_list);
561 848
562 node = rb_first(&ctl->free_space_offset);
563 if (!node)
564 return 0;
565
566 if (!i_size_read(inode)) 849 if (!i_size_read(inode))
567 return -1; 850 return -1;
568 851
569 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 852 io_ctl_init(&io_ctl, inode, root);
570 PAGE_CACHE_SHIFT;
571
572 filemap_write_and_wait(inode->i_mapping);
573 btrfs_wait_ordered_range(inode, inode->i_size &
574 ~(root->sectorsize - 1), (u64)-1);
575
576 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
577 if (!pages)
578 return -1;
579 853
580 /* Get the cluster for this block_group if it exists */ 854 /* Get the cluster for this block_group if it exists */
581 if (block_group && !list_empty(&block_group->cluster_list)) 855 if (block_group && !list_empty(&block_group->cluster_list))
@@ -589,30 +863,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
589 */ 863 */
590 unpin = root->fs_info->pinned_extents; 864 unpin = root->fs_info->pinned_extents;
591 865
592 /* 866 /* Lock all pages first so we can lock the extent safely. */
593 * Lock all pages first so we can lock the extent safely. 867 io_ctl_prepare_pages(&io_ctl, inode, 0);
594 *
595 * NOTE: Because we hold the ref the entire time we're going to write to
596 * the page find_get_page should never fail, so we don't do a check
597 * after find_get_page at this point. Just putting this here so people
598 * know and don't freak out.
599 */
600 while (index < num_pages) {
601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
602 if (!page) {
603 int i;
604 868
605 for (i = 0; i < num_pages; i++) {
606 unlock_page(pages[i]);
607 page_cache_release(pages[i]);
608 }
609 goto out;
610 }
611 pages[index] = page;
612 index++;
613 }
614
615 index = 0;
616 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 869 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
617 0, &cached_state, GFP_NOFS); 870 0, &cached_state, GFP_NOFS);
618 871
@@ -623,189 +876,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
623 if (block_group) 876 if (block_group)
624 start = block_group->key.objectid; 877 start = block_group->key.objectid;
625 878
626 /* Write out the extent entries */ 879 node = rb_first(&ctl->free_space_offset);
627 do { 880 if (!node && cluster) {
628 struct btrfs_free_space_entry *entry; 881 node = rb_first(&cluster->root);
629 void *addr, *orig; 882 cluster = NULL;
630 unsigned long offset = 0; 883 }
631 884
632 next_page = false; 885 /* Make sure we can fit our crcs into the first page */
886 if (io_ctl.check_crcs &&
887 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
888 WARN_ON(1);
889 goto out_nospc;
890 }
633 891
634 if (index >= num_pages) { 892 io_ctl_set_generation(&io_ctl, trans->transid);
635 out_of_space = true;
636 break;
637 }
638 893
639 page = pages[index]; 894 /* Write out the extent entries */
895 while (node) {
896 struct btrfs_free_space *e;
640 897
641 orig = addr = kmap(page); 898 e = rb_entry(node, struct btrfs_free_space, offset_index);
642 if (index == 0) { 899 entries++;
643 u64 *gen;
644 900
645 /* 901 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
646 * We're going to put in a bogus crc for this page to 902 e->bitmap);
647 * make sure that old kernels who aren't aware of this 903 if (ret)
648 * format will be sure to discard the cache. 904 goto out_nospc;
649 */
650 addr += sizeof(u64);
651 offset += sizeof(u64);
652 905
653 gen = addr; 906 if (e->bitmap) {
654 *gen = trans->transid; 907 list_add_tail(&e->list, &bitmap_list);
655 addr += sizeof(u64); 908 bitmaps++;
656 offset += sizeof(u64);
657 } 909 }
658 entry = addr; 910 node = rb_next(node);
659 911 if (!node && cluster) {
660 memset(addr, 0, PAGE_CACHE_SIZE - offset); 912 node = rb_first(&cluster->root);
661 while (node && !next_page) { 913 cluster = NULL;
662 struct btrfs_free_space *e;
663
664 e = rb_entry(node, struct btrfs_free_space, offset_index);
665 entries++;
666
667 entry->offset = cpu_to_le64(e->offset);
668 entry->bytes = cpu_to_le64(e->bytes);
669 if (e->bitmap) {
670 entry->type = BTRFS_FREE_SPACE_BITMAP;
671 list_add_tail(&e->list, &bitmap_list);
672 bitmaps++;
673 } else {
674 entry->type = BTRFS_FREE_SPACE_EXTENT;
675 }
676 node = rb_next(node);
677 if (!node && cluster) {
678 node = rb_first(&cluster->root);
679 cluster = NULL;
680 }
681 offset += sizeof(struct btrfs_free_space_entry);
682 if (offset + sizeof(struct btrfs_free_space_entry) >=
683 PAGE_CACHE_SIZE)
684 next_page = true;
685 entry++;
686 } 914 }
915 }
687 916
688 /* 917 /*
689 * We want to add any pinned extents to our free space cache 918 * We want to add any pinned extents to our free space cache
690 * so we don't leak the space 919 * so we don't leak the space
691 */ 920 */
692 while (block_group && !next_page && 921 while (block_group && (start < block_group->key.objectid +
693 (start < block_group->key.objectid + 922 block_group->key.offset)) {
694 block_group->key.offset)) { 923 ret = find_first_extent_bit(unpin, start, &start, &end,
695 ret = find_first_extent_bit(unpin, start, &start, &end, 924 EXTENT_DIRTY);
696 EXTENT_DIRTY); 925 if (ret) {
697 if (ret) { 926 ret = 0;
698 ret = 0; 927 break;
699 break;
700 }
701
702 /* This pinned extent is out of our range */
703 if (start >= block_group->key.objectid +
704 block_group->key.offset)
705 break;
706
707 len = block_group->key.objectid +
708 block_group->key.offset - start;
709 len = min(len, end + 1 - start);
710
711 entries++;
712 entry->offset = cpu_to_le64(start);
713 entry->bytes = cpu_to_le64(len);
714 entry->type = BTRFS_FREE_SPACE_EXTENT;
715
716 start = end + 1;
717 offset += sizeof(struct btrfs_free_space_entry);
718 if (offset + sizeof(struct btrfs_free_space_entry) >=
719 PAGE_CACHE_SIZE)
720 next_page = true;
721 entry++;
722 } 928 }
723 929
724 /* Generate bogus crc value */ 930 /* This pinned extent is out of our range */
725 if (index == 0) { 931 if (start >= block_group->key.objectid +
726 u32 *tmp; 932 block_group->key.offset)
727 crc = btrfs_csum_data(root, orig + sizeof(u64), crc, 933 break;
728 PAGE_CACHE_SIZE - sizeof(u64));
729 btrfs_csum_final(crc, (char *)&crc);
730 crc++;
731 tmp = orig;
732 *tmp = crc;
733 }
734 934
735 kunmap(page); 935 len = block_group->key.objectid +
936 block_group->key.offset - start;
937 len = min(len, end + 1 - start);
736 938
737 bytes += PAGE_CACHE_SIZE; 939 entries++;
940 ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
941 if (ret)
942 goto out_nospc;
738 943
739 index++; 944 start = end + 1;
740 } while (node || next_page); 945 }
741 946
742 /* Write out the bitmaps */ 947 /* Write out the bitmaps */
743 list_for_each_safe(pos, n, &bitmap_list) { 948 list_for_each_safe(pos, n, &bitmap_list) {
744 void *addr;
745 struct btrfs_free_space *entry = 949 struct btrfs_free_space *entry =
746 list_entry(pos, struct btrfs_free_space, list); 950 list_entry(pos, struct btrfs_free_space, list);
747 951
748 if (index >= num_pages) { 952 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
749 out_of_space = true; 953 if (ret)
750 break; 954 goto out_nospc;
751 }
752 page = pages[index];
753
754 addr = kmap(page);
755 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
756 kunmap(page);
757 bytes += PAGE_CACHE_SIZE;
758
759 list_del_init(&entry->list); 955 list_del_init(&entry->list);
760 index++;
761 }
762
763 if (out_of_space) {
764 btrfs_drop_pages(pages, num_pages);
765 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
766 i_size_read(inode) - 1, &cached_state,
767 GFP_NOFS);
768 ret = 0;
769 goto out;
770 } 956 }
771 957
772 /* Zero out the rest of the pages just to make sure */ 958 /* Zero out the rest of the pages just to make sure */
773 while (index < num_pages) { 959 io_ctl_zero_remaining_pages(&io_ctl);
774 void *addr;
775 960
776 page = pages[index]; 961 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
777 addr = kmap(page); 962 0, i_size_read(inode), &cached_state);
778 memset(addr, 0, PAGE_CACHE_SIZE); 963 io_ctl_drop_pages(&io_ctl);
779 kunmap(page);
780 bytes += PAGE_CACHE_SIZE;
781 index++;
782 }
783
784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
785 bytes, &cached_state);
786 btrfs_drop_pages(pages, num_pages);
787 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 964 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
788 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 965 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
789 966
790 if (ret) { 967 if (ret)
791 ret = 0;
792 goto out; 968 goto out;
793 }
794 969
795 BTRFS_I(inode)->generation = trans->transid;
796 970
797 filemap_write_and_wait(inode->i_mapping); 971 ret = filemap_write_and_wait(inode->i_mapping);
972 if (ret)
973 goto out;
798 974
799 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 975 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
800 key.offset = offset; 976 key.offset = offset;
801 key.type = 0; 977 key.type = 0;
802 978
803 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 979 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
804 if (ret < 0) { 980 if (ret < 0) {
805 ret = -1; 981 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 982 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
807 EXTENT_DIRTY | EXTENT_DELALLOC | 983 GFP_NOFS);
808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
809 goto out; 984 goto out;
810 } 985 }
811 leaf = path->nodes[0]; 986 leaf = path->nodes[0];
@@ -816,15 +991,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
816 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 991 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
817 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 992 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
818 found_key.offset != offset) { 993 found_key.offset != offset) {
819 ret = -1; 994 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
820 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 995 inode->i_size - 1,
821 EXTENT_DIRTY | EXTENT_DELALLOC | 996 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
822 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 997 NULL, GFP_NOFS);
823 GFP_NOFS);
824 btrfs_release_path(path); 998 btrfs_release_path(path);
825 goto out; 999 goto out;
826 } 1000 }
827 } 1001 }
1002
1003 BTRFS_I(inode)->generation = trans->transid;
828 header = btrfs_item_ptr(leaf, path->slots[0], 1004 header = btrfs_item_ptr(leaf, path->slots[0],
829 struct btrfs_free_space_header); 1005 struct btrfs_free_space_header);
830 btrfs_set_free_space_entries(leaf, header, entries); 1006 btrfs_set_free_space_entries(leaf, header, entries);
@@ -833,16 +1009,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 btrfs_mark_buffer_dirty(leaf); 1009 btrfs_mark_buffer_dirty(leaf);
834 btrfs_release_path(path); 1010 btrfs_release_path(path);
835 1011
836 ret = 1; 1012 err = 0;
837
838out: 1013out:
839 kfree(pages); 1014 io_ctl_free(&io_ctl);
840 if (ret != 1) { 1015 if (err) {
841 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 1016 invalidate_inode_pages2(inode->i_mapping);
842 BTRFS_I(inode)->generation = 0; 1017 BTRFS_I(inode)->generation = 0;
843 } 1018 }
844 btrfs_update_inode(trans, root, inode); 1019 btrfs_update_inode(trans, root, inode);
845 return ret; 1020 return err;
1021
1022out_nospc:
1023 list_for_each_safe(pos, n, &bitmap_list) {
1024 struct btrfs_free_space *entry =
1025 list_entry(pos, struct btrfs_free_space, list);
1026 list_del_init(&entry->list);
1027 }
1028 io_ctl_drop_pages(&io_ctl);
1029 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1030 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1031 goto out;
846} 1032}
847 1033
848int btrfs_write_out_cache(struct btrfs_root *root, 1034int btrfs_write_out_cache(struct btrfs_root *root,
@@ -869,14 +1055,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
869 1055
870 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1056 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
871 path, block_group->key.objectid); 1057 path, block_group->key.objectid);
872 if (ret < 0) { 1058 if (ret) {
873 spin_lock(&block_group->lock); 1059 spin_lock(&block_group->lock);
874 block_group->disk_cache_state = BTRFS_DC_ERROR; 1060 block_group->disk_cache_state = BTRFS_DC_ERROR;
875 spin_unlock(&block_group->lock); 1061 spin_unlock(&block_group->lock);
876 ret = 0; 1062 ret = 0;
877 1063#ifdef DEBUG
878 printk(KERN_ERR "btrfs: failed to write free space cache " 1064
879 "for block group %llu\n", block_group->key.objectid); 1065 "for block group %llu\n", block_group->key.objectid);
1066#endif
880 } 1067 }
881 1068
882 iput(inode); 1069 iput(inode);
@@ -1662,7 +1849,13 @@ again:
1662 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1849 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1663 1, 0); 1850 1, 0);
1664 if (!info) { 1851 if (!info) {
1665 WARN_ON(1); 1852 /* the tree logging code might be calling us before we
1853 * have fully loaded the free space rbtree for this
1854 * block group. So it is possible the entry won't
1855 * be in the rbtree yet at all. The caching code
1856 * will make sure not to put it in the rbtree if
1857 * the logging code has pinned it.
1858 */
1666 goto out_lock; 1859 goto out_lock;
1667 } 1860 }
1668 } 1861 }
@@ -1701,6 +1894,7 @@ again:
1701 ctl->total_bitmaps--; 1894 ctl->total_bitmaps--;
1702 } 1895 }
1703 kmem_cache_free(btrfs_free_space_cachep, info); 1896 kmem_cache_free(btrfs_free_space_cachep, info);
1897 ret = 0;
1704 goto out_lock; 1898 goto out_lock;
1705 } 1899 }
1706 1900
@@ -1708,7 +1902,8 @@ again:
1708 unlink_free_space(ctl, info); 1902 unlink_free_space(ctl, info);
1709 info->offset += bytes; 1903 info->offset += bytes;
1710 info->bytes -= bytes; 1904 info->bytes -= bytes;
1711 link_free_space(ctl, info); 1905 ret = link_free_space(ctl, info);
1906 WARN_ON(ret);
1712 goto out_lock; 1907 goto out_lock;
1713 } 1908 }
1714 1909
@@ -2267,16 +2462,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2267{ 2462{
2268 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2463 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2269 struct btrfs_free_space *entry; 2464 struct btrfs_free_space *entry;
2270 struct rb_node *node;
2271 int ret = -ENOSPC; 2465 int ret = -ENOSPC;
2466 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2272 2467
2273 if (ctl->total_bitmaps == 0) 2468 if (ctl->total_bitmaps == 0)
2274 return -ENOSPC; 2469 return -ENOSPC;
2275 2470
2276 /* 2471 /*
2277 * First check our cached list of bitmaps and see if there is an entry 2472 * The bitmap that covers offset won't be in the list unless offset
2278 * here that will work. 2473 * is just its start offset.
2279 */ 2474 */
2475 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2476 if (entry->offset != bitmap_offset) {
2477 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2478 if (entry && list_empty(&entry->list))
2479 list_add(&entry->list, bitmaps);
2480 }
2481
2280 list_for_each_entry(entry, bitmaps, list) { 2482 list_for_each_entry(entry, bitmaps, list) {
2281 if (entry->bytes < min_bytes) 2483 if (entry->bytes < min_bytes)
2282 continue; 2484 continue;
@@ -2287,38 +2489,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2287 } 2489 }
2288 2490
2289 /* 2491 /*
2290 * If we do have entries on our list and we are here then we didn't find 2492 * The bitmaps list has all the bitmaps that record free space
2291 * anything, so go ahead and get the next entry after the last entry in 2493 * starting after offset, so no more search is required.
2292 * this list and start the search from there.
2293 */ 2494 */
2294 if (!list_empty(bitmaps)) { 2495 return -ENOSPC;
2295 entry = list_entry(bitmaps->prev, struct btrfs_free_space,
2296 list);
2297 node = rb_next(&entry->offset_index);
2298 if (!node)
2299 return -ENOSPC;
2300 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2301 goto search;
2302 }
2303
2304 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
2305 if (!entry)
2306 return -ENOSPC;
2307
2308search:
2309 node = &entry->offset_index;
2310 do {
2311 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2312 node = rb_next(&entry->offset_index);
2313 if (!entry->bitmap)
2314 continue;
2315 if (entry->bytes < min_bytes)
2316 continue;
2317 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2318 bytes, min_bytes);
2319 } while (ret && node);
2320
2321 return ret;
2322} 2496}
2323 2497
2324/* 2498/*
@@ -2336,8 +2510,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2336 u64 offset, u64 bytes, u64 empty_size) 2510 u64 offset, u64 bytes, u64 empty_size)
2337{ 2511{
2338 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2512 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2339 struct list_head bitmaps;
2340 struct btrfs_free_space *entry, *tmp; 2513 struct btrfs_free_space *entry, *tmp;
2514 LIST_HEAD(bitmaps);
2341 u64 min_bytes; 2515 u64 min_bytes;
2342 int ret; 2516 int ret;
2343 2517
@@ -2376,7 +2550,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2376 goto out; 2550 goto out;
2377 } 2551 }
2378 2552
2379 INIT_LIST_HEAD(&bitmaps);
2380 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2553 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2381 bytes, min_bytes); 2554 bytes, min_bytes);
2382 if (ret) 2555 if (ret)
@@ -2472,9 +2645,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2472 spin_unlock(&ctl->tree_lock); 2645 spin_unlock(&ctl->tree_lock);
2473 2646
2474 if (bytes >= minlen) { 2647 if (bytes >= minlen) {
2475 int update_ret; 2648 struct btrfs_space_info *space_info;
2476 update_ret = btrfs_update_reserved_bytes(block_group, 2649 int update = 0;
2477 bytes, 1, 1); 2650
2651 space_info = block_group->space_info;
2652 spin_lock(&space_info->lock);
2653 spin_lock(&block_group->lock);
2654 if (!block_group->ro) {
2655 block_group->reserved += bytes;
2656 space_info->bytes_reserved += bytes;
2657 update = 1;
2658 }
2659 spin_unlock(&block_group->lock);
2660 spin_unlock(&space_info->lock);
2478 2661
2479 ret = btrfs_error_discard_extent(fs_info->extent_root, 2662 ret = btrfs_error_discard_extent(fs_info->extent_root,
2480 start, 2663 start,
@@ -2482,9 +2665,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2482 &actually_trimmed); 2665 &actually_trimmed);
2483 2666
2484 btrfs_add_free_space(block_group, start, bytes); 2667 btrfs_add_free_space(block_group, start, bytes);
2485 if (!update_ret) 2668 if (update) {
2486 btrfs_update_reserved_bytes(block_group, 2669 spin_lock(&space_info->lock);
2487 bytes, 0, 1); 2670 spin_lock(&block_group->lock);
2671 if (block_group->ro)
2672 space_info->bytes_readonly += bytes;
2673 block_group->reserved -= bytes;
2674 space_info->bytes_reserved -= bytes;
2675 spin_unlock(&space_info->lock);
2676 spin_unlock(&block_group->lock);
2677 }
2488 2678
2489 if (ret) 2679 if (ret)
2490 break; 2680 break;
@@ -2643,9 +2833,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
2643 return 0; 2833 return 0;
2644 2834
2645 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 2835 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2646 if (ret < 0) 2836 if (ret) {
2837 btrfs_delalloc_release_metadata(inode, inode->i_size);
2838#ifdef DEBUG
2647 printk(KERN_ERR "btrfs: failed to write free ino cache " 2839 printk(KERN_ERR "btrfs: failed to write free ino cache "
2648 "for root %llu\n", root->root_key.objectid); 2840 "for root %llu\n", root->root_key.objectid);
2841#endif
2842 }
2649 2843
2650 iput(inode); 2844 iput(inode);
2651 return ret; 2845 return ret;
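
The free-space-cache.c changes above move the space cache to a checksummed on-disk format: when crcs are checked, the first page now begins with one crc32 per page followed by the 64-bit generation, while the free-ino cache (which inlines no crcs) keeps a single 64-bit slot ahead of the generation. A minimal sketch of the header-size arithmetic implied by io_ctl_set_generation() and io_ctl_set_crc(); the helper name below is illustrative and not part of the patch:

/*
 * Sketch only: header layout implied by io_ctl_set_generation() and
 * io_ctl_set_crc() in the hunks above.  The helper name is hypothetical,
 * not taken from the kernel patch.
 */
#include <stddef.h>
#include <stdint.h>

static size_t space_cache_header_bytes(int num_pages, int check_crcs)
{
	if (check_crcs)
		/* one u32 crc per page, then the u64 generation */
		return sizeof(uint32_t) * num_pages + sizeof(uint64_t);
	/* no-crc layout: one u64 slot, then the u64 generation */
	return 2 * sizeof(uint64_t);
}

This also explains the new sanity check in __btrfs_write_out_cache(): if the per-page crc array alone would not fit in the first page, the write is aborted via out_nospc.
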
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa871..f8962a957d65 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
399 struct btrfs_path *path; 399 struct btrfs_path *path;
400 struct inode *inode; 400 struct inode *inode;
401 struct btrfs_block_rsv *rsv;
402 u64 num_bytes;
401 u64 alloc_hint = 0; 403 u64 alloc_hint = 0;
402 int ret; 404 int ret;
403 int prealloc; 405 int prealloc;
@@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
421 if (!path) 423 if (!path)
422 return -ENOMEM; 424 return -ENOMEM;
423 425
426 rsv = trans->block_rsv;
427 trans->block_rsv = &root->fs_info->trans_block_rsv;
428
429 num_bytes = trans->bytes_reserved;
430 /*
431 * 1 item for inode item insertion if need
432 * 3 items for inode item update (in the worst case)
433 * 1 item for free space object
434 * 3 items for pre-allocation
435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
438 trans->bytes_reserved);
439 if (ret)
440 goto out;
424again: 441again:
425 inode = lookup_free_ino_inode(root, path); 442 inode = lookup_free_ino_inode(root, path);
426 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
427 ret = PTR_ERR(inode); 444 ret = PTR_ERR(inode);
428 goto out; 445 goto out_release;
429 } 446 }
430 447
431 if (IS_ERR(inode)) { 448 if (IS_ERR(inode)) {
@@ -434,7 +451,7 @@ again:
434 451
435 ret = create_free_ino_inode(root, trans, path); 452 ret = create_free_ino_inode(root, trans, path);
436 if (ret) 453 if (ret)
437 goto out; 454 goto out_release;
438 goto again; 455 goto again;
439 } 456 }
440 457
@@ -465,21 +482,26 @@ again:
465 /* Just to make sure we have enough space */ 482 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE; 483 prealloc += 8 * PAGE_CACHE_SIZE;
467 484
468 ret = btrfs_check_data_free_space(inode, prealloc); 485 ret = btrfs_delalloc_reserve_space(inode, prealloc);
469 if (ret) 486 if (ret)
470 goto out_put; 487 goto out_put;
471 488
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 489 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint); 490 prealloc, prealloc, &alloc_hint);
474 if (ret) 491 if (ret) {
492 btrfs_delalloc_release_space(inode, prealloc);
475 goto out_put; 493 goto out_put;
494 }
476 btrfs_free_reserved_data_space(inode, prealloc); 495 btrfs_free_reserved_data_space(inode, prealloc);
477 496
497 ret = btrfs_write_out_ino_cache(root, trans, path);
478out_put: 498out_put:
479 iput(inode); 499 iput(inode);
500out_release:
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
480out: 502out:
481 if (ret == 0) 503 trans->block_rsv = rsv;
482 ret = btrfs_write_out_ino_cache(root, trans, path); 504 trans->bytes_reserved = num_bytes;
483 505
484 btrfs_free_path(path); 506 btrfs_free_path(path);
485 return ret; 507 return ret;
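
The inode-map.c hunk above pre-reserves metadata before writing out the free-ino cache; the factor of 8 passed to btrfs_calc_trans_metadata_size() is simply the sum of the item counts listed in the new comment. A minimal sketch of that budget, with illustrative enumerator names that do not appear in the patch:

/*
 * Sketch only: the item budget behind btrfs_calc_trans_metadata_size(root, 8)
 * in btrfs_save_ino_cache().  Names are illustrative, not from the patch.
 */
enum ino_cache_items {
	INO_CACHE_INODE_INSERT = 1,	/* inode item insertion, if needed */
	INO_CACHE_INODE_UPDATE = 3,	/* inode item update, worst case */
	INO_CACHE_FREE_SPACE   = 1,	/* free space object */
	INO_CACHE_PREALLOC     = 3,	/* pre-allocation */
};

#define INO_CACHE_RESERVED_ITEMS			\
	(INO_CACHE_INODE_INSERT + INO_CACHE_INODE_UPDATE +	\
	 INO_CACHE_FREE_SPACE + INO_CACHE_PREALLOC)	/* == 8 */
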
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 75686a61bd45..526dd51a1966 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,10 +45,10 @@
45#include "btrfs_inode.h" 45#include "btrfs_inode.h"
46#include "ioctl.h" 46#include "ioctl.h"
47#include "print-tree.h" 47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h" 48#include "ordered-data.h"
50#include "xattr.h" 49#include "xattr.h"
51#include "tree-log.h" 50#include "tree-log.h"
51#include "volumes.h"
52#include "compression.h" 52#include "compression.h"
53#include "locking.h" 53#include "locking.h"
54#include "free-space-cache.h" 54#include "free-space-cache.h"
@@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode,
93 struct page *locked_page, 93 struct page *locked_page,
94 u64 start, u64 end, int *page_started, 94 u64 start, u64 end, int *page_started,
95 unsigned long *nr_written, int unlock); 95 unsigned long *nr_written, int unlock);
96static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root, struct inode *inode);
96 98
97static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 99static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
98 struct inode *inode, struct inode *dir, 100 struct inode *inode, struct inode *dir,
@@ -393,7 +395,10 @@ again:
393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 395 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
394 WARN_ON(pages); 396 WARN_ON(pages);
395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 397 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
396 BUG_ON(!pages); 398 if (!pages) {
399 /* just bail out to the uncompressed code */
400 goto cont;
401 }
397 402
398 if (BTRFS_I(inode)->force_compress) 403 if (BTRFS_I(inode)->force_compress)
399 compress_type = BTRFS_I(inode)->force_compress; 404 compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +429,7 @@ again:
424 will_compress = 1; 429 will_compress = 1;
425 } 430 }
426 } 431 }
432cont:
427 if (start == 0) { 433 if (start == 0) {
428 trans = btrfs_join_transaction(root); 434 trans = btrfs_join_transaction(root);
429 BUG_ON(IS_ERR(trans)); 435 BUG_ON(IS_ERR(trans));
@@ -820,7 +826,7 @@ static noinline int cow_file_range(struct inode *inode,
820 } 826 }
821 827
822 BUG_ON(disk_num_bytes > 828 BUG_ON(disk_num_bytes >
823 btrfs_super_total_bytes(&root->fs_info->super_copy)); 829 btrfs_super_total_bytes(root->fs_info->super_copy));
824 830
825 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 831 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
826 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 832 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1737,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1737 trans = btrfs_join_transaction(root); 1743 trans = btrfs_join_transaction(root);
1738 BUG_ON(IS_ERR(trans)); 1744 BUG_ON(IS_ERR(trans));
1739 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1745 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1740 ret = btrfs_update_inode(trans, root, inode); 1746 ret = btrfs_update_inode_fallback(trans, root, inode);
1741 BUG_ON(ret); 1747 BUG_ON(ret);
1742 } 1748 }
1743 goto out; 1749 goto out;
@@ -1787,17 +1793,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1787 1793
1788 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1794 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1789 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1795 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1790 ret = btrfs_update_inode(trans, root, inode); 1796 ret = btrfs_update_inode_fallback(trans, root, inode);
1791 BUG_ON(ret); 1797 BUG_ON(ret);
1792 } 1798 }
1793 ret = 0; 1799 ret = 0;
1794out: 1800out:
1795 if (nolock) { 1801 if (root != root->fs_info->tree_root)
1796 if (trans)
1797 btrfs_end_transaction_nolock(trans, root);
1798 } else {
1799 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1802 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1800 if (trans) 1803 if (trans) {
1804 if (nolock)
1805 btrfs_end_transaction_nolock(trans, root);
1806 else
1801 btrfs_end_transaction(trans, root); 1807 btrfs_end_transaction(trans, root);
1802 } 1808 }
1803 1809
@@ -1819,153 +1825,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1819} 1825}
1820 1826
1821/* 1827/*
1822 * When IO fails, either with EIO or csum verification fails, we
1823 * try other mirrors that might have a good copy of the data. This
1824 * io_failure_record is used to record state as we go through all the
1825 * mirrors. If another mirror has good data, the page is set up to date
1826 * and things continue. If a good mirror can't be found, the original
1827 * bio end_io callback is called to indicate things have failed.
1828 */
1829struct io_failure_record {
1830 struct page *page;
1831 u64 start;
1832 u64 len;
1833 u64 logical;
1834 unsigned long bio_flags;
1835 int last_mirror;
1836};
1837
1838static int btrfs_io_failed_hook(struct bio *failed_bio,
1839 struct page *page, u64 start, u64 end,
1840 struct extent_state *state)
1841{
1842 struct io_failure_record *failrec = NULL;
1843 u64 private;
1844 struct extent_map *em;
1845 struct inode *inode = page->mapping->host;
1846 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1847 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1848 struct bio *bio;
1849 int num_copies;
1850 int ret;
1851 int rw;
1852 u64 logical;
1853
1854 ret = get_state_private(failure_tree, start, &private);
1855 if (ret) {
1856 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1857 if (!failrec)
1858 return -ENOMEM;
1859 failrec->start = start;
1860 failrec->len = end - start + 1;
1861 failrec->last_mirror = 0;
1862 failrec->bio_flags = 0;
1863
1864 read_lock(&em_tree->lock);
1865 em = lookup_extent_mapping(em_tree, start, failrec->len);
1866 if (em->start > start || em->start + em->len < start) {
1867 free_extent_map(em);
1868 em = NULL;
1869 }
1870 read_unlock(&em_tree->lock);
1871
1872 if (IS_ERR_OR_NULL(em)) {
1873 kfree(failrec);
1874 return -EIO;
1875 }
1876 logical = start - em->start;
1877 logical = em->block_start + logical;
1878 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1879 logical = em->block_start;
1880 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1881 extent_set_compress_type(&failrec->bio_flags,
1882 em->compress_type);
1883 }
1884 failrec->logical = logical;
1885 free_extent_map(em);
1886 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1887 EXTENT_DIRTY, GFP_NOFS);
1888 set_state_private(failure_tree, start,
1889 (u64)(unsigned long)failrec);
1890 } else {
1891 failrec = (struct io_failure_record *)(unsigned long)private;
1892 }
1893 num_copies = btrfs_num_copies(
1894 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1895 failrec->logical, failrec->len);
1896 failrec->last_mirror++;
1897 if (!state) {
1898 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1899 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1900 failrec->start,
1901 EXTENT_LOCKED);
1902 if (state && state->start != failrec->start)
1903 state = NULL;
1904 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1905 }
1906 if (!state || failrec->last_mirror > num_copies) {
1907 set_state_private(failure_tree, failrec->start, 0);
1908 clear_extent_bits(failure_tree, failrec->start,
1909 failrec->start + failrec->len - 1,
1910 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1911 kfree(failrec);
1912 return -EIO;
1913 }
1914 bio = bio_alloc(GFP_NOFS, 1);
1915 bio->bi_private = state;
1916 bio->bi_end_io = failed_bio->bi_end_io;
1917 bio->bi_sector = failrec->logical >> 9;
1918 bio->bi_bdev = failed_bio->bi_bdev;
1919 bio->bi_size = 0;
1920
1921 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1922 if (failed_bio->bi_rw & REQ_WRITE)
1923 rw = WRITE;
1924 else
1925 rw = READ;
1926
1927 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1928 failrec->last_mirror,
1929 failrec->bio_flags, 0);
1930 return ret;
1931}
1932
1933/*
1934 * each time an IO finishes, we do a fast check in the IO failure tree
1935 * to see if we need to process or clean up an io_failure_record
1936 */
1937static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1938{
1939 u64 private;
1940 u64 private_failure;
1941 struct io_failure_record *failure;
1942 int ret;
1943
1944 private = 0;
1945 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1946 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1947 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1948 start, &private_failure);
1949 if (ret == 0) {
1950 failure = (struct io_failure_record *)(unsigned long)
1951 private_failure;
1952 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1953 failure->start, 0);
1954 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1955 failure->start,
1956 failure->start + failure->len - 1,
1957 EXTENT_DIRTY | EXTENT_LOCKED,
1958 GFP_NOFS);
1959 kfree(failure);
1960 }
1961 }
1962 return 0;
1963}
1964
1965/*
1966 * when reads are done, we need to check csums to verify the data is correct 1828 * when reads are done, we need to check csums to verify the data is correct
1967 * if there's a match, we allow the bio to finish. If not, we go through 1829 * if there's a match, we allow the bio to finish. If not, the code in
1968 * the io_failure_record routines to find good copies 1830 * extent_io.c will try to find good copies for us.
1969 */ 1831 */
1970static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 1832static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1971 struct extent_state *state) 1833 struct extent_state *state)
@@ -2011,10 +1873,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2011 1873
2012 kunmap_atomic(kaddr, KM_USER0); 1874 kunmap_atomic(kaddr, KM_USER0);
2013good: 1875good:
2014 /* if the io failure tree for this inode is non-empty,
2015 * check to see if we've recovered from a failed IO
2016 */
2017 btrfs_clean_io_failures(inode, start);
2018 return 0; 1876 return 0;
2019 1877
2020zeroit: 1878zeroit:
@@ -2079,89 +1937,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2079 up_read(&root->fs_info->cleanup_work_sem); 1937 up_read(&root->fs_info->cleanup_work_sem);
2080} 1938}
2081 1939
2082/*
2083 * calculate extra metadata reservation when snapshotting a subvolume
2084 * contains orphan files.
2085 */
2086void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2087 struct btrfs_pending_snapshot *pending,
2088 u64 *bytes_to_reserve)
2089{
2090 struct btrfs_root *root;
2091 struct btrfs_block_rsv *block_rsv;
2092 u64 num_bytes;
2093 int index;
2094
2095 root = pending->root;
2096 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2097 return;
2098
2099 block_rsv = root->orphan_block_rsv;
2100
2101 /* orphan block reservation for the snapshot */
2102 num_bytes = block_rsv->size;
2103
2104 /*
2105 * after the snapshot is created, COWing tree blocks may use more
2106 * space than it frees. So we should make sure there is enough
2107 * reserved space.
2108 */
2109 index = trans->transid & 0x1;
2110 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2111 num_bytes += block_rsv->size -
2112 (block_rsv->reserved + block_rsv->freed[index]);
2113 }
2114
2115 *bytes_to_reserve += num_bytes;
2116}
2117
2118void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2119 struct btrfs_pending_snapshot *pending)
2120{
2121 struct btrfs_root *root = pending->root;
2122 struct btrfs_root *snap = pending->snap;
2123 struct btrfs_block_rsv *block_rsv;
2124 u64 num_bytes;
2125 int index;
2126 int ret;
2127
2128 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2129 return;
2130
2131 /* refill source subvolume's orphan block reservation */
2132 block_rsv = root->orphan_block_rsv;
2133 index = trans->transid & 0x1;
2134 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2135 num_bytes = block_rsv->size -
2136 (block_rsv->reserved + block_rsv->freed[index]);
2137 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2138 root->orphan_block_rsv,
2139 num_bytes);
2140 BUG_ON(ret);
2141 }
2142
2143 /* setup orphan block reservation for the snapshot */
2144 block_rsv = btrfs_alloc_block_rsv(snap);
2145 BUG_ON(!block_rsv);
2146
2147 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2148 snap->orphan_block_rsv = block_rsv;
2149
2150 num_bytes = root->orphan_block_rsv->size;
2151 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2152 block_rsv, num_bytes);
2153 BUG_ON(ret);
2154
2155#if 0
2156 /* insert orphan item for the snapshot */
2157 WARN_ON(!root->orphan_item_inserted);
2158 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2159 snap->root_key.objectid);
2160 BUG_ON(ret);
2161 snap->orphan_item_inserted = 1;
2162#endif
2163}
2164
2165enum btrfs_orphan_cleanup_state { 1940enum btrfs_orphan_cleanup_state {
2166 ORPHAN_CLEANUP_STARTED = 1, 1941 ORPHAN_CLEANUP_STARTED = 1,
2167 ORPHAN_CLEANUP_DONE = 2, 1942 ORPHAN_CLEANUP_DONE = 2,
@@ -2247,9 +2022,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2247 } 2022 }
2248 spin_unlock(&root->orphan_lock); 2023 spin_unlock(&root->orphan_lock);
2249 2024
2250 if (block_rsv)
2251 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2252
2253 /* grab metadata reservation from transaction handle */ 2025 /* grab metadata reservation from transaction handle */
2254 if (reserve) { 2026 if (reserve) {
2255 ret = btrfs_orphan_reserve_metadata(trans, inode); 2027 ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2316,6 +2088,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2316 struct btrfs_key key, found_key; 2088 struct btrfs_key key, found_key;
2317 struct btrfs_trans_handle *trans; 2089 struct btrfs_trans_handle *trans;
2318 struct inode *inode; 2090 struct inode *inode;
2091 u64 last_objectid = 0;
2319 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2092 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2320 2093
2321 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2094 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2367,41 +2140,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2367 * crossing root thing. we store the inode number in the 2140 * crossing root thing. we store the inode number in the
2368 * offset of the orphan item. 2141 * offset of the orphan item.
2369 */ 2142 */
2143
2144 if (found_key.offset == last_objectid) {
2145 printk(KERN_ERR "btrfs: Error removing orphan entry, "
2146 "stopping orphan cleanup\n");
2147 ret = -EINVAL;
2148 goto out;
2149 }
2150
2151 last_objectid = found_key.offset;
2152
2370 found_key.objectid = found_key.offset; 2153 found_key.objectid = found_key.offset;
2371 found_key.type = BTRFS_INODE_ITEM_KEY; 2154 found_key.type = BTRFS_INODE_ITEM_KEY;
2372 found_key.offset = 0; 2155 found_key.offset = 0;
2373 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2156 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2374 if (IS_ERR(inode)) { 2157 ret = PTR_RET(inode);
2375 ret = PTR_ERR(inode); 2158 if (ret && ret != -ESTALE)
2376 goto out; 2159 goto out;
2377 }
2378 2160
2379 /* 2161 /*
2380 * add this inode to the orphan list so btrfs_orphan_del does 2162 * Inode is already gone but the orphan item is still there,
2381 * the proper thing when we hit it 2163 * kill the orphan item.
2382 */ 2164 */
2383 spin_lock(&root->orphan_lock); 2165 if (ret == -ESTALE) {
2384 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2166 trans = btrfs_start_transaction(root, 1);
2385 spin_unlock(&root->orphan_lock);
2386
2387 /*
2388 * if this is a bad inode, means we actually succeeded in
2389 * removing the inode, but not the orphan record, which means
2390 * we need to manually delete the orphan since iput will just
2391 * do a destroy_inode
2392 */
2393 if (is_bad_inode(inode)) {
2394 trans = btrfs_start_transaction(root, 0);
2395 if (IS_ERR(trans)) { 2167 if (IS_ERR(trans)) {
2396 ret = PTR_ERR(trans); 2168 ret = PTR_ERR(trans);
2397 goto out; 2169 goto out;
2398 } 2170 }
2399 btrfs_orphan_del(trans, inode); 2171 ret = btrfs_del_orphan_item(trans, root,
2172 found_key.objectid);
2173 BUG_ON(ret);
2400 btrfs_end_transaction(trans, root); 2174 btrfs_end_transaction(trans, root);
2401 iput(inode);
2402 continue; 2175 continue;
2403 } 2176 }
2404 2177
2178 /*
2179 * add this inode to the orphan list so btrfs_orphan_del does
2180 * the proper thing when we hit it
2181 */
2182 spin_lock(&root->orphan_lock);
2183 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2184 spin_unlock(&root->orphan_lock);
2185
2405 /* if we have links, this was a truncate, lets do that */ 2186 /* if we have links, this was a truncate, lets do that */
2406 if (inode->i_nlink) { 2187 if (inode->i_nlink) {
2407 if (!S_ISREG(inode->i_mode)) { 2188 if (!S_ISREG(inode->i_mode)) {
@@ -2420,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2420 if (ret) 2201 if (ret)
2421 goto out; 2202 goto out;
2422 } 2203 }
2204 /* release the path since we're done with it */
2205 btrfs_release_path(path);
2206
2423 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2207 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2424 2208
2425 if (root->orphan_block_rsv) 2209 if (root->orphan_block_rsv)
@@ -2647,7 +2431,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2647/* 2431/*
2648 * copy everything in the in-memory inode into the btree. 2432 * copy everything in the in-memory inode into the btree.
2649 */ 2433 */
2650noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2434static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
2651 struct btrfs_root *root, struct inode *inode) 2435 struct btrfs_root *root, struct inode *inode)
2652{ 2436{
2653 struct btrfs_inode_item *inode_item; 2437 struct btrfs_inode_item *inode_item;
@@ -2655,21 +2439,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2655 struct extent_buffer *leaf; 2439 struct extent_buffer *leaf;
2656 int ret; 2440 int ret;
2657 2441
2658 /*
2659 * If the inode is a free space inode, we can deadlock during commit
2660 * if we put it into the delayed code.
2661 *
2662 * The data relocation inode should also be directly updated
2663 * without delay
2664 */
2665 if (!btrfs_is_free_space_inode(root, inode)
2666 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2667 ret = btrfs_delayed_update_inode(trans, root, inode);
2668 if (!ret)
2669 btrfs_set_inode_last_trans(trans, inode);
2670 return ret;
2671 }
2672
2673 path = btrfs_alloc_path(); 2442 path = btrfs_alloc_path();
2674 if (!path) 2443 if (!path)
2675 return -ENOMEM; 2444 return -ENOMEM;
@@ -2698,6 +2467,43 @@ failed:
2698} 2467}
2699 2468
2700/* 2469/*
2470 * copy everything in the in-memory inode into the btree.
2471 */
2472noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2473 struct btrfs_root *root, struct inode *inode)
2474{
2475 int ret;
2476
2477 /*
2478 * If the inode is a free space inode, we can deadlock during commit
2479 * if we put it into the delayed code.
2480 *
2481 * The data relocation inode should also be directly updated
2482 * without delay
2483 */
2484 if (!btrfs_is_free_space_inode(root, inode)
2485 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2486 ret = btrfs_delayed_update_inode(trans, root, inode);
2487 if (!ret)
2488 btrfs_set_inode_last_trans(trans, inode);
2489 return ret;
2490 }
2491
2492 return btrfs_update_inode_item(trans, root, inode);
2493}
2494
2495static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
2496 struct btrfs_root *root, struct inode *inode)
2497{
2498 int ret;
2499
2500 ret = btrfs_update_inode(trans, root, inode);
2501 if (ret == -ENOSPC)
2502 return btrfs_update_inode_item(trans, root, inode);
2503 return ret;
2504}
2505
2506/*
2701 * unlink helper that gets used here in inode.c and in the tree logging 2507 * unlink helper that gets used here in inode.c and in the tree logging
2702 * recovery code. It remove a link in a directory with a given name, and 2508 * recovery code. It remove a link in a directory with a given name, and
2703 * also drops the back refs in the inode to the directory 2509 * also drops the back refs in the inode to the directory
@@ -2835,7 +2641,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2835 u64 ino = btrfs_ino(inode); 2641 u64 ino = btrfs_ino(inode);
2836 u64 dir_ino = btrfs_ino(dir); 2642 u64 dir_ino = btrfs_ino(dir);
2837 2643
2838 trans = btrfs_start_transaction(root, 10); 2644 /*
2645 * 1 for the possible orphan item
2646 * 1 for the dir item
2647 * 1 for the dir index
2648 * 1 for the inode ref
2649 * 1 for the inode ref in the tree log
2650 * 2 for the dir entries in the log
2651 * 1 for the inode
2652 */
2653 trans = btrfs_start_transaction(root, 8);
2839 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2654 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2840 return trans; 2655 return trans;
2841 2656
@@ -2858,7 +2673,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2858 return ERR_PTR(-ENOMEM); 2673 return ERR_PTR(-ENOMEM);
2859 } 2674 }
2860 2675
2861 trans = btrfs_start_transaction(root, 0); 2676 /* 1 for the orphan item */
2677 trans = btrfs_start_transaction(root, 1);
2862 if (IS_ERR(trans)) { 2678 if (IS_ERR(trans)) {
2863 btrfs_free_path(path); 2679 btrfs_free_path(path);
2864 root->fs_info->enospc_unlink = 0; 2680 root->fs_info->enospc_unlink = 0;
@@ -2963,6 +2779,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2963 err = 0; 2779 err = 0;
2964out: 2780out:
2965 btrfs_free_path(path); 2781 btrfs_free_path(path);
2782 /* Migrate the orphan reservation over */
2783 if (!err)
2784 err = btrfs_block_rsv_migrate(trans->block_rsv,
2785 &root->fs_info->global_block_rsv,
2786 trans->bytes_reserved);
2787
2966 if (err) { 2788 if (err) {
2967 btrfs_end_transaction(trans, root); 2789 btrfs_end_transaction(trans, root);
2968 root->fs_info->enospc_unlink = 0; 2790 root->fs_info->enospc_unlink = 0;
@@ -2977,6 +2799,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2977 struct btrfs_root *root) 2799 struct btrfs_root *root)
2978{ 2800{
2979 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2801 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2802 btrfs_block_rsv_release(root, trans->block_rsv,
2803 trans->bytes_reserved);
2804 trans->block_rsv = &root->fs_info->trans_block_rsv;
2980 BUG_ON(!root->fs_info->enospc_unlink); 2805 BUG_ON(!root->fs_info->enospc_unlink);
2981 root->fs_info->enospc_unlink = 0; 2806 root->fs_info->enospc_unlink = 0;
2982 } 2807 }
@@ -3368,6 +3193,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3368 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3193 pgoff_t index = from >> PAGE_CACHE_SHIFT;
3369 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3194 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3370 struct page *page; 3195 struct page *page;
3196 gfp_t mask = btrfs_alloc_write_mask(mapping);
3371 int ret = 0; 3197 int ret = 0;
3372 u64 page_start; 3198 u64 page_start;
3373 u64 page_end; 3199 u64 page_end;
@@ -3380,7 +3206,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3380 3206
3381 ret = -ENOMEM; 3207 ret = -ENOMEM;
3382again: 3208again:
3383 page = find_or_create_page(mapping, index, GFP_NOFS); 3209 page = find_or_create_page(mapping, index, mask);
3384 if (!page) { 3210 if (!page) {
3385 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3211 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3386 goto out; 3212 goto out;
@@ -3613,6 +3439,8 @@ void btrfs_evict_inode(struct inode *inode)
3613{ 3439{
3614 struct btrfs_trans_handle *trans; 3440 struct btrfs_trans_handle *trans;
3615 struct btrfs_root *root = BTRFS_I(inode)->root; 3441 struct btrfs_root *root = BTRFS_I(inode)->root;
3442 struct btrfs_block_rsv *rsv, *global_rsv;
3443 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3616 unsigned long nr; 3444 unsigned long nr;
3617 int ret; 3445 int ret;
3618 3446
@@ -3640,22 +3468,55 @@ void btrfs_evict_inode(struct inode *inode)
3640 goto no_delete; 3468 goto no_delete;
3641 } 3469 }
3642 3470
3471 rsv = btrfs_alloc_block_rsv(root);
3472 if (!rsv) {
3473 btrfs_orphan_del(NULL, inode);
3474 goto no_delete;
3475 }
3476 rsv->size = min_size;
3477 global_rsv = &root->fs_info->global_block_rsv;
3478
3643 btrfs_i_size_write(inode, 0); 3479 btrfs_i_size_write(inode, 0);
3644 3480
3481 /*
3482 * This is a bit simpler than btrfs_truncate since
3483 *
3484 * 1) We've already reserved our space for our orphan item in the
3485 * unlink.
3486 * 2) We're going to delete the inode item, so we don't need to update
3487 * it at all.
3488 *
3489 * So we just need to reserve some slack space in case we add bytes when
3490 * doing the truncate.
3491 */
3645 while (1) { 3492 while (1) {
3646 trans = btrfs_join_transaction(root); 3493 ret = btrfs_block_rsv_refill(root, rsv, min_size);
3647 BUG_ON(IS_ERR(trans)); 3494
3648 trans->block_rsv = root->orphan_block_rsv; 3495 /*
3496 * Try and steal from the global reserve since we will
3497 * likely not use this space anyway, we want to try as
3498 * hard as possible to get this to work.
3499 */
3500 if (ret)
3501 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3649 3502
3650 ret = btrfs_block_rsv_check(trans, root,
3651 root->orphan_block_rsv, 0, 5);
3652 if (ret) { 3503 if (ret) {
3653 BUG_ON(ret != -EAGAIN); 3504 printk(KERN_WARNING "Could not get space for a "
3654 ret = btrfs_commit_transaction(trans, root); 3505 "delete, will truncate on mount %d\n", ret);
3655 BUG_ON(ret); 3506 btrfs_orphan_del(NULL, inode);
3656 continue; 3507 btrfs_free_block_rsv(root, rsv);
3508 goto no_delete;
3509 }
3510
3511 trans = btrfs_start_transaction(root, 0);
3512 if (IS_ERR(trans)) {
3513 btrfs_orphan_del(NULL, inode);
3514 btrfs_free_block_rsv(root, rsv);
3515 goto no_delete;
3657 } 3516 }
3658 3517
3518 trans->block_rsv = rsv;
3519
3659 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3520 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3660 if (ret != -EAGAIN) 3521 if (ret != -EAGAIN)
3661 break; 3522 break;
@@ -3664,14 +3525,17 @@ void btrfs_evict_inode(struct inode *inode)
3664 btrfs_end_transaction(trans, root); 3525 btrfs_end_transaction(trans, root);
3665 trans = NULL; 3526 trans = NULL;
3666 btrfs_btree_balance_dirty(root, nr); 3527 btrfs_btree_balance_dirty(root, nr);
3667
3668 } 3528 }
3669 3529
3530 btrfs_free_block_rsv(root, rsv);
3531
3670 if (ret == 0) { 3532 if (ret == 0) {
3533 trans->block_rsv = root->orphan_block_rsv;
3671 ret = btrfs_orphan_del(trans, inode); 3534 ret = btrfs_orphan_del(trans, inode);
3672 BUG_ON(ret); 3535 BUG_ON(ret);
3673 } 3536 }
3674 3537
3538 trans->block_rsv = &root->fs_info->trans_block_rsv;
3675 if (!(root == root->fs_info->tree_root || 3539 if (!(root == root->fs_info->tree_root ||
3676 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3540 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3677 btrfs_return_ino(root, btrfs_ino(inode)); 3541 btrfs_return_ino(root, btrfs_ino(inode));
@@ -5795,8 +5659,7 @@ again:
5795 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5659 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5796 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5660 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5797 if (!ret) 5661 if (!ret)
5798 ret = btrfs_update_inode(trans, root, inode); 5662 err = btrfs_update_inode_fallback(trans, root, inode);
5799 err = ret;
5800 goto out; 5663 goto out;
5801 } 5664 }
5802 5665
@@ -5834,7 +5697,7 @@ again:
5834 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5697 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5835 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5698 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5836 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) 5699 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5837 btrfs_update_inode(trans, root, inode); 5700 btrfs_update_inode_fallback(trans, root, inode);
5838 ret = 0; 5701 ret = 0;
5839out_unlock: 5702out_unlock:
5840 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5703 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
@@ -6289,7 +6152,7 @@ int btrfs_readpage(struct file *file, struct page *page)
6289{ 6152{
6290 struct extent_io_tree *tree; 6153 struct extent_io_tree *tree;
6291 tree = &BTRFS_I(page->mapping->host)->io_tree; 6154 tree = &BTRFS_I(page->mapping->host)->io_tree;
6292 return extent_read_full_page(tree, page, btrfs_get_extent); 6155 return extent_read_full_page(tree, page, btrfs_get_extent, 0);
6293} 6156}
6294 6157
6295static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6158static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -6541,6 +6404,7 @@ static int btrfs_truncate(struct inode *inode)
6541 struct btrfs_trans_handle *trans; 6404 struct btrfs_trans_handle *trans;
6542 unsigned long nr; 6405 unsigned long nr;
6543 u64 mask = root->sectorsize - 1; 6406 u64 mask = root->sectorsize - 1;
6407 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6544 6408
6545 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6409 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6546 if (ret) 6410 if (ret)
@@ -6588,19 +6452,23 @@ static int btrfs_truncate(struct inode *inode)
6588 rsv = btrfs_alloc_block_rsv(root); 6452 rsv = btrfs_alloc_block_rsv(root);
6589 if (!rsv) 6453 if (!rsv)
6590 return -ENOMEM; 6454 return -ENOMEM;
6591 btrfs_add_durable_block_rsv(root->fs_info, rsv); 6455 rsv->size = min_size;
6592 6456
6457 /*
6458 * 1 for the truncate slack space
6459 * 1 for the orphan item we're going to add
6460 * 1 for the orphan item deletion
6461 * 1 for updating the inode.
6462 */
6593 trans = btrfs_start_transaction(root, 4); 6463 trans = btrfs_start_transaction(root, 4);
6594 if (IS_ERR(trans)) { 6464 if (IS_ERR(trans)) {
6595 err = PTR_ERR(trans); 6465 err = PTR_ERR(trans);
6596 goto out; 6466 goto out;
6597 } 6467 }
6598 6468
6599 /* 6469 /* Migrate the slack space for the truncate to our reserve */
6600 * Reserve space for the truncate process. Truncate should be adding 6470 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6601 * space, but if there are snapshots it may end up using space. 6471 min_size);
6602 */
6603 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6604 BUG_ON(ret); 6472 BUG_ON(ret);
6605 6473
6606 ret = btrfs_orphan_add(trans, inode); 6474 ret = btrfs_orphan_add(trans, inode);
@@ -6609,21 +6477,6 @@ static int btrfs_truncate(struct inode *inode)
6609 goto out; 6477 goto out;
6610 } 6478 }
6611 6479
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /*
6617 * Ok so we've already migrated our bytes over for the truncate, so here
6618 * just reserve the one slot we need for updating the inode.
6619 */
6620 trans = btrfs_start_transaction(root, 1);
6621 if (IS_ERR(trans)) {
6622 err = PTR_ERR(trans);
6623 goto out;
6624 }
6625 trans->block_rsv = rsv;
6626
6627 /* 6480 /*
6628 * setattr is responsible for setting the ordered_data_close flag, 6481 * setattr is responsible for setting the ordered_data_close flag,
6629 * but that is only tested during the last file release. That 6482 * but that is only tested during the last file release. That
@@ -6645,20 +6498,30 @@ static int btrfs_truncate(struct inode *inode)
6645 btrfs_add_ordered_operation(trans, root, inode); 6498 btrfs_add_ordered_operation(trans, root, inode);
6646 6499
6647 while (1) { 6500 while (1) {
6501 ret = btrfs_block_rsv_refill(root, rsv, min_size);
6502 if (ret) {
6503 /*
6504 * This can only happen with the original transaction we
 6505 * started above; every other time we shouldn't have a
6506 * transaction started yet.
6507 */
6508 if (ret == -EAGAIN)
6509 goto end_trans;
6510 err = ret;
6511 break;
6512 }
6513
6648 if (!trans) { 6514 if (!trans) {
6649 trans = btrfs_start_transaction(root, 3); 6515 /* Just need the 1 for updating the inode */
6516 trans = btrfs_start_transaction(root, 1);
6650 if (IS_ERR(trans)) { 6517 if (IS_ERR(trans)) {
6651 err = PTR_ERR(trans); 6518 err = PTR_ERR(trans);
6652 goto out; 6519 goto out;
6653 } 6520 }
6654
6655 ret = btrfs_truncate_reserve_metadata(trans, root,
6656 rsv);
6657 BUG_ON(ret);
6658
6659 trans->block_rsv = rsv;
6660 } 6521 }
6661 6522
6523 trans->block_rsv = rsv;
6524
6662 ret = btrfs_truncate_inode_items(trans, root, inode, 6525 ret = btrfs_truncate_inode_items(trans, root, inode,
6663 inode->i_size, 6526 inode->i_size,
6664 BTRFS_EXTENT_DATA_KEY); 6527 BTRFS_EXTENT_DATA_KEY);
@@ -6673,7 +6536,7 @@ static int btrfs_truncate(struct inode *inode)
6673 err = ret; 6536 err = ret;
6674 break; 6537 break;
6675 } 6538 }
6676 6539end_trans:
6677 nr = trans->blocks_used; 6540 nr = trans->blocks_used;
6678 btrfs_end_transaction(trans, root); 6541 btrfs_end_transaction(trans, root);
6679 trans = NULL; 6542 trans = NULL;
@@ -6693,14 +6556,16 @@ static int btrfs_truncate(struct inode *inode)
6693 ret = btrfs_orphan_del(NULL, inode); 6556 ret = btrfs_orphan_del(NULL, inode);
6694 } 6557 }
6695 6558
6696 trans->block_rsv = &root->fs_info->trans_block_rsv; 6559 if (trans) {
6697 ret = btrfs_update_inode(trans, root, inode); 6560 trans->block_rsv = &root->fs_info->trans_block_rsv;
6698 if (ret && !err) 6561 ret = btrfs_update_inode(trans, root, inode);
6699 err = ret; 6562 if (ret && !err)
6563 err = ret;
6700 6564
6701 nr = trans->blocks_used; 6565 nr = trans->blocks_used;
6702 ret = btrfs_end_transaction_throttle(trans, root); 6566 ret = btrfs_end_transaction_throttle(trans, root);
6703 btrfs_btree_balance_dirty(root, nr); 6567 btrfs_btree_balance_dirty(root, nr);
6568 }
6704 6569
6705out: 6570out:
6706 btrfs_free_block_rsv(root, rsv); 6571 btrfs_free_block_rsv(root, rsv);
@@ -6755,9 +6620,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6755 ei->last_sub_trans = 0; 6620 ei->last_sub_trans = 0;
6756 ei->logged_trans = 0; 6621 ei->logged_trans = 0;
6757 ei->delalloc_bytes = 0; 6622 ei->delalloc_bytes = 0;
6758 ei->reserved_bytes = 0;
6759 ei->disk_i_size = 0; 6623 ei->disk_i_size = 0;
6760 ei->flags = 0; 6624 ei->flags = 0;
6625 ei->csum_bytes = 0;
6761 ei->index_cnt = (u64)-1; 6626 ei->index_cnt = (u64)-1;
6762 ei->last_unlink_trans = 0; 6627 ei->last_unlink_trans = 0;
6763 6628
@@ -6769,6 +6634,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6769 ei->orphan_meta_reserved = 0; 6634 ei->orphan_meta_reserved = 0;
6770 ei->dummy_inode = 0; 6635 ei->dummy_inode = 0;
6771 ei->in_defrag = 0; 6636 ei->in_defrag = 0;
6637 ei->delalloc_meta_reserved = 0;
6772 ei->force_compress = BTRFS_COMPRESS_NONE; 6638 ei->force_compress = BTRFS_COMPRESS_NONE;
6773 6639
6774 ei->delayed_node = NULL; 6640 ei->delayed_node = NULL;
@@ -6803,6 +6669,8 @@ void btrfs_destroy_inode(struct inode *inode)
6803 WARN_ON(inode->i_data.nrpages); 6669 WARN_ON(inode->i_data.nrpages);
6804 WARN_ON(BTRFS_I(inode)->outstanding_extents); 6670 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6805 WARN_ON(BTRFS_I(inode)->reserved_extents); 6671 WARN_ON(BTRFS_I(inode)->reserved_extents);
6672 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
6673 WARN_ON(BTRFS_I(inode)->csum_bytes);
6806 6674
6807 /* 6675 /*
6808 * This can happen where we create an inode, but somebody else also 6676 * This can happen where we create an inode, but somebody else also
@@ -6926,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt,
6926 struct dentry *dentry, struct kstat *stat) 6794 struct dentry *dentry, struct kstat *stat)
6927{ 6795{
6928 struct inode *inode = dentry->d_inode; 6796 struct inode *inode = dentry->d_inode;
6797 u32 blocksize = inode->i_sb->s_blocksize;
6798
6929 generic_fillattr(inode, stat); 6799 generic_fillattr(inode, stat);
6930 stat->dev = BTRFS_I(inode)->root->anon_dev; 6800 stat->dev = BTRFS_I(inode)->root->anon_dev;
6931 stat->blksize = PAGE_CACHE_SIZE; 6801 stat->blksize = PAGE_CACHE_SIZE;
6932 stat->blocks = (inode_get_bytes(inode) + 6802 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
6933 BTRFS_I(inode)->delalloc_bytes) >> 9; 6803 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
6934 return 0; 6804 return 0;
6935} 6805}
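/*
 * Worked example of the rounding above (illustrative only, not part of
 * the patch; a 4K block size is assumed): an inode with 100 bytes on
 * disk and 100 bytes of delalloc used to report (100 + 100) >> 9 = 0
 * blocks in stat(); with ALIGN() it reports (4096 + 4096) >> 9 = 16
 * 512-byte blocks, so tools that read st_blocks (e.g. du) no longer
 * see 0 for small files with only dirty, not-yet-written data.
 */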
6936 6806
@@ -7420,7 +7290,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
7420 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 7290 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
7421 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 7291 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
7422 .writepage_start_hook = btrfs_writepage_start_hook, 7292 .writepage_start_hook = btrfs_writepage_start_hook,
7423 .readpage_io_failed_hook = btrfs_io_failed_hook,
7424 .set_bit_hook = btrfs_set_bit_hook, 7293 .set_bit_hook = btrfs_set_bit_hook,
7425 .clear_bit_hook = btrfs_clear_bit_hook, 7294 .clear_bit_hook = btrfs_clear_bit_hook,
7426 .merge_extent_hook = btrfs_merge_extent_hook, 7295 .merge_extent_hook = btrfs_merge_extent_hook,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dae5dfe41ba5..a90e749ed6d2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -51,6 +51,7 @@
51#include "volumes.h" 51#include "volumes.h"
52#include "locking.h" 52#include "locking.h"
53#include "inode-map.h" 53#include "inode-map.h"
54#include "backref.h"
54 55
55/* Mask out flags that are inappropriate for the given type of inode. */ 56/* Mask out flags that are inappropriate for the given type of inode. */
56static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 57static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
117/* 118/*
118 * Inherit flags from the parent inode. 119 * Inherit flags from the parent inode.
119 * 120 *
120 * Unlike extN we don't have any flags we don't want to inherit currently. 121 * Currently only the compression flags and the cow flags are inherited.
121 */ 122 */
122void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 123void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
123{ 124{
@@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
128 129
129 flags = BTRFS_I(dir)->flags; 130 flags = BTRFS_I(dir)->flags;
130 131
131 if (S_ISREG(inode->i_mode)) 132 if (flags & BTRFS_INODE_NOCOMPRESS) {
132 flags &= ~BTRFS_INODE_DIRSYNC; 133 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
133 else if (!S_ISDIR(inode->i_mode)) 134 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
134 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); 135 } else if (flags & BTRFS_INODE_COMPRESS) {
136 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
137 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
138 }
139
140 if (flags & BTRFS_INODE_NODATACOW)
141 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
135 142
136 BTRFS_I(inode)->flags = flags;
137 btrfs_update_iflags(inode); 143 btrfs_update_iflags(inode);
138} 144}
139 145
@@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
277 struct fstrim_range range; 283 struct fstrim_range range;
278 u64 minlen = ULLONG_MAX; 284 u64 minlen = ULLONG_MAX;
279 u64 num_devices = 0; 285 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
280 int ret; 287 int ret;
281 288
282 if (!capable(CAP_SYS_ADMIN)) 289 if (!capable(CAP_SYS_ADMIN))
@@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
295 } 302 }
296 } 303 }
297 rcu_read_unlock(); 304 rcu_read_unlock();
305
298 if (!num_devices) 306 if (!num_devices)
299 return -EOPNOTSUPP; 307 return -EOPNOTSUPP;
300
301 if (copy_from_user(&range, arg, sizeof(range))) 308 if (copy_from_user(&range, arg, sizeof(range)))
302 return -EFAULT; 309 return -EFAULT;
310 if (range.start > total_bytes)
311 return -EINVAL;
303 312
313 range.len = min(range.len, total_bytes - range.start);
304 range.minlen = max(range.minlen, minlen); 314 range.minlen = max(range.minlen, minlen);
305 ret = btrfs_trim_fs(root, &range); 315 ret = btrfs_trim_fs(root, &range);
306 if (ret < 0) 316 if (ret < 0)
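/*
 * Illustrative arithmetic for the clamping above (not part of the
 * patch): with total_bytes = 100 GiB, a FITRIM request of
 * start = 90 GiB and len = 20 GiB is reduced to
 * len = min(20, 100 - 90) = 10 GiB, while a request whose start lies
 * beyond 100 GiB now fails with -EINVAL instead of being passed down
 * to btrfs_trim_fs().
 */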
@@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
760 int ret = 1; 770 int ret = 1;
761 771
762 /* 772 /*
763 * make sure that once we start defragging and extent, we keep on 773 * make sure that once we start defragging an extent, we keep on
764 * defragging it 774 * defragging it
765 */ 775 */
766 if (start < *defrag_end) 776 if (start < *defrag_end)
@@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
805 * extent will force at least part of that big extent to be defragged. 815 * extent will force at least part of that big extent to be defragged.
806 */ 816 */
807 if (ret) { 817 if (ret) {
808 *last_len += len;
809 *defrag_end = extent_map_end(em); 818 *defrag_end = extent_map_end(em);
810 } else { 819 } else {
811 *last_len = 0; 820 *last_len = 0;
@@ -843,6 +852,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
843 int i_done; 852 int i_done;
844 struct btrfs_ordered_extent *ordered; 853 struct btrfs_ordered_extent *ordered;
845 struct extent_state *cached_state = NULL; 854 struct extent_state *cached_state = NULL;
855 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
846 856
847 if (isize == 0) 857 if (isize == 0)
848 return 0; 858 return 0;
@@ -860,7 +870,7 @@ again:
860 for (i = 0; i < num_pages; i++) { 870 for (i = 0; i < num_pages; i++) {
861 struct page *page; 871 struct page *page;
862 page = find_or_create_page(inode->i_mapping, 872 page = find_or_create_page(inode->i_mapping,
863 start_index + i, GFP_NOFS); 873 start_index + i, mask);
864 if (!page) 874 if (!page)
865 break; 875 break;
866 876
@@ -972,18 +982,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
972 struct btrfs_super_block *disk_super; 982 struct btrfs_super_block *disk_super;
973 struct file_ra_state *ra = NULL; 983 struct file_ra_state *ra = NULL;
974 unsigned long last_index; 984 unsigned long last_index;
985 u64 isize = i_size_read(inode);
975 u64 features; 986 u64 features;
976 u64 last_len = 0; 987 u64 last_len = 0;
977 u64 skip = 0; 988 u64 skip = 0;
978 u64 defrag_end = 0; 989 u64 defrag_end = 0;
979 u64 newer_off = range->start; 990 u64 newer_off = range->start;
980 int newer_left = 0;
981 unsigned long i; 991 unsigned long i;
992 unsigned long ra_index = 0;
982 int ret; 993 int ret;
983 int defrag_count = 0; 994 int defrag_count = 0;
984 int compress_type = BTRFS_COMPRESS_ZLIB; 995 int compress_type = BTRFS_COMPRESS_ZLIB;
985 int extent_thresh = range->extent_thresh; 996 int extent_thresh = range->extent_thresh;
986 int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 997 int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
998 int cluster = max_cluster;
987 u64 new_align = ~((u64)128 * 1024 - 1); 999 u64 new_align = ~((u64)128 * 1024 - 1);
988 struct page **pages = NULL; 1000 struct page **pages = NULL;
989 1001
@@ -997,7 +1009,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
997 compress_type = range->compress_type; 1009 compress_type = range->compress_type;
998 } 1010 }
999 1011
1000 if (inode->i_size == 0) 1012 if (isize == 0)
1001 return 0; 1013 return 0;
1002 1014
1003 /* 1015 /*
@@ -1013,7 +1025,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1013 ra = &file->f_ra; 1025 ra = &file->f_ra;
1014 } 1026 }
1015 1027
1016 pages = kmalloc(sizeof(struct page *) * newer_cluster, 1028 pages = kmalloc(sizeof(struct page *) * max_cluster,
1017 GFP_NOFS); 1029 GFP_NOFS);
1018 if (!pages) { 1030 if (!pages) {
1019 ret = -ENOMEM; 1031 ret = -ENOMEM;
@@ -1022,10 +1034,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1022 1034
1023 /* find the last page to defrag */ 1035 /* find the last page to defrag */
1024 if (range->start + range->len > range->start) { 1036 if (range->start + range->len > range->start) {
1025 last_index = min_t(u64, inode->i_size - 1, 1037 last_index = min_t(u64, isize - 1,
1026 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; 1038 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
1027 } else { 1039 } else {
1028 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1040 last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1029 } 1041 }
1030 1042
1031 if (newer_than) { 1043 if (newer_than) {
@@ -1038,14 +1050,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1038 * the extents in the file evenly spaced 1050 * the extents in the file evenly spaced
1039 */ 1051 */
1040 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1052 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1041 newer_left = newer_cluster;
1042 } else 1053 } else
1043 goto out_ra; 1054 goto out_ra;
1044 } else { 1055 } else {
1045 i = range->start >> PAGE_CACHE_SHIFT; 1056 i = range->start >> PAGE_CACHE_SHIFT;
1046 } 1057 }
1047 if (!max_to_defrag) 1058 if (!max_to_defrag)
1048 max_to_defrag = last_index - 1; 1059 max_to_defrag = last_index;
1049 1060
1050 /* 1061 /*
1051 * make writeback starts from i, so the defrag range can be 1062 * make writeback starts from i, so the defrag range can be
@@ -1079,18 +1090,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1079 i = max(i + 1, next); 1090 i = max(i + 1, next);
1080 continue; 1091 continue;
1081 } 1092 }
1093
1094 if (!newer_than) {
1095 cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
1096 PAGE_CACHE_SHIFT) - i;
1097 cluster = min(cluster, max_cluster);
1098 } else {
1099 cluster = max_cluster;
1100 }
1101
1082 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1102 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
1083 BTRFS_I(inode)->force_compress = compress_type; 1103 BTRFS_I(inode)->force_compress = compress_type;
1084 1104
1085 btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); 1105 if (i + cluster > ra_index) {
1106 ra_index = max(i, ra_index);
1107 btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
1108 cluster);
1109 ra_index += max_cluster;
1110 }
1086 1111
1087 ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); 1112 ret = cluster_pages_for_defrag(inode, pages, i, cluster);
1088 if (ret < 0) 1113 if (ret < 0)
1089 goto out_ra; 1114 goto out_ra;
1090 1115
1091 defrag_count += ret; 1116 defrag_count += ret;
1092 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1117 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
1093 i += ret;
1094 1118
1095 if (newer_than) { 1119 if (newer_than) {
1096 if (newer_off == (u64)-1) 1120 if (newer_off == (u64)-1)
@@ -1105,12 +1129,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1105 if (!ret) { 1129 if (!ret) {
1106 range->start = newer_off; 1130 range->start = newer_off;
1107 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1131 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1108 newer_left = newer_cluster;
1109 } else { 1132 } else {
1110 break; 1133 break;
1111 } 1134 }
1112 } else { 1135 } else {
1113 i++; 1136 if (ret > 0) {
1137 i += ret;
1138 last_len += ret << PAGE_CACHE_SHIFT;
1139 } else {
1140 i++;
1141 last_len = 0;
1142 }
1114 } 1143 }
1115 } 1144 }
1116 1145
@@ -1136,16 +1165,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1136 mutex_unlock(&inode->i_mutex); 1165 mutex_unlock(&inode->i_mutex);
1137 } 1166 }
1138 1167
1139 disk_super = &root->fs_info->super_copy; 1168 disk_super = root->fs_info->super_copy;
1140 features = btrfs_super_incompat_flags(disk_super); 1169 features = btrfs_super_incompat_flags(disk_super);
1141 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1170 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1142 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1171 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1143 btrfs_set_super_incompat_flags(disk_super, features); 1172 btrfs_set_super_incompat_flags(disk_super, features);
1144 } 1173 }
1145 1174
1146 if (!file) 1175 ret = defrag_count;
1147 kfree(ra);
1148 return defrag_count;
1149 1176
1150out_ra: 1177out_ra:
1151 if (!file) 1178 if (!file)
@@ -1189,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1189 *devstr = '\0'; 1216 *devstr = '\0';
1190 devstr = vol_args->name; 1217 devstr = vol_args->name;
1191 devid = simple_strtoull(devstr, &end, 10); 1218 devid = simple_strtoull(devstr, &end, 10);
1192 printk(KERN_INFO "resizing devid %llu\n", 1219 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1193 (unsigned long long)devid); 1220 (unsigned long long)devid);
1194 } 1221 }
1195 device = btrfs_find_device(root, devid, NULL, NULL); 1222 device = btrfs_find_device(root, devid, NULL, NULL);
1196 if (!device) { 1223 if (!device) {
1197 printk(KERN_INFO "resizer unable to find device %llu\n", 1224 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1198 (unsigned long long)devid); 1225 (unsigned long long)devid);
1199 ret = -EINVAL; 1226 ret = -EINVAL;
1200 goto out_unlock; 1227 goto out_unlock;
@@ -1240,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1240 do_div(new_size, root->sectorsize); 1267 do_div(new_size, root->sectorsize);
1241 new_size *= root->sectorsize; 1268 new_size *= root->sectorsize;
1242 1269
1243 printk(KERN_INFO "new size for %s is %llu\n", 1270 printk(KERN_INFO "btrfs: new size for %s is %llu\n",
1244 device->name, (unsigned long long)new_size); 1271 device->name, (unsigned long long)new_size);
1245 1272
1246 if (new_size > old_size) { 1273 if (new_size > old_size) {
@@ -2587,7 +2614,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2587 return PTR_ERR(trans); 2614 return PTR_ERR(trans);
2588 } 2615 }
2589 2616
2590 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2617 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
2591 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 2618 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
2592 dir_id, "default", 7, 1); 2619 dir_id, "default", 7, 1);
2593 if (IS_ERR_OR_NULL(di)) { 2620 if (IS_ERR_OR_NULL(di)) {
@@ -2603,7 +2630,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2603 btrfs_mark_buffer_dirty(path->nodes[0]); 2630 btrfs_mark_buffer_dirty(path->nodes[0]);
2604 btrfs_free_path(path); 2631 btrfs_free_path(path);
2605 2632
2606 disk_super = &root->fs_info->super_copy; 2633 disk_super = root->fs_info->super_copy;
2607 features = btrfs_super_incompat_flags(disk_super); 2634 features = btrfs_super_incompat_flags(disk_super);
2608 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { 2635 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2609 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; 2636 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@ -2864,6 +2891,147 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
2864 return ret; 2891 return ret;
2865} 2892}
2866 2893
2894static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2895{
2896 int ret = 0;
2897 int i;
2898 u64 rel_ptr;
2899 int size;
2900 struct btrfs_ioctl_ino_path_args *ipa = NULL;
2901 struct inode_fs_paths *ipath = NULL;
2902 struct btrfs_path *path;
2903
2904 if (!capable(CAP_SYS_ADMIN))
2905 return -EPERM;
2906
2907 path = btrfs_alloc_path();
2908 if (!path) {
2909 ret = -ENOMEM;
2910 goto out;
2911 }
2912
2913 ipa = memdup_user(arg, sizeof(*ipa));
2914 if (IS_ERR(ipa)) {
2915 ret = PTR_ERR(ipa);
2916 ipa = NULL;
2917 goto out;
2918 }
2919
2920 size = min_t(u32, ipa->size, 4096);
2921 ipath = init_ipath(size, root, path);
2922 if (IS_ERR(ipath)) {
2923 ret = PTR_ERR(ipath);
2924 ipath = NULL;
2925 goto out;
2926 }
2927
2928 ret = paths_from_inode(ipa->inum, ipath);
2929 if (ret < 0)
2930 goto out;
2931
2932 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2933 rel_ptr = ipath->fspath->val[i] -
2934 (u64)(unsigned long)ipath->fspath->val;
2935 ipath->fspath->val[i] = rel_ptr;
2936 }
2937
2938 ret = copy_to_user((void *)(unsigned long)ipa->fspath,
2939 (void *)(unsigned long)ipath->fspath, size);
2940 if (ret) {
2941 ret = -EFAULT;
2942 goto out;
2943 }
2944
2945out:
2946 btrfs_free_path(path);
2947 free_ipath(ipath);
2948 kfree(ipa);
2949
2950 return ret;
2951}
2952
2953static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
2954{
2955 struct btrfs_data_container *inodes = ctx;
2956 const size_t c = 3 * sizeof(u64);
2957
2958 if (inodes->bytes_left >= c) {
2959 inodes->bytes_left -= c;
2960 inodes->val[inodes->elem_cnt] = inum;
2961 inodes->val[inodes->elem_cnt + 1] = offset;
2962 inodes->val[inodes->elem_cnt + 2] = root;
2963 inodes->elem_cnt += 3;
2964 } else {
2965 inodes->bytes_missing += c - inodes->bytes_left;
2966 inodes->bytes_left = 0;
2967 inodes->elem_missed += 3;
2968 }
2969
2970 return 0;
2971}
2972
2973static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2974 void __user *arg)
2975{
2976 int ret = 0;
2977 int size;
2978 u64 extent_offset;
2979 struct btrfs_ioctl_logical_ino_args *loi;
2980 struct btrfs_data_container *inodes = NULL;
2981 struct btrfs_path *path = NULL;
2982 struct btrfs_key key;
2983
2984 if (!capable(CAP_SYS_ADMIN))
2985 return -EPERM;
2986
2987 loi = memdup_user(arg, sizeof(*loi));
2988 if (IS_ERR(loi)) {
2989 ret = PTR_ERR(loi);
2990 loi = NULL;
2991 goto out;
2992 }
2993
2994 path = btrfs_alloc_path();
2995 if (!path) {
2996 ret = -ENOMEM;
2997 goto out;
2998 }
2999
3000 size = min_t(u32, loi->size, 4096);
3001 inodes = init_data_container(size);
3002 if (IS_ERR(inodes)) {
3003 ret = PTR_ERR(inodes);
3004 inodes = NULL;
3005 goto out;
3006 }
3007
3008 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3009
3010 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3011 ret = -ENOENT;
3012 if (ret < 0)
3013 goto out;
3014
3015 extent_offset = loi->logical - key.objectid;
3016 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3017 extent_offset, build_ino_list, inodes);
3018
3019 if (ret < 0)
3020 goto out;
3021
3022 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3023 (void *)(unsigned long)inodes, size);
3024 if (ret)
3025 ret = -EFAULT;
3026
3027out:
3028 btrfs_free_path(path);
3029 kfree(inodes);
3030 kfree(loi);
3031
3032 return ret;
3033}
3034
2867long btrfs_ioctl(struct file *file, unsigned int 3035long btrfs_ioctl(struct file *file, unsigned int
2868 cmd, unsigned long arg) 3036 cmd, unsigned long arg)
2869{ 3037{
@@ -2921,6 +3089,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2921 return btrfs_ioctl_tree_search(file, argp); 3089 return btrfs_ioctl_tree_search(file, argp);
2922 case BTRFS_IOC_INO_LOOKUP: 3090 case BTRFS_IOC_INO_LOOKUP:
2923 return btrfs_ioctl_ino_lookup(file, argp); 3091 return btrfs_ioctl_ino_lookup(file, argp);
3092 case BTRFS_IOC_INO_PATHS:
3093 return btrfs_ioctl_ino_to_path(root, argp);
3094 case BTRFS_IOC_LOGICAL_INO:
3095 return btrfs_ioctl_logical_to_ino(root, argp);
2924 case BTRFS_IOC_SPACE_INFO: 3096 case BTRFS_IOC_SPACE_INFO:
2925 return btrfs_ioctl_space_info(root, argp); 3097 return btrfs_ioctl_space_info(root, argp);
2926 case BTRFS_IOC_SYNC: 3098 case BTRFS_IOC_SYNC:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ad1ea789fcb4..252ae9915de8 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args {
193 struct btrfs_ioctl_space_info spaces[0]; 193 struct btrfs_ioctl_space_info spaces[0];
194}; 194};
195 195
196struct btrfs_data_container {
197 __u32 bytes_left; /* out -- bytes not needed to deliver output */
198 __u32 bytes_missing; /* out -- additional bytes needed for result */
199 __u32 elem_cnt; /* out */
200 __u32 elem_missed; /* out */
201 __u64 val[0]; /* out */
202};
203
204struct btrfs_ioctl_ino_path_args {
205 __u64 inum; /* in */
206 __u32 size; /* in */
207 __u64 reserved[4];
208 /* struct btrfs_data_container *fspath; out */
209 __u64 fspath; /* out */
210};
211
212struct btrfs_ioctl_logical_ino_args {
213 __u64 logical; /* in */
214 __u32 size; /* in */
215 __u64 reserved[4];
216 /* struct btrfs_data_container *inodes; out */
217 __u64 inodes;
218};
219
196#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 220#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
197 struct btrfs_ioctl_vol_args) 221 struct btrfs_ioctl_vol_args)
198#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 222#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args {
248 struct btrfs_ioctl_dev_info_args) 272 struct btrfs_ioctl_dev_info_args)
249#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
250 struct btrfs_ioctl_fs_info_args) 274 struct btrfs_ioctl_fs_info_args)
275#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
276 struct btrfs_ioctl_ino_path_args)
277#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
278 struct btrfs_ioctl_ino_path_args)
279
251#endif 280#endif
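For illustration, a minimal user-space sketch of driving the new BTRFS_IOC_INO_PATHS ioctl follows. It is not part of the patch; it assumes the struct and ioctl definitions above are visible to the program (e.g. copied from this header), and relies on the behaviour of btrfs_ioctl_ino_to_path() shown earlier: fspath carries a pointer to a caller-allocated btrfs_data_container, and on return each val[] entry is a byte offset relative to the start of the val[] array.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include "ioctl.h"	/* the btrfs ioctl structs and numbers shown above */

/* print every path of inode 'inum' on the btrfs filesystem mounted at 'mnt' */
static int print_ino_paths(const char *mnt, __u64 inum)
{
	struct btrfs_ioctl_ino_path_args ipa;
	struct btrfs_data_container *fspath;
	int fd, ret;
	__u32 i;

	fspath = calloc(1, 4096);
	if (!fspath)
		return -1;

	fd = open(mnt, O_RDONLY);
	if (fd < 0) {
		free(fspath);
		return -1;
	}

	memset(&ipa, 0, sizeof(ipa));
	ipa.inum = inum;
	ipa.size = 4096;
	ipa.fspath = (__u64)(unsigned long)fspath;

	ret = ioctl(fd, BTRFS_IOC_INO_PATHS, &ipa);
	if (ret == 0) {
		/* each val[i] is an offset from the start of val[] */
		for (i = 0; i < fspath->elem_cnt; i++)
			printf("%s\n",
			       (char *)fspath->val + fspath->val[i]);
		if (fspath->bytes_missing)
			fprintf(stderr, "buffer too small, need %u more bytes\n",
				fspath->bytes_missing);
	}

	close(fd);
	free(fspath);
	return ret;
}

BTRFS_IOC_LOGICAL_INO is driven the same way with struct btrfs_ioctl_logical_ino_args; its container's val[] holds (inode number, extent offset, root) triples, three __u64 per element, as packed by build_ino_list() above.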
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e9..f38e452486b8 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
159{ 159{
160 int i; 160 int i;
161 u32 type; 161 u32 type, nr;
162 u32 nr = btrfs_header_nritems(l);
163 struct btrfs_item *item; 162 struct btrfs_item *item;
164 struct btrfs_root_item *ri; 163 struct btrfs_root_item *ri;
165 struct btrfs_dir_item *di; 164 struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
172 struct btrfs_key key; 171 struct btrfs_key key;
173 struct btrfs_key found_key; 172 struct btrfs_key found_key;
174 173
174 if (!l)
175 return;
176
177 nr = btrfs_header_nritems(l);
178
175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 179 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
176 (unsigned long long)btrfs_header_bytenr(l), nr, 180 (unsigned long long)btrfs_header_bytenr(l), nr,
177 btrfs_leaf_free_space(root, l)); 181 btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 000000000000..2373b39a132b
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,951 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include "ctree.h"
27#include "volumes.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31#undef DEBUG
32
33/*
34 * This is the implementation for the generic read ahead framework.
35 *
36 * To trigger a readahead, btrfs_reada_add must be called. It will start
37 * a read ahead for the given range [start, end) on tree root. The returned
38 * handle can either be used to wait on the readahead to finish
39 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
40 *
41 * The read ahead works as follows:
42 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
43 * reada_start_machine will then search for extents to prefetch and trigger
44 * some reads. When a read finishes for a node, all contained node/leaf
45 * pointers that lie in the given range will also be enqueued. The reads will
46 * be triggered in sequential order, thus giving a big win over a naive
47 * enumeration. It will also make use of multi-device layouts. Each disk
 48 * will have its own read pointer and all disks will be utilized in parallel.
 49 * Also, no two disks will read both sides of a mirror simultaneously, as this
50 * would waste seeking capacity. Instead both disks will read different parts
51 * of the filesystem.
52 * Any number of readaheads can be started in parallel. The read order will be
53 * determined globally, i.e. 2 parallel readaheads will normally finish faster
54 * than the 2 started one after another.
55 */
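/*
 * Rough usage sketch for the framework described above (illustrative
 * only, not called from this file; the prototypes are declared outside
 * this excerpt and are assumed here to match the description given:
 * btrfs_reada_add() takes the tree root plus a key range and returns a
 * handle that is later waited on or detached):
 *
 *	struct reada_control *rc;
 *	struct btrfs_key key_start = { 0 };
 *	struct btrfs_key key_end = { .objectid = (u64)-1, .type = (u8)-1,
 *				     .offset = (u64)-1 };
 *
 *	rc = btrfs_reada_add(root, &key_start, &key_end);
 *	if (!IS_ERR(rc))
 *		btrfs_reada_wait(rc);
 *
 * btrfs_reada_wait() blocks until the prefetch completes; calling
 * btrfs_reada_detach(rc) instead lets it continue in the background.
 */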
56
57#define MAX_MIRRORS 2
58#define MAX_IN_FLIGHT 6
59
60struct reada_extctl {
61 struct list_head list;
62 struct reada_control *rc;
63 u64 generation;
64};
65
66struct reada_extent {
67 u64 logical;
68 struct btrfs_key top;
69 u32 blocksize;
70 int err;
71 struct list_head extctl;
72 struct kref refcnt;
73 spinlock_t lock;
74 struct reada_zone *zones[MAX_MIRRORS];
75 int nzones;
76 struct btrfs_device *scheduled_for;
77};
78
79struct reada_zone {
80 u64 start;
81 u64 end;
82 u64 elems;
83 struct list_head list;
84 spinlock_t lock;
85 int locked;
86 struct btrfs_device *device;
87 struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
88 int ndevs;
89 struct kref refcnt;
90};
91
92struct reada_machine_work {
93 struct btrfs_work work;
94 struct btrfs_fs_info *fs_info;
95};
96
97static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
98static void reada_control_release(struct kref *kref);
99static void reada_zone_release(struct kref *kref);
100static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102
103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation);
105
106/* recurses */
107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
109 u64 start, int err)
110{
111 int level = 0;
112 int nritems;
113 int i;
114 u64 bytenr;
115 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121
122 if (eb)
123 level = btrfs_header_level(eb);
124
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 kref_get(&re->refcnt);
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock);
136 /*
137 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore
139 */
140 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock);
144
145 if (err == 0) {
146 nritems = level ? btrfs_header_nritems(eb) : 0;
147 generation = btrfs_header_generation(eb);
148 /*
149 * FIXME: currently we just set nritems to 0 if this is a leaf,
150 * effectively ignoring the content. As a next step we could
151 * trigger more readahead depending on the content, e.g.
152 * fetch the checksums for the extents in the leaf.
153 */
154 } else {
155 /*
156 * this is the error case, the extent buffer has not been
157 * read correctly. We won't access anything from it and
 158 * just clean up our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164
165 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec;
167 u64 n_gen;
168 struct btrfs_key key;
169 struct btrfs_key next_key;
170
171 btrfs_node_key_to_cpu(eb, &key, i);
172 if (i + 1 < nritems)
173 btrfs_node_key_to_cpu(eb, &next_key, i + 1);
174 else
175 next_key = re->top;
176 bytenr = btrfs_node_blockptr(eb, i);
177 n_gen = btrfs_node_ptr_generation(eb, i);
178
179 list_for_each_entry(rec, &list, list) {
180 struct reada_control *rc = rec->rc;
181
182 /*
183 * if the generation doesn't match, just ignore this
184 * extctl. This will probably cut off a branch from
185 * prefetch. Alternatively one could start a new (sub-)
186 * prefetch for this branch, starting again from root.
187 * FIXME: move the generation check out of this loop
188 */
189#ifdef DEBUG
190 if (rec->generation != generation) {
191 printk(KERN_DEBUG "generation mismatch for "
192 "(%llu,%d,%llu) %llu != %llu\n",
193 key.objectid, key.type, key.offset,
194 rec->generation, generation);
195 }
196#endif
197 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key,
201 level - 1, n_gen);
202 }
203 }
204 /*
205 * free extctl records
206 */
207 while (!list_empty(&list)) {
208 struct reada_control *rc;
209 struct reada_extctl *rec;
210
211 rec = list_first_entry(&list, struct reada_extctl, list);
212 list_del(&rec->list);
213 rc = rec->rc;
214 kfree(rec);
215
216 kref_get(&rc->refcnt);
217 if (atomic_dec_and_test(&rc->elems)) {
218 kref_put(&rc->refcnt, reada_control_release);
219 wake_up(&rc->wait);
220 }
221 kref_put(&rc->refcnt, reada_control_release);
222
223 reada_extent_put(fs_info, re); /* one ref for each entry */
224 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228
229 return 0;
230}
231
232/*
 233 * start is passed separately in case eb is NULL, which may be the case with
234 * failed I/O
235 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
237 u64 start, int err)
238{
239 int ret;
240
241 ret = __readahead_hook(root, eb, start, err);
242
243 reada_start_machine(root->fs_info);
244
245 return ret;
246}
247
248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
249 struct btrfs_device *dev, u64 logical,
250 struct btrfs_bio *bbio)
251{
252 int ret;
253 int looped = 0;
254 struct reada_zone *zone;
255 struct btrfs_block_group_cache *cache = NULL;
256 u64 start;
257 u64 end;
258 int i;
259
260again:
261 zone = NULL;
262 spin_lock(&fs_info->reada_lock);
263 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
264 logical >> PAGE_CACHE_SHIFT, 1);
265 if (ret == 1)
266 kref_get(&zone->refcnt);
267 spin_unlock(&fs_info->reada_lock);
268
269 if (ret == 1) {
270 if (logical >= zone->start && logical < zone->end)
271 return zone;
272 spin_lock(&fs_info->reada_lock);
273 kref_put(&zone->refcnt, reada_zone_release);
274 spin_unlock(&fs_info->reada_lock);
275 }
276
277 if (looped)
278 return NULL;
279
280 cache = btrfs_lookup_block_group(fs_info, logical);
281 if (!cache)
282 return NULL;
283
284 start = cache->key.objectid;
285 end = start + cache->key.offset - 1;
286 btrfs_put_block_group(cache);
287
288 zone = kzalloc(sizeof(*zone), GFP_NOFS);
289 if (!zone)
290 return NULL;
291
292 zone->start = start;
293 zone->end = end;
294 INIT_LIST_HEAD(&zone->list);
295 spin_lock_init(&zone->lock);
296 zone->locked = 0;
297 kref_init(&zone->refcnt);
298 zone->elems = 0;
299 zone->device = dev; /* our device always sits at index 0 */
300 for (i = 0; i < bbio->num_stripes; ++i) {
301 /* bounds have already been checked */
302 zone->devs[i] = bbio->stripes[i].dev;
303 }
304 zone->ndevs = bbio->num_stripes;
305
306 spin_lock(&fs_info->reada_lock);
307 ret = radix_tree_insert(&dev->reada_zones,
308 (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
309 zone);
310 spin_unlock(&fs_info->reada_lock);
311
312 if (ret) {
313 kfree(zone);
314 looped = 1;
315 goto again;
316 }
317
318 return zone;
319}
320
321static struct reada_extent *reada_find_extent(struct btrfs_root *root,
322 u64 logical,
323 struct btrfs_key *top, int level)
324{
325 int ret;
326 int looped = 0;
327 struct reada_extent *re = NULL;
328 struct btrfs_fs_info *fs_info = root->fs_info;
329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
330 struct btrfs_bio *bbio = NULL;
331 struct btrfs_device *dev;
332 u32 blocksize;
333 u64 length;
334 int nzones = 0;
335 int i;
336 unsigned long index = logical >> PAGE_CACHE_SHIFT;
337
338again:
339 spin_lock(&fs_info->reada_lock);
340 re = radix_tree_lookup(&fs_info->reada_tree, index);
341 if (re)
342 kref_get(&re->refcnt);
343 spin_unlock(&fs_info->reada_lock);
344
345 if (re || looped)
346 return re;
347
348 re = kzalloc(sizeof(*re), GFP_NOFS);
349 if (!re)
350 return NULL;
351
352 blocksize = btrfs_level_size(root, level);
353 re->logical = logical;
354 re->blocksize = blocksize;
355 re->top = *top;
356 INIT_LIST_HEAD(&re->extctl);
357 spin_lock_init(&re->lock);
358 kref_init(&re->refcnt);
359
360 /*
361 * map block
362 */
363 length = blocksize;
364 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
365 if (ret || !bbio || length < blocksize)
366 goto error;
367
368 if (bbio->num_stripes > MAX_MIRRORS) {
369 printk(KERN_ERR "btrfs readahead: more than %d copies not "
370 "supported", MAX_MIRRORS);
371 goto error;
372 }
373
374 for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
375 struct reada_zone *zone;
376
377 dev = bbio->stripes[nzones].dev;
378 zone = reada_find_zone(fs_info, dev, logical, bbio);
379 if (!zone)
380 break;
381
382 re->zones[nzones] = zone;
383 spin_lock(&zone->lock);
384 if (!zone->elems)
385 kref_get(&zone->refcnt);
386 ++zone->elems;
387 spin_unlock(&zone->lock);
388 spin_lock(&fs_info->reada_lock);
389 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock);
391 }
392 re->nzones = nzones;
393 if (nzones == 0) {
394 /* not a single zone found, error and out */
395 goto error;
396 }
397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 spin_lock(&fs_info->reada_lock);
400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
401 if (ret) {
402 spin_unlock(&fs_info->reada_lock);
403 if (ret != -ENOMEM) {
404 /* someone inserted the extent in the meantime */
405 looped = 1;
406 }
407 goto error;
408 }
409 for (i = 0; i < nzones; ++i) {
410 dev = bbio->stripes[i].dev;
411 ret = radix_tree_insert(&dev->reada_extents, index, re);
412 if (ret) {
413 while (--i >= 0) {
414 dev = bbio->stripes[i].dev;
415 BUG_ON(dev == NULL);
416 radix_tree_delete(&dev->reada_extents, index);
417 }
418 BUG_ON(fs_info == NULL);
419 radix_tree_delete(&fs_info->reada_tree, index);
420 spin_unlock(&fs_info->reada_lock);
421 goto error;
422 }
423 }
424 spin_unlock(&fs_info->reada_lock);
425
426 kfree(bbio);
427 return re;
428
429error:
430 while (nzones) {
431 struct reada_zone *zone;
432
433 --nzones;
434 zone = re->zones[nzones];
435 kref_get(&zone->refcnt);
436 spin_lock(&zone->lock);
437 --zone->elems;
438 if (zone->elems == 0) {
439 /*
440 * no fs_info->reada_lock needed, as this can't be
441 * the last ref
442 */
443 kref_put(&zone->refcnt, reada_zone_release);
444 }
445 spin_unlock(&zone->lock);
446
447 spin_lock(&fs_info->reada_lock);
448 kref_put(&zone->refcnt, reada_zone_release);
449 spin_unlock(&fs_info->reada_lock);
450 }
451 kfree(bbio);
452 kfree(re);
453 if (looped)
454 goto again;
455 return NULL;
456}
457
458static void reada_kref_dummy(struct kref *kr)
459{
460}
461
462static void reada_extent_put(struct btrfs_fs_info *fs_info,
463 struct reada_extent *re)
464{
465 int i;
466 unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
467
468 spin_lock(&fs_info->reada_lock);
469 if (!kref_put(&re->refcnt, reada_kref_dummy)) {
470 spin_unlock(&fs_info->reada_lock);
471 return;
472 }
473
474 radix_tree_delete(&fs_info->reada_tree, index);
475 for (i = 0; i < re->nzones; ++i) {
476 struct reada_zone *zone = re->zones[i];
477
478 radix_tree_delete(&zone->device->reada_extents, index);
479 }
480
481 spin_unlock(&fs_info->reada_lock);
482
483 for (i = 0; i < re->nzones; ++i) {
484 struct reada_zone *zone = re->zones[i];
485
486 kref_get(&zone->refcnt);
487 spin_lock(&zone->lock);
488 --zone->elems;
489 if (zone->elems == 0) {
490 /* no fs_info->reada_lock needed, as this can't be
491 * the last ref */
492 kref_put(&zone->refcnt, reada_zone_release);
493 }
494 spin_unlock(&zone->lock);
495
496 spin_lock(&fs_info->reada_lock);
497 kref_put(&zone->refcnt, reada_zone_release);
498 spin_unlock(&fs_info->reada_lock);
499 }
500 if (re->scheduled_for)
501 atomic_dec(&re->scheduled_for->reada_in_flight);
502
503 kfree(re);
504}
505
506static void reada_zone_release(struct kref *kref)
507{
508 struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
509
510 radix_tree_delete(&zone->device->reada_zones,
511 zone->end >> PAGE_CACHE_SHIFT);
512
513 kfree(zone);
514}
515
516static void reada_control_release(struct kref *kref)
517{
518 struct reada_control *rc = container_of(kref, struct reada_control,
519 refcnt);
520
521 kfree(rc);
522}
523
524static int reada_add_block(struct reada_control *rc, u64 logical,
525 struct btrfs_key *top, int level, u64 generation)
526{
527 struct btrfs_root *root = rc->root;
528 struct reada_extent *re;
529 struct reada_extctl *rec;
530
531 re = reada_find_extent(root, logical, top, level); /* takes one ref */
532 if (!re)
533 return -1;
534
535 rec = kzalloc(sizeof(*rec), GFP_NOFS);
536 if (!rec) {
537 reada_extent_put(root->fs_info, re);
538 return -1;
539 }
540
541 rec->rc = rc;
542 rec->generation = generation;
543 atomic_inc(&rc->elems);
544
545 spin_lock(&re->lock);
546 list_add_tail(&rec->list, &re->extctl);
547 spin_unlock(&re->lock);
548
549 /* leave the ref on the extent */
550
551 return 0;
552}
553
554/*
555 * called with fs_info->reada_lock held
556 */
557static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
558{
559 int i;
560 unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
561
562 for (i = 0; i < zone->ndevs; ++i) {
563 struct reada_zone *peer;
564 peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
565 if (peer && peer->device != zone->device)
566 peer->locked = lock;
567 }
568}
569
570/*
571 * called with fs_info->reada_lock held
572 */
573static int reada_pick_zone(struct btrfs_device *dev)
574{
575 struct reada_zone *top_zone = NULL;
576 struct reada_zone *top_locked_zone = NULL;
577 u64 top_elems = 0;
578 u64 top_locked_elems = 0;
579 unsigned long index = 0;
580 int ret;
581
582 if (dev->reada_curr_zone) {
583 reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
584 kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
585 dev->reada_curr_zone = NULL;
586 }
587 /* pick the zone with the most elements */
588 while (1) {
589 struct reada_zone *zone;
590
591 ret = radix_tree_gang_lookup(&dev->reada_zones,
592 (void **)&zone, index, 1);
593 if (ret == 0)
594 break;
595 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
596 if (zone->locked) {
597 if (zone->elems > top_locked_elems) {
598 top_locked_elems = zone->elems;
599 top_locked_zone = zone;
600 }
601 } else {
602 if (zone->elems > top_elems) {
603 top_elems = zone->elems;
604 top_zone = zone;
605 }
606 }
607 }
608 if (top_zone)
609 dev->reada_curr_zone = top_zone;
610 else if (top_locked_zone)
611 dev->reada_curr_zone = top_locked_zone;
612 else
613 return 0;
614
615 dev->reada_next = dev->reada_curr_zone->start;
616 kref_get(&dev->reada_curr_zone->refcnt);
617 reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
618
619 return 1;
620}
621
622static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
623 struct btrfs_device *dev)
624{
625 struct reada_extent *re = NULL;
626 int mirror_num = 0;
627 struct extent_buffer *eb = NULL;
628 u64 logical;
629 u32 blocksize;
630 int ret;
631 int i;
632 int need_kick = 0;
633
634 spin_lock(&fs_info->reada_lock);
635 if (dev->reada_curr_zone == NULL) {
636 ret = reada_pick_zone(dev);
637 if (!ret) {
638 spin_unlock(&fs_info->reada_lock);
639 return 0;
640 }
641 }
642 /*
643 * FIXME currently we issue the reads one extent at a time. If we have
644 * a contiguous block of extents, we could also coalesce them or use
645 * plugging to speed things up
646 */
647 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
648 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
649 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
650 ret = reada_pick_zone(dev);
651 if (!ret) {
652 spin_unlock(&fs_info->reada_lock);
653 return 0;
654 }
655 re = NULL;
656 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
657 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
658 }
659 if (ret == 0) {
660 spin_unlock(&fs_info->reada_lock);
661 return 0;
662 }
663 dev->reada_next = re->logical + re->blocksize;
664 kref_get(&re->refcnt);
665
666 spin_unlock(&fs_info->reada_lock);
667
668 /*
669 * find mirror num
670 */
671 for (i = 0; i < re->nzones; ++i) {
672 if (re->zones[i]->device == dev) {
673 mirror_num = i + 1;
674 break;
675 }
676 }
677 logical = re->logical;
678 blocksize = re->blocksize;
679
680 spin_lock(&re->lock);
681 if (re->scheduled_for == NULL) {
682 re->scheduled_for = dev;
683 need_kick = 1;
684 }
685 spin_unlock(&re->lock);
686
687 reada_extent_put(fs_info, re);
688
689 if (!need_kick)
690 return 0;
691
692 atomic_inc(&dev->reada_in_flight);
693 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
694 mirror_num, &eb);
695 if (ret)
696 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
697 else if (eb)
698 __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
699
700 if (eb)
701 free_extent_buffer(eb);
702
703 return 1;
704
705}
706
707static void reada_start_machine_worker(struct btrfs_work *work)
708{
709 struct reada_machine_work *rmw;
710 struct btrfs_fs_info *fs_info;
711
712 rmw = container_of(work, struct reada_machine_work, work);
713 fs_info = rmw->fs_info;
714
715 kfree(rmw);
716
717 __reada_start_machine(fs_info);
718}
719
720static void __reada_start_machine(struct btrfs_fs_info *fs_info)
721{
722 struct btrfs_device *device;
723 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
724 u64 enqueued;
725 u64 total = 0;
726 int i;
727
728 do {
729 enqueued = 0;
730 list_for_each_entry(device, &fs_devices->devices, dev_list) {
731 if (atomic_read(&device->reada_in_flight) <
732 MAX_IN_FLIGHT)
733 enqueued += reada_start_machine_dev(fs_info,
734 device);
735 }
736 total += enqueued;
737 } while (enqueued && total < 10000);
738
739 if (enqueued == 0)
740 return;
741
742 /*
743 * If everything is already in the cache, this is effectively single
744 * threaded. To a) not hold the caller for too long and b) to utilize
745 * more cores, we broke the loop above after 10000 iterations and now
746 * enqueue to workers to finish it. This will distribute the load to
747 * the cores.
748 */
749 for (i = 0; i < 2; ++i)
750 reada_start_machine(fs_info);
751}
752
753static void reada_start_machine(struct btrfs_fs_info *fs_info)
754{
755 struct reada_machine_work *rmw;
756
757 rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
758 if (!rmw) {
759 /* FIXME we cannot handle this properly right now */
760 BUG();
761 }
762 rmw->work.func = reada_start_machine_worker;
763 rmw->fs_info = fs_info;
764
765 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
766}
767
768#ifdef DEBUG
769static void dump_devs(struct btrfs_fs_info *fs_info, int all)
770{
771 struct btrfs_device *device;
772 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
773 unsigned long index;
774 int ret;
775 int i;
776 int j;
777 int cnt;
778
779 spin_lock(&fs_info->reada_lock);
780 list_for_each_entry(device, &fs_devices->devices, dev_list) {
781 printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
782 atomic_read(&device->reada_in_flight));
783 index = 0;
784 while (1) {
785 struct reada_zone *zone;
786 ret = radix_tree_gang_lookup(&device->reada_zones,
787 (void **)&zone, index, 1);
788 if (ret == 0)
789 break;
790 printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
791 "%d devs", zone->start, zone->end, zone->elems,
792 zone->locked);
793 for (j = 0; j < zone->ndevs; ++j) {
794 printk(KERN_CONT " %lld",
795 zone->devs[j]->devid);
796 }
797 if (device->reada_curr_zone == zone)
798 printk(KERN_CONT " curr off %llu",
799 device->reada_next - zone->start);
800 printk(KERN_CONT "\n");
801 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
802 }
803 cnt = 0;
804 index = 0;
805 while (all) {
806 struct reada_extent *re = NULL;
807
808 ret = radix_tree_gang_lookup(&device->reada_extents,
809 (void **)&re, index, 1);
810 if (ret == 0)
811 break;
812 printk(KERN_DEBUG
813 " re: logical %llu size %u empty %d for %lld",
814 re->logical, re->blocksize,
815 list_empty(&re->extctl), re->scheduled_for ?
816 re->scheduled_for->devid : -1);
817
818 for (i = 0; i < re->nzones; ++i) {
819 printk(KERN_CONT " zone %llu-%llu devs",
820 re->zones[i]->start,
821 re->zones[i]->end);
822 for (j = 0; j < re->zones[i]->ndevs; ++j) {
823 printk(KERN_CONT " %lld",
824 re->zones[i]->devs[j]->devid);
825 }
826 }
827 printk(KERN_CONT "\n");
828 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
829 if (++cnt > 15)
830 break;
831 }
832 }
833
834 index = 0;
835 cnt = 0;
836 while (all) {
837 struct reada_extent *re = NULL;
838
839 ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
840 index, 1);
841 if (ret == 0)
842 break;
843 if (!re->scheduled_for) {
844 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
845 continue;
846 }
847 printk(KERN_DEBUG
848 "re: logical %llu size %u list empty %d for %lld",
849 re->logical, re->blocksize, list_empty(&re->extctl),
850 re->scheduled_for ? re->scheduled_for->devid : -1);
851 for (i = 0; i < re->nzones; ++i) {
852 printk(KERN_CONT " zone %llu-%llu devs",
853 re->zones[i]->start,
854 re->zones[i]->end);
855 for (i = 0; i < re->nzones; ++i) {
856 printk(KERN_CONT " zone %llu-%llu devs",
857 re->zones[i]->start,
858 re->zones[i]->end);
859 for (j = 0; j < re->zones[i]->ndevs; ++j) {
860 printk(KERN_CONT " %lld",
861 re->zones[i]->devs[j]->devid);
862 }
863 }
864 }
865 printk(KERN_CONT "\n");
866 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
867 }
868 spin_unlock(&fs_info->reada_lock);
869}
870#endif
871
872/*
873 * interface
874 */
875struct reada_control *btrfs_reada_add(struct btrfs_root *root,
876 struct btrfs_key *key_start, struct btrfs_key *key_end)
877{
878 struct reada_control *rc;
879 u64 start;
880 u64 generation;
881 int level;
882 struct extent_buffer *node;
883 static struct btrfs_key max_key = {
884 .objectid = (u64)-1,
885 .type = (u8)-1,
886 .offset = (u64)-1
887 };
888
889 rc = kzalloc(sizeof(*rc), GFP_NOFS);
890 if (!rc)
891 return ERR_PTR(-ENOMEM);
892
893 rc->root = root;
894 rc->key_start = *key_start;
895 rc->key_end = *key_end;
896 atomic_set(&rc->elems, 0);
897 init_waitqueue_head(&rc->wait);
898 kref_init(&rc->refcnt);
899 kref_get(&rc->refcnt); /* one ref for having elements */
900
901 node = btrfs_root_node(root);
902 start = node->start;
903 level = btrfs_header_level(node);
904 generation = btrfs_header_generation(node);
905 free_extent_buffer(node);
906
907 reada_add_block(rc, start, &max_key, level, generation);
908
909 reada_start_machine(root->fs_info);
910
911 return rc;
912}
913
914#ifdef DEBUG
915int btrfs_reada_wait(void *handle)
916{
917 struct reada_control *rc = handle;
918
919 while (atomic_read(&rc->elems)) {
920 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
921 5 * HZ);
922 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
923 }
924
925 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
926
927 kref_put(&rc->refcnt, reada_control_release);
928
929 return 0;
930}
931#else
932int btrfs_reada_wait(void *handle)
933{
934 struct reada_control *rc = handle;
935
936 while (atomic_read(&rc->elems)) {
937 wait_event(rc->wait, atomic_read(&rc->elems) == 0);
938 }
939
940 kref_put(&rc->refcnt, reada_control_release);
941
942 return 0;
943}
944#endif
945
946void btrfs_reada_detach(void *handle)
947{
948 struct reada_control *rc = handle;
949
950 kref_put(&rc->refcnt, reada_control_release);
951}
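
The three functions above (btrfs_reada_add, btrfs_reada_wait and btrfs_reada_detach) are the whole public interface of the new readahead code. The helper below is a rough, hedged sketch of how a caller drives that interface; it is illustrative only and not part of this patch. The scrub changes further down use the same pattern for the extent and csum trees, and the usual btrfs headers are assumed.

/*
 * Illustration only: prefetch all EXTENT_ITEMs in [start, end) of the
 * extent tree before operating on them.
 */
static void example_prefetch_extents(struct btrfs_root *extent_root,
				     u64 start, u64 end)
{
	struct btrfs_key key_start;
	struct btrfs_key key_end;
	struct reada_control *rc;

	key_start.objectid = start;
	key_start.type = BTRFS_EXTENT_ITEM_KEY;
	key_start.offset = 0;
	key_end.objectid = end;
	key_end.type = BTRFS_EXTENT_ITEM_KEY;
	key_end.offset = 0;

	rc = btrfs_reada_add(extent_root, &key_start, &key_end);
	if (IS_ERR(rc))
		return;	/* readahead is only an optimization, ignore errors */

	/*
	 * Block until all requested blocks were read or failed.  A caller
	 * that does not want to wait would call btrfs_reada_detach(rc)
	 * instead; either way the caller's reference is dropped exactly once.
	 */
	btrfs_reada_wait(rc);
}
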
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273d..dff29d5e151a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1174 list_add_tail(&new_edge->list[UPPER], 1174 list_add_tail(&new_edge->list[UPPER],
1175 &new_node->lower); 1175 &new_node->lower);
1176 } 1176 }
1177 } else {
1178 list_add_tail(&new_node->lower, &cache->leaves);
1177 } 1179 }
1178 1180
1179 rb_node = tree_insert(&cache->rb_root, new_node->bytenr, 1181 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
@@ -2041,8 +2043,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2041 BUG_ON(IS_ERR(trans)); 2043 BUG_ON(IS_ERR(trans));
2042 trans->block_rsv = rc->block_rsv; 2044 trans->block_rsv = rc->block_rsv;
2043 2045
2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2046 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
2045 min_reserved, 0);
2046 if (ret) { 2047 if (ret) {
2047 BUG_ON(ret != -EAGAIN); 2048 BUG_ON(ret != -EAGAIN);
2048 ret = btrfs_commit_transaction(trans, root); 2049 ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2153,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2152again: 2153again:
2153 if (!err) { 2154 if (!err) {
2154 num_bytes = rc->merging_rsv_size; 2155 num_bytes = rc->merging_rsv_size;
2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2156 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2156 num_bytes);
2157 if (ret) 2157 if (ret)
2158 err = ret; 2158 err = ret;
2159 } 2159 }
@@ -2427,7 +2427,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2428 2428
2429 trans->block_rsv = rc->block_rsv; 2429 trans->block_rsv = rc->block_rsv;
2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); 2430 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2431 if (ret) { 2431 if (ret) {
2432 if (ret == -EAGAIN) 2432 if (ret == -EAGAIN)
2433 rc->commit_transaction = 1; 2433 rc->commit_transaction = 1;
@@ -2922,6 +2922,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2922 unsigned long last_index; 2922 unsigned long last_index;
2923 struct page *page; 2923 struct page *page;
2924 struct file_ra_state *ra; 2924 struct file_ra_state *ra;
2925 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2925 int nr = 0; 2926 int nr = 0;
2926 int ret = 0; 2927 int ret = 0;
2927 2928
@@ -2956,7 +2957,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2956 ra, NULL, index, 2957 ra, NULL, index,
2957 last_index + 1 - index); 2958 last_index + 1 - index);
2958 page = find_or_create_page(inode->i_mapping, index, 2959 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS); 2960 mask);
2960 if (!page) { 2961 if (!page) {
2961 btrfs_delalloc_release_metadata(inode, 2962 btrfs_delalloc_release_metadata(inode,
2962 PAGE_CACHE_SIZE); 2963 PAGE_CACHE_SIZE);
@@ -3323,8 +3324,11 @@ static int find_data_references(struct reloc_control *rc,
3323 } 3324 }
3324 3325
3325 key.objectid = ref_objectid; 3326 key.objectid = ref_objectid;
3326 key.offset = ref_offset;
3327 key.type = BTRFS_EXTENT_DATA_KEY; 3327 key.type = BTRFS_EXTENT_DATA_KEY;
3328 if (ref_offset > ((u64)-1 << 32))
3329 key.offset = 0;
3330 else
3331 key.offset = ref_offset;
3328 3332
3329 path->search_commit_root = 1; 3333 path->search_commit_root = 1;
3330 path->skip_locking = 1; 3334 path->skip_locking = 1;
@@ -3645,14 +3649,11 @@ int prepare_to_relocate(struct reloc_control *rc)
3645 * btrfs_init_reloc_root will use them when there 3649 * btrfs_init_reloc_root will use them when there
3646 * is no reservation in transaction handle. 3650 * is no reservation in transaction handle.
3647 */ 3651 */
3648 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3652 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3649 rc->extent_root->nodesize * 256); 3653 rc->extent_root->nodesize * 256);
3650 if (ret) 3654 if (ret)
3651 return ret; 3655 return ret;
3652 3656
3653 rc->block_rsv->refill_used = 1;
3654 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3655
3656 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3657 memset(&rc->cluster, 0, sizeof(rc->cluster));
3657 rc->search_start = rc->block_group->key.objectid; 3658 rc->search_start = rc->block_group->key.objectid;
3658 rc->extents_found = 0; 3659 rc->extents_found = 0;
@@ -3777,8 +3778,7 @@ restart:
3777 } 3778 }
3778 } 3779 }
3779 3780
3780 ret = btrfs_block_rsv_check(trans, rc->extent_root, 3781 ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3781 rc->block_rsv, 0, 5);
3782 if (ret < 0) { 3782 if (ret < 0) {
3783 if (ret != -EAGAIN) { 3783 if (ret != -EAGAIN) {
3784 err = ret; 3784 err = ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5d..fab420db5121 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
17 */ 17 */
18 18
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "volumes.h" 22#include "volumes.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "ordered-data.h" 24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
24 28
25/* 29/*
26 * This is only the first step towards a full-features scrub. It reads all 30 * This is only the first step towards a full-features scrub. It reads all
@@ -29,15 +33,12 @@
29 * any can be found. 33 * any can be found.
30 * 34 *
31 * Future enhancements: 35 * Future enhancements:
32 * - To enhance the performance, better read-ahead strategies for the
33 * extent-tree can be employed.
34 * - In case an unrepairable extent is encountered, track which files are 36 * - In case an unrepairable extent is encountered, track which files are
35 * affected and report them 37 * affected and report them
36 * - In case of a read error on files with nodatasum, map the file and read 38 * - In case of a read error on files with nodatasum, map the file and read
37 * the extent to trigger a writeback of the good copy 39 * the extent to trigger a writeback of the good copy
38 * - track and record media errors, throw out bad devices 40 * - track and record media errors, throw out bad devices
39 * - add a mode to also read unallocated space 41 * - add a mode to also read unallocated space
40 * - make the prefetch cancellable
41 */ 42 */
42 43
43struct scrub_bio; 44struct scrub_bio;
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
63struct scrub_page { 64struct scrub_page {
64 u64 flags; /* extent flags */ 65 u64 flags; /* extent flags */
65 u64 generation; 66 u64 generation;
66 u64 mirror_num; 67 int mirror_num;
67 int have_csum; 68 int have_csum;
68 u8 csum[BTRFS_CSUM_SIZE]; 69 u8 csum[BTRFS_CSUM_SIZE];
69}; 70};
@@ -87,6 +88,7 @@ struct scrub_dev {
87 int first_free; 88 int first_free;
88 int curr; 89 int curr;
89 atomic_t in_flight; 90 atomic_t in_flight;
91 atomic_t fixup_cnt;
90 spinlock_t list_lock; 92 spinlock_t list_lock;
91 wait_queue_head_t list_wait; 93 wait_queue_head_t list_wait;
92 u16 csum_size; 94 u16 csum_size;
@@ -100,6 +102,27 @@ struct scrub_dev {
100 spinlock_t stat_lock; 102 spinlock_t stat_lock;
101}; 103};
102 104
105struct scrub_fixup_nodatasum {
106 struct scrub_dev *sdev;
107 u64 logical;
108 struct btrfs_root *root;
109 struct btrfs_work work;
110 int mirror_num;
111};
112
113struct scrub_warning {
114 struct btrfs_path *path;
115 u64 extent_item_size;
116 char *scratch_buf;
117 char *msg_buf;
118 const char *errstr;
119 sector_t sector;
120 u64 logical;
121 struct btrfs_device *dev;
122 int msg_bufsize;
123 int scratch_bufsize;
124};
125
103static void scrub_free_csums(struct scrub_dev *sdev) 126static void scrub_free_csums(struct scrub_dev *sdev)
104{ 127{
105 while (!list_empty(&sdev->csum_list)) { 128 while (!list_empty(&sdev->csum_list)) {
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
175 198
176 if (i != SCRUB_BIOS_PER_DEV-1) 199 if (i != SCRUB_BIOS_PER_DEV-1)
177 sdev->bios[i]->next_free = i + 1; 200 sdev->bios[i]->next_free = i + 1;
178 else 201 else
179 sdev->bios[i]->next_free = -1; 202 sdev->bios[i]->next_free = -1;
180 } 203 }
181 sdev->first_free = 0; 204 sdev->first_free = 0;
182 sdev->curr = -1; 205 sdev->curr = -1;
183 atomic_set(&sdev->in_flight, 0); 206 atomic_set(&sdev->in_flight, 0);
207 atomic_set(&sdev->fixup_cnt, 0);
184 atomic_set(&sdev->cancel_req, 0); 208 atomic_set(&sdev->cancel_req, 0);
185 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); 209 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
186 INIT_LIST_HEAD(&sdev->csum_list); 210 INIT_LIST_HEAD(&sdev->csum_list);
187 211
188 spin_lock_init(&sdev->list_lock); 212 spin_lock_init(&sdev->list_lock);
@@ -195,24 +219,361 @@ nomem:
195 return ERR_PTR(-ENOMEM); 219 return ERR_PTR(-ENOMEM);
196} 220}
197 221
222static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
223{
224 u64 isize;
225 u32 nlink;
226 int ret;
227 int i;
228 struct extent_buffer *eb;
229 struct btrfs_inode_item *inode_item;
230 struct scrub_warning *swarn = ctx;
231 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
232 struct inode_fs_paths *ipath = NULL;
233 struct btrfs_root *local_root;
234 struct btrfs_key root_key;
235
236 root_key.objectid = root;
237 root_key.type = BTRFS_ROOT_ITEM_KEY;
238 root_key.offset = (u64)-1;
239 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
240 if (IS_ERR(local_root)) {
241 ret = PTR_ERR(local_root);
242 goto err;
243 }
244
245 ret = inode_item_info(inum, 0, local_root, swarn->path);
246 if (ret) {
247 btrfs_release_path(swarn->path);
248 goto err;
249 }
250
251 eb = swarn->path->nodes[0];
252 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
253 struct btrfs_inode_item);
254 isize = btrfs_inode_size(eb, inode_item);
255 nlink = btrfs_inode_nlink(eb, inode_item);
256 btrfs_release_path(swarn->path);
257
258 ipath = init_ipath(4096, local_root, swarn->path);
259 ret = paths_from_inode(inum, ipath);
260
261 if (ret < 0)
262 goto err;
263
264 /*
265 * we deliberately ignore the fact that ipath might have been too small to
266 * hold all of the paths here
267 */
268 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
269 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
270 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
271 "length %llu, links %u (path: %s)\n", swarn->errstr,
272 swarn->logical, swarn->dev->name,
273 (unsigned long long)swarn->sector, root, inum, offset,
274 min(isize - offset, (u64)PAGE_SIZE), nlink,
275 (char *)(unsigned long)ipath->fspath->val[i]);
276
277 free_ipath(ipath);
278 return 0;
279
280err:
281 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
282 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
283 "resolving failed with ret=%d\n", swarn->errstr,
284 swarn->logical, swarn->dev->name,
285 (unsigned long long)swarn->sector, root, inum, offset, ret);
286
287 free_ipath(ipath);
288 return 0;
289}
290
291static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
292 int ix)
293{
294 struct btrfs_device *dev = sbio->sdev->dev;
295 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
296 struct btrfs_path *path;
297 struct btrfs_key found_key;
298 struct extent_buffer *eb;
299 struct btrfs_extent_item *ei;
300 struct scrub_warning swarn;
301 u32 item_size;
302 int ret;
303 u64 ref_root;
304 u8 ref_level;
305 unsigned long ptr = 0;
306 const int bufsize = 4096;
307 u64 extent_offset;
308
309 path = btrfs_alloc_path();
310
311 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
312 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
313 swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
314 swarn.logical = sbio->logical + ix * PAGE_SIZE;
315 swarn.errstr = errstr;
316 swarn.dev = dev;
317 swarn.msg_bufsize = bufsize;
318 swarn.scratch_bufsize = bufsize;
319
320 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
321 goto out;
322
323 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
324 if (ret < 0)
325 goto out;
326
327 extent_offset = swarn.logical - found_key.objectid;
328 swarn.extent_item_size = found_key.offset;
329
330 eb = path->nodes[0];
331 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
332 item_size = btrfs_item_size_nr(eb, path->slots[0]);
333
334 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
335 do {
336 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
337 &ref_root, &ref_level);
338 printk(KERN_WARNING "%s at logical %llu on dev %s, "
339 "sector %llu: metadata %s (level %d) in tree "
340 "%llu\n", errstr, swarn.logical, dev->name,
341 (unsigned long long)swarn.sector,
342 ref_level ? "node" : "leaf",
343 ret < 0 ? -1 : ref_level,
344 ret < 0 ? -1 : ref_root);
345 } while (ret != 1);
346 } else {
347 swarn.path = path;
348 iterate_extent_inodes(fs_info, path, found_key.objectid,
349 extent_offset,
350 scrub_print_warning_inode, &swarn);
351 }
352
353out:
354 btrfs_free_path(path);
355 kfree(swarn.scratch_buf);
356 kfree(swarn.msg_buf);
357}
358
359static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
360{
361 struct page *page = NULL;
362 unsigned long index;
363 struct scrub_fixup_nodatasum *fixup = ctx;
364 int ret;
365 int corrected = 0;
366 struct btrfs_key key;
367 struct inode *inode = NULL;
368 u64 end = offset + PAGE_SIZE - 1;
369 struct btrfs_root *local_root;
370
371 key.objectid = root;
372 key.type = BTRFS_ROOT_ITEM_KEY;
373 key.offset = (u64)-1;
374 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
375 if (IS_ERR(local_root))
376 return PTR_ERR(local_root);
377
378 key.type = BTRFS_INODE_ITEM_KEY;
379 key.objectid = inum;
380 key.offset = 0;
381 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
382 if (IS_ERR(inode))
383 return PTR_ERR(inode);
384
385 index = offset >> PAGE_CACHE_SHIFT;
386
387 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
388 if (!page) {
389 ret = -ENOMEM;
390 goto out;
391 }
392
393 if (PageUptodate(page)) {
394 struct btrfs_mapping_tree *map_tree;
395 if (PageDirty(page)) {
396 /*
397 * we need to write the data to the defective sector. the
398 * data that was in that sector is not in memory,
399 * because the page was modified. we must not write the
400 * modified page to that sector.
401 *
402 * TODO: what could be done here: wait for the delalloc
403 * runner to write out that page (might involve
404 * COW) and see whether the sector is still
405 * referenced afterwards.
406 *
407 * For the meantime, we'll treat this error as
408 * uncorrectable, although there is a chance that a
409 * later scrub will find the bad sector again and that
410 * there's no dirty page in memory then.
411 */
412 ret = -EIO;
413 goto out;
414 }
415 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
416 ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
417 fixup->logical, page,
418 fixup->mirror_num);
419 unlock_page(page);
420 corrected = !ret;
421 } else {
422 /*
423 * we need to get good data first. the general readpage path
424 * will call repair_io_failure for us, we just have to make
425 * sure we read the bad mirror.
426 */
427 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
428 EXTENT_DAMAGED, GFP_NOFS);
429 if (ret) {
430 /* set_extent_bits should give proper error */
431 WARN_ON(ret > 0);
432 if (ret > 0)
433 ret = -EFAULT;
434 goto out;
435 }
436
437 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
438 btrfs_get_extent,
439 fixup->mirror_num);
440 wait_on_page_locked(page);
441
442 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
443 end, EXTENT_DAMAGED, 0, NULL);
444 if (!corrected)
445 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
446 EXTENT_DAMAGED, GFP_NOFS);
447 }
448
449out:
450 if (page)
451 put_page(page);
452 if (inode)
453 iput(inode);
454
455 if (ret < 0)
456 return ret;
457
458 if (ret == 0 && corrected) {
459 /*
460 * we only need to call readpage for one of the inodes belonging
461 * to this extent. so make iterate_extent_inodes stop
462 */
463 return 1;
464 }
465
466 return -EIO;
467}
468
469static void scrub_fixup_nodatasum(struct btrfs_work *work)
470{
471 int ret;
472 struct scrub_fixup_nodatasum *fixup;
473 struct scrub_dev *sdev;
474 struct btrfs_trans_handle *trans = NULL;
475 struct btrfs_fs_info *fs_info;
476 struct btrfs_path *path;
477 int uncorrectable = 0;
478
479 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
480 sdev = fixup->sdev;
481 fs_info = fixup->root->fs_info;
482
483 path = btrfs_alloc_path();
484 if (!path) {
485 spin_lock(&sdev->stat_lock);
486 ++sdev->stat.malloc_errors;
487 spin_unlock(&sdev->stat_lock);
488 uncorrectable = 1;
489 goto out;
490 }
491
492 trans = btrfs_join_transaction(fixup->root);
493 if (IS_ERR(trans)) {
494 uncorrectable = 1;
495 goto out;
496 }
497
498 /*
499 * the idea is to trigger a regular read through the standard path. we
500 * read a page from the (failed) logical address by specifying the
501 * corresponding copynum of the failed sector. thus, that readpage is
502 * expected to fail.
503 * that is the point where on-the-fly error correction will kick in
504 * (once it's finished) and rewrite the failed sector if a good copy
505 * can be found.
506 */
507 ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
508 path, scrub_fixup_readpage,
509 fixup);
510 if (ret < 0) {
511 uncorrectable = 1;
512 goto out;
513 }
514 WARN_ON(ret != 1);
515
516 spin_lock(&sdev->stat_lock);
517 ++sdev->stat.corrected_errors;
518 spin_unlock(&sdev->stat_lock);
519
520out:
521 if (trans && !IS_ERR(trans))
522 btrfs_end_transaction(trans, fixup->root);
523 if (uncorrectable) {
524 spin_lock(&sdev->stat_lock);
525 ++sdev->stat.uncorrectable_errors;
526 spin_unlock(&sdev->stat_lock);
527 printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
528 "(nodatasum) error at logical %llu\n",
529 fixup->logical);
530 }
531
532 btrfs_free_path(path);
533 kfree(fixup);
534
535 /* see the caller for why we're pretending to be paused in the scrub counters */
536 mutex_lock(&fs_info->scrub_lock);
537 atomic_dec(&fs_info->scrubs_running);
538 atomic_dec(&fs_info->scrubs_paused);
539 mutex_unlock(&fs_info->scrub_lock);
540 atomic_dec(&sdev->fixup_cnt);
541 wake_up(&fs_info->scrub_pause_wait);
542 wake_up(&sdev->list_wait);
543}
544
198/* 545/*
199 * scrub_recheck_error gets called when either verification of the page 546 * scrub_recheck_error gets called when either verification of the page
200 * failed or the bio failed to read, e.g. with EIO. In the latter case, 547 * failed or the bio failed to read, e.g. with EIO. In the latter case,
201 * recheck_error gets called for every page in the bio, even though only 548 * recheck_error gets called for every page in the bio, even though only
202 * one may be bad 549 * one may be bad
203 */ 550 */
204static void scrub_recheck_error(struct scrub_bio *sbio, int ix) 551static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
205{ 552{
553 struct scrub_dev *sdev = sbio->sdev;
554 u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
555 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
556 DEFAULT_RATELIMIT_BURST);
557
206 if (sbio->err) { 558 if (sbio->err) {
207 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, 559 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
208 (sbio->physical + ix * PAGE_SIZE) >> 9,
209 sbio->bio->bi_io_vec[ix].bv_page) == 0) { 560 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
210 if (scrub_fixup_check(sbio, ix) == 0) 561 if (scrub_fixup_check(sbio, ix) == 0)
211 return; 562 return 0;
212 } 563 }
564 if (__ratelimit(&_rs))
565 scrub_print_warning("i/o error", sbio, ix);
566 } else {
567 if (__ratelimit(&_rs))
568 scrub_print_warning("checksum error", sbio, ix);
213 } 569 }
214 570
571 spin_lock(&sdev->stat_lock);
572 ++sdev->stat.read_errors;
573 spin_unlock(&sdev->stat_lock);
574
215 scrub_fixup(sbio, ix); 575 scrub_fixup(sbio, ix);
576 return 1;
216} 577}
217 578
218static int scrub_fixup_check(struct scrub_bio *sbio, int ix) 579static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@ -250,7 +611,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
250 struct scrub_dev *sdev = sbio->sdev; 611 struct scrub_dev *sdev = sbio->sdev;
251 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 612 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
252 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 613 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
253 struct btrfs_multi_bio *multi = NULL; 614 struct btrfs_bio *bbio = NULL;
615 struct scrub_fixup_nodatasum *fixup;
254 u64 logical = sbio->logical + ix * PAGE_SIZE; 616 u64 logical = sbio->logical + ix * PAGE_SIZE;
255 u64 length; 617 u64 length;
256 int i; 618 int i;
@@ -259,38 +621,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
259 621
260 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && 622 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
261 (sbio->spag[ix].have_csum == 0)) { 623 (sbio->spag[ix].have_csum == 0)) {
624 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
625 if (!fixup)
626 goto uncorrectable;
627 fixup->sdev = sdev;
628 fixup->logical = logical;
629 fixup->root = fs_info->extent_root;
630 fixup->mirror_num = sbio->spag[ix].mirror_num;
262 /* 631 /*
263 * nodatasum, don't try to fix anything 632 * increment scrubs_running to prevent cancel requests from
264 * FIXME: we can do better, open the inode and trigger a 633 * completing as long as a fixup worker is running. we must also
265 * writeback 634 * increment scrubs_paused to prevent deadlocking on pause
635 * requests used for transaction commits (as the worker uses a
636 * transaction context). it is safe to regard the fixup worker
637 * as paused for all practical matters. effectively, we only
638 * prevent cancellation requests from completing.
266 */ 639 */
267 goto uncorrectable; 640 mutex_lock(&fs_info->scrub_lock);
641 atomic_inc(&fs_info->scrubs_running);
642 atomic_inc(&fs_info->scrubs_paused);
643 mutex_unlock(&fs_info->scrub_lock);
644 atomic_inc(&sdev->fixup_cnt);
645 fixup->work.func = scrub_fixup_nodatasum;
646 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
647 return;
268 } 648 }
269 649
270 length = PAGE_SIZE; 650 length = PAGE_SIZE;
271 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, 651 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
272 &multi, 0); 652 &bbio, 0);
273 if (ret || !multi || length < PAGE_SIZE) { 653 if (ret || !bbio || length < PAGE_SIZE) {
274 printk(KERN_ERR 654 printk(KERN_ERR
275 "scrub_fixup: btrfs_map_block failed us for %llu\n", 655 "scrub_fixup: btrfs_map_block failed us for %llu\n",
276 (unsigned long long)logical); 656 (unsigned long long)logical);
277 WARN_ON(1); 657 WARN_ON(1);
658 kfree(bbio);
278 return; 659 return;
279 } 660 }
280 661
281 if (multi->num_stripes == 1) 662 if (bbio->num_stripes == 1)
282 /* there aren't any replicas */ 663 /* there aren't any replicas */
283 goto uncorrectable; 664 goto uncorrectable;
284 665
285 /* 666 /*
286 * first find a good copy 667 * first find a good copy
287 */ 668 */
288 for (i = 0; i < multi->num_stripes; ++i) { 669 for (i = 0; i < bbio->num_stripes; ++i) {
289 if (i == sbio->spag[ix].mirror_num) 670 if (i + 1 == sbio->spag[ix].mirror_num)
290 continue; 671 continue;
291 672
292 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, 673 if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
293 multi->stripes[i].physical >> 9, 674 bbio->stripes[i].physical >> 9,
294 sbio->bio->bi_io_vec[ix].bv_page)) { 675 sbio->bio->bi_io_vec[ix].bv_page)) {
295 /* I/O-error, this is not a good copy */ 676 /* I/O-error, this is not a good copy */
296 continue; 677 continue;
@@ -299,7 +680,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
299 if (scrub_fixup_check(sbio, ix) == 0) 680 if (scrub_fixup_check(sbio, ix) == 0)
300 break; 681 break;
301 } 682 }
302 if (i == multi->num_stripes) 683 if (i == bbio->num_stripes)
303 goto uncorrectable; 684 goto uncorrectable;
304 685
305 if (!sdev->readonly) { 686 if (!sdev->readonly) {
@@ -314,25 +695,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
314 } 695 }
315 } 696 }
316 697
317 kfree(multi); 698 kfree(bbio);
318 spin_lock(&sdev->stat_lock); 699 spin_lock(&sdev->stat_lock);
319 ++sdev->stat.corrected_errors; 700 ++sdev->stat.corrected_errors;
320 spin_unlock(&sdev->stat_lock); 701 spin_unlock(&sdev->stat_lock);
321 702
322 if (printk_ratelimit()) 703 printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
323 printk(KERN_ERR "btrfs: fixed up at %llu\n", 704 (unsigned long long)logical);
324 (unsigned long long)logical);
325 return; 705 return;
326 706
327uncorrectable: 707uncorrectable:
328 kfree(multi); 708 kfree(bbio);
329 spin_lock(&sdev->stat_lock); 709 spin_lock(&sdev->stat_lock);
330 ++sdev->stat.uncorrectable_errors; 710 ++sdev->stat.uncorrectable_errors;
331 spin_unlock(&sdev->stat_lock); 711 spin_unlock(&sdev->stat_lock);
332 712
333 if (printk_ratelimit()) 713 printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
334 printk(KERN_ERR "btrfs: unable to fixup at %llu\n", 714 "logical %llu\n", (unsigned long long)logical);
335 (unsigned long long)logical);
336} 715}
337 716
338static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, 717static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@@ -382,8 +761,14 @@ static void scrub_checksum(struct btrfs_work *work)
382 int ret; 761 int ret;
383 762
384 if (sbio->err) { 763 if (sbio->err) {
764 ret = 0;
385 for (i = 0; i < sbio->count; ++i) 765 for (i = 0; i < sbio->count; ++i)
386 scrub_recheck_error(sbio, i); 766 ret |= scrub_recheck_error(sbio, i);
767 if (!ret) {
768 spin_lock(&sdev->stat_lock);
769 ++sdev->stat.unverified_errors;
770 spin_unlock(&sdev->stat_lock);
771 }
387 772
388 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); 773 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
389 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 774 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -396,10 +781,6 @@ static void scrub_checksum(struct btrfs_work *work)
396 bi->bv_offset = 0; 781 bi->bv_offset = 0;
397 bi->bv_len = PAGE_SIZE; 782 bi->bv_len = PAGE_SIZE;
398 } 783 }
399
400 spin_lock(&sdev->stat_lock);
401 ++sdev->stat.read_errors;
402 spin_unlock(&sdev->stat_lock);
403 goto out; 784 goto out;
404 } 785 }
405 for (i = 0; i < sbio->count; ++i) { 786 for (i = 0; i < sbio->count; ++i) {
@@ -420,8 +801,14 @@ static void scrub_checksum(struct btrfs_work *work)
420 WARN_ON(1); 801 WARN_ON(1);
421 } 802 }
422 kunmap_atomic(buffer, KM_USER0); 803 kunmap_atomic(buffer, KM_USER0);
423 if (ret) 804 if (ret) {
424 scrub_recheck_error(sbio, i); 805 ret = scrub_recheck_error(sbio, i);
806 if (!ret) {
807 spin_lock(&sdev->stat_lock);
808 ++sdev->stat.unverified_errors;
809 spin_unlock(&sdev->stat_lock);
810 }
811 }
425 } 812 }
426 813
427out: 814out:
@@ -557,57 +944,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
557static int scrub_submit(struct scrub_dev *sdev) 944static int scrub_submit(struct scrub_dev *sdev)
558{ 945{
559 struct scrub_bio *sbio; 946 struct scrub_bio *sbio;
560 struct bio *bio;
561 int i;
562 947
563 if (sdev->curr == -1) 948 if (sdev->curr == -1)
564 return 0; 949 return 0;
565 950
566 sbio = sdev->bios[sdev->curr]; 951 sbio = sdev->bios[sdev->curr];
567
568 bio = bio_alloc(GFP_NOFS, sbio->count);
569 if (!bio)
570 goto nomem;
571
572 bio->bi_private = sbio;
573 bio->bi_end_io = scrub_bio_end_io;
574 bio->bi_bdev = sdev->dev->bdev;
575 bio->bi_sector = sbio->physical >> 9;
576
577 for (i = 0; i < sbio->count; ++i) {
578 struct page *page;
579 int ret;
580
581 page = alloc_page(GFP_NOFS);
582 if (!page)
583 goto nomem;
584
585 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
586 if (!ret) {
587 __free_page(page);
588 goto nomem;
589 }
590 }
591
592 sbio->err = 0; 952 sbio->err = 0;
593 sdev->curr = -1; 953 sdev->curr = -1;
594 atomic_inc(&sdev->in_flight); 954 atomic_inc(&sdev->in_flight);
595 955
596 submit_bio(READ, bio); 956 submit_bio(READ, sbio->bio);
597 957
598 return 0; 958 return 0;
599
600nomem:
601 scrub_free_bio(bio);
602
603 return -ENOMEM;
604} 959}
605 960
606static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 961static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
607 u64 physical, u64 flags, u64 gen, u64 mirror_num, 962 u64 physical, u64 flags, u64 gen, int mirror_num,
608 u8 *csum, int force) 963 u8 *csum, int force)
609{ 964{
610 struct scrub_bio *sbio; 965 struct scrub_bio *sbio;
966 struct page *page;
967 int ret;
611 968
612again: 969again:
613 /* 970 /*
@@ -628,12 +985,22 @@ again:
628 } 985 }
629 sbio = sdev->bios[sdev->curr]; 986 sbio = sdev->bios[sdev->curr];
630 if (sbio->count == 0) { 987 if (sbio->count == 0) {
988 struct bio *bio;
989
631 sbio->physical = physical; 990 sbio->physical = physical;
632 sbio->logical = logical; 991 sbio->logical = logical;
992 bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
993 if (!bio)
994 return -ENOMEM;
995
996 bio->bi_private = sbio;
997 bio->bi_end_io = scrub_bio_end_io;
998 bio->bi_bdev = sdev->dev->bdev;
999 bio->bi_sector = sbio->physical >> 9;
1000 sbio->err = 0;
1001 sbio->bio = bio;
633 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || 1002 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
634 sbio->logical + sbio->count * PAGE_SIZE != logical) { 1003 sbio->logical + sbio->count * PAGE_SIZE != logical) {
635 int ret;
636
637 ret = scrub_submit(sdev); 1004 ret = scrub_submit(sdev);
638 if (ret) 1005 if (ret)
639 return ret; 1006 return ret;
@@ -643,6 +1010,20 @@ again:
643 sbio->spag[sbio->count].generation = gen; 1010 sbio->spag[sbio->count].generation = gen;
644 sbio->spag[sbio->count].have_csum = 0; 1011 sbio->spag[sbio->count].have_csum = 0;
645 sbio->spag[sbio->count].mirror_num = mirror_num; 1012 sbio->spag[sbio->count].mirror_num = mirror_num;
1013
1014 page = alloc_page(GFP_NOFS);
1015 if (!page)
1016 return -ENOMEM;
1017
1018 ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
1019 if (!ret) {
1020 __free_page(page);
1021 ret = scrub_submit(sdev);
1022 if (ret)
1023 return ret;
1024 goto again;
1025 }
1026
646 if (csum) { 1027 if (csum) {
647 sbio->spag[sbio->count].have_csum = 1; 1028 sbio->spag[sbio->count].have_csum = 1;
648 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); 1029 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
@@ -701,7 +1082,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
701 1082
702/* scrub extent tries to collect up to 64 kB for each bio */ 1083/* scrub extent tries to collect up to 64 kB for each bio */
703static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 1084static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
704 u64 physical, u64 flags, u64 gen, u64 mirror_num) 1085 u64 physical, u64 flags, u64 gen, int mirror_num)
705{ 1086{
706 int ret; 1087 int ret;
707 u8 csum[BTRFS_CSUM_SIZE]; 1088 u8 csum[BTRFS_CSUM_SIZE];
@@ -741,13 +1122,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
741 int slot; 1122 int slot;
742 int i; 1123 int i;
743 u64 nstripes; 1124 u64 nstripes;
744 int start_stripe;
745 struct extent_buffer *l; 1125 struct extent_buffer *l;
746 struct btrfs_key key; 1126 struct btrfs_key key;
747 u64 physical; 1127 u64 physical;
748 u64 logical; 1128 u64 logical;
749 u64 generation; 1129 u64 generation;
750 u64 mirror_num; 1130 int mirror_num;
1131 struct reada_control *reada1;
1132 struct reada_control *reada2;
1133 struct btrfs_key key_start;
1134 struct btrfs_key key_end;
751 1135
752 u64 increment = map->stripe_len; 1136 u64 increment = map->stripe_len;
753 u64 offset; 1137 u64 offset;
@@ -758,102 +1142,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
758 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 1142 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
759 offset = map->stripe_len * num; 1143 offset = map->stripe_len * num;
760 increment = map->stripe_len * map->num_stripes; 1144 increment = map->stripe_len * map->num_stripes;
761 mirror_num = 0; 1145 mirror_num = 1;
762 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 1146 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
763 int factor = map->num_stripes / map->sub_stripes; 1147 int factor = map->num_stripes / map->sub_stripes;
764 offset = map->stripe_len * (num / map->sub_stripes); 1148 offset = map->stripe_len * (num / map->sub_stripes);
765 increment = map->stripe_len * factor; 1149 increment = map->stripe_len * factor;
766 mirror_num = num % map->sub_stripes; 1150 mirror_num = num % map->sub_stripes + 1;
767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 1151 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
768 increment = map->stripe_len; 1152 increment = map->stripe_len;
769 mirror_num = num % map->num_stripes; 1153 mirror_num = num % map->num_stripes + 1;
770 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 1154 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
771 increment = map->stripe_len; 1155 increment = map->stripe_len;
772 mirror_num = num % map->num_stripes; 1156 mirror_num = num % map->num_stripes + 1;
773 } else { 1157 } else {
774 increment = map->stripe_len; 1158 increment = map->stripe_len;
775 mirror_num = 0; 1159 mirror_num = 1;
776 } 1160 }
777 1161
778 path = btrfs_alloc_path(); 1162 path = btrfs_alloc_path();
779 if (!path) 1163 if (!path)
780 return -ENOMEM; 1164 return -ENOMEM;
781 1165
782 path->reada = 2;
783 path->search_commit_root = 1; 1166 path->search_commit_root = 1;
784 path->skip_locking = 1; 1167 path->skip_locking = 1;
785 1168
786 /* 1169 /*
787 * find all extents for each stripe and just read them to get 1170 * trigger the readahead for extent tree csum tree and wait for
788 * them into the page cache 1171 * completion. During readahead, the scrub is officially paused
789 * FIXME: we can do better. build a more intelligent prefetching 1172 * to not hold off transaction commits
790 */ 1173 */
791 logical = base + offset; 1174 logical = base + offset;
792 physical = map->stripes[num].physical;
793 ret = 0;
794 for (i = 0; i < nstripes; ++i) {
795 key.objectid = logical;
796 key.type = BTRFS_EXTENT_ITEM_KEY;
797 key.offset = (u64)0;
798
799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
800 if (ret < 0)
801 goto out_noplug;
802
803 /*
804 * we might miss half an extent here, but that doesn't matter,
805 * as it's only the prefetch
806 */
807 while (1) {
808 l = path->nodes[0];
809 slot = path->slots[0];
810 if (slot >= btrfs_header_nritems(l)) {
811 ret = btrfs_next_leaf(root, path);
812 if (ret == 0)
813 continue;
814 if (ret < 0)
815 goto out_noplug;
816 1175
817 break; 1176 wait_event(sdev->list_wait,
818 } 1177 atomic_read(&sdev->in_flight) == 0);
819 btrfs_item_key_to_cpu(l, &key, slot); 1178 atomic_inc(&fs_info->scrubs_paused);
1179 wake_up(&fs_info->scrub_pause_wait);
820 1180
821 if (key.objectid >= logical + map->stripe_len) 1181 /* FIXME it might be better to start readahead at commit root */
822 break; 1182 key_start.objectid = logical;
1183 key_start.type = BTRFS_EXTENT_ITEM_KEY;
1184 key_start.offset = (u64)0;
1185 key_end.objectid = base + offset + nstripes * increment;
1186 key_end.type = BTRFS_EXTENT_ITEM_KEY;
1187 key_end.offset = (u64)0;
1188 reada1 = btrfs_reada_add(root, &key_start, &key_end);
1189
1190 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1191 key_start.type = BTRFS_EXTENT_CSUM_KEY;
1192 key_start.offset = logical;
1193 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1194 key_end.type = BTRFS_EXTENT_CSUM_KEY;
1195 key_end.offset = base + offset + nstripes * increment;
1196 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1197
1198 if (!IS_ERR(reada1))
1199 btrfs_reada_wait(reada1);
1200 if (!IS_ERR(reada2))
1201 btrfs_reada_wait(reada2);
823 1202
824 path->slots[0]++; 1203 mutex_lock(&fs_info->scrub_lock);
825 } 1204 while (atomic_read(&fs_info->scrub_pause_req)) {
826 btrfs_release_path(path); 1205 mutex_unlock(&fs_info->scrub_lock);
827 logical += increment; 1206 wait_event(fs_info->scrub_pause_wait,
828 physical += map->stripe_len; 1207 atomic_read(&fs_info->scrub_pause_req) == 0);
829 cond_resched(); 1208 mutex_lock(&fs_info->scrub_lock);
830 } 1209 }
1210 atomic_dec(&fs_info->scrubs_paused);
1211 mutex_unlock(&fs_info->scrub_lock);
1212 wake_up(&fs_info->scrub_pause_wait);
831 1213
832 /* 1214 /*
833 * collect all data csums for the stripe to avoid seeking during 1215 * collect all data csums for the stripe to avoid seeking during
834 * the scrub. This might currently (crc32) end up being about 1MB 1216 * the scrub. This might currently (crc32) end up being about 1MB
835 */ 1217 */
836 start_stripe = 0;
837 blk_start_plug(&plug); 1218 blk_start_plug(&plug);
838again:
839 logical = base + offset + start_stripe * increment;
840 for (i = start_stripe; i < nstripes; ++i) {
841 ret = btrfs_lookup_csums_range(csum_root, logical,
842 logical + map->stripe_len - 1,
843 &sdev->csum_list, 1);
844 if (ret)
845 goto out;
846 1219
847 logical += increment;
848 cond_resched();
849 }
850 /* 1220 /*
851 * now find all extents for each stripe and scrub them 1221 * now find all extents for each stripe and scrub them
852 */ 1222 */
853 logical = base + offset + start_stripe * increment; 1223 logical = base + offset;
854 physical = map->stripes[num].physical + start_stripe * map->stripe_len; 1224 physical = map->stripes[num].physical;
855 ret = 0; 1225 ret = 0;
856 for (i = start_stripe; i < nstripes; ++i) { 1226 for (i = 0; i < nstripes; ++i) {
857 /* 1227 /*
858 * canceled? 1228 * canceled?
859 */ 1229 */
@@ -882,11 +1252,14 @@ again:
882 atomic_dec(&fs_info->scrubs_paused); 1252 atomic_dec(&fs_info->scrubs_paused);
883 mutex_unlock(&fs_info->scrub_lock); 1253 mutex_unlock(&fs_info->scrub_lock);
884 wake_up(&fs_info->scrub_pause_wait); 1254 wake_up(&fs_info->scrub_pause_wait);
885 scrub_free_csums(sdev);
886 start_stripe = i;
887 goto again;
888 } 1255 }
889 1256
1257 ret = btrfs_lookup_csums_range(csum_root, logical,
1258 logical + map->stripe_len - 1,
1259 &sdev->csum_list, 1);
1260 if (ret)
1261 goto out;
1262
890 key.objectid = logical; 1263 key.objectid = logical;
891 key.type = BTRFS_EXTENT_ITEM_KEY; 1264 key.type = BTRFS_EXTENT_ITEM_KEY;
892 key.offset = (u64)0; 1265 key.offset = (u64)0;
@@ -982,7 +1355,6 @@ next:
982 1355
983out: 1356out:
984 blk_finish_plug(&plug); 1357 blk_finish_plug(&plug);
985out_noplug:
986 btrfs_free_path(path); 1358 btrfs_free_path(path);
987 return ret < 0 ? ret : 0; 1359 return ret < 0 ? ret : 0;
988} 1360}
@@ -1253,10 +1625,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1253 ret = scrub_enumerate_chunks(sdev, start, end); 1625 ret = scrub_enumerate_chunks(sdev, start, end);
1254 1626
1255 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 1627 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1256
1257 atomic_dec(&fs_info->scrubs_running); 1628 atomic_dec(&fs_info->scrubs_running);
1258 wake_up(&fs_info->scrub_pause_wait); 1629 wake_up(&fs_info->scrub_pause_wait);
1259 1630
1631 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
1632
1260 if (progress) 1633 if (progress)
1261 memcpy(progress, &sdev->stat, sizeof(*progress)); 1634 memcpy(progress, &sdev->stat, sizeof(*progress));
1262 1635
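
A side note on the scrub hunks above: mirror_num changes from u64 to int and becomes 1-based, so stripe index i of a btrfs_bio now corresponds to mirror number i + 1, while 0 keeps the meaning of "no preference". The helper below is purely illustrative (not part of the patch) and only spells out that convention, which both scrub and the new readahead code rely on.

/*
 * Illustration only: map a btrfs_bio stripe index to the 1-based
 * mirror number used by scrub_page()/reada after this change.
 * A mirror_num of 0 still means "let the mapping code pick a copy".
 */
static inline int stripe_index_to_mirror_num(int stripe_index)
{
	return stripe_index + 1;
}
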
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d7..17ee7fc5e64e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,7 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h>
43#include "compat.h" 44#include "compat.h"
44#include "delayed-inode.h" 45#include "delayed-inode.h"
45#include "ctree.h" 46#include "ctree.h"
@@ -58,6 +59,7 @@
58#include <trace/events/btrfs.h> 59#include <trace/events/btrfs.h>
59 60
60static const struct super_operations btrfs_super_ops; 61static const struct super_operations btrfs_super_ops;
62static struct file_system_type btrfs_fs_type;
61 63
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 64static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16]) 65 char nbuf[16])
@@ -162,7 +164,7 @@ enum {
162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err, 167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
166}; 168};
167 169
168static match_table_t tokens = { 170static match_table_t tokens = {
@@ -195,6 +197,8 @@ static match_table_t tokens = {
195 {Opt_subvolrootid, "subvolrootid=%d"}, 197 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"}, 198 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"}, 199 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "nospace_cache"},
201 {Opt_recovery, "recovery"},
198 {Opt_err, NULL}, 202 {Opt_err, NULL},
199}; 203};
200 204
@@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206{ 210{
207 struct btrfs_fs_info *info = root->fs_info; 211 struct btrfs_fs_info *info = root->fs_info;
208 substring_t args[MAX_OPT_ARGS]; 212 substring_t args[MAX_OPT_ARGS];
209 char *p, *num, *orig; 213 char *p, *num, *orig = NULL;
214 u64 cache_gen;
210 int intarg; 215 int intarg;
211 int ret = 0; 216 int ret = 0;
212 char *compress_type; 217 char *compress_type;
213 bool compress_force = false; 218 bool compress_force = false;
214 219
220 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
221 if (cache_gen)
222 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
223
215 if (!options) 224 if (!options)
216 return 0; 225 goto out;
217 226
218 /* 227 /*
219 * strsep changes the string, duplicate it because parse_options 228 * strsep changes the string, duplicate it because parse_options
@@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
360 btrfs_set_opt(info->mount_opt, DISCARD); 369 btrfs_set_opt(info->mount_opt, DISCARD);
361 break; 370 break;
362 case Opt_space_cache: 371 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 372 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break; 373 break;
374 case Opt_no_space_cache:
375 printk(KERN_INFO "btrfs: disabling disk space caching\n");
376 btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
377 break;
366 case Opt_inode_cache: 378 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 379 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 380 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
381 printk(KERN_INFO "btrfs: enabling auto defrag"); 393 printk(KERN_INFO "btrfs: enabling auto defrag");
382 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 394 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
383 break; 395 break;
396 case Opt_recovery:
397 printk(KERN_INFO "btrfs: enabling auto recovery");
398 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break;
384 case Opt_err: 400 case Opt_err:
385 printk(KERN_INFO "btrfs: unrecognized mount option " 401 printk(KERN_INFO "btrfs: unrecognized mount option "
386 "'%s'\n", p); 402 "'%s'\n", p);
@@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
391 } 407 }
392 } 408 }
393out: 409out:
410 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
411 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
394 kfree(orig); 412 kfree(orig);
395 return ret; 413 return ret;
396} 414}
@@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
406 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) 424 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
407{ 425{
408 substring_t args[MAX_OPT_ARGS]; 426 substring_t args[MAX_OPT_ARGS];
409 char *opts, *orig, *p; 427 char *device_name, *opts, *orig, *p;
410 int error = 0; 428 int error = 0;
411 int intarg; 429 int intarg;
412 430
413 if (!options) 431 if (!options)
414 goto out; 432 return 0;
415 433
416 /* 434 /*
417 * strsep changes the string, duplicate it because parse_options 435 * strsep changes the string, duplicate it because parse_options
@@ -430,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
430 token = match_token(p, tokens, args); 448 token = match_token(p, tokens, args);
431 switch (token) { 449 switch (token) {
432 case Opt_subvol: 450 case Opt_subvol:
451 kfree(*subvol_name);
433 *subvol_name = match_strdup(&args[0]); 452 *subvol_name = match_strdup(&args[0]);
434 break; 453 break;
435 case Opt_subvolid: 454 case Opt_subvolid:
@@ -457,29 +476,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
457 } 476 }
458 break; 477 break;
459 case Opt_device: 478 case Opt_device:
460 error = btrfs_scan_one_device(match_strdup(&args[0]), 479 device_name = match_strdup(&args[0]);
480 if (!device_name) {
481 error = -ENOMEM;
482 goto out;
483 }
484 error = btrfs_scan_one_device(device_name,
461 flags, holder, fs_devices); 485 flags, holder, fs_devices);
486 kfree(device_name);
462 if (error) 487 if (error)
463 goto out_free_opts; 488 goto out;
464 break; 489 break;
465 default: 490 default:
466 break; 491 break;
467 } 492 }
468 } 493 }
469 494
470 out_free_opts: 495out:
471 kfree(orig); 496 kfree(orig);
472 out:
473 /*
474 * If no subvolume name is specified we use the default one. Allocate
475 * a copy of the string "." here so that code later in the
476 * mount path doesn't care if it's the default volume or another one.
477 */
478 if (!*subvol_name) {
479 *subvol_name = kstrdup(".", GFP_KERNEL);
480 if (!*subvol_name)
481 return -ENOMEM;
482 }
483 return error; 497 return error;
484} 498}
485 499
@@ -492,7 +506,6 @@ static struct dentry *get_default_root(struct super_block *sb,
492 struct btrfs_path *path; 506 struct btrfs_path *path;
493 struct btrfs_key location; 507 struct btrfs_key location;
494 struct inode *inode; 508 struct inode *inode;
495 struct dentry *dentry;
496 u64 dir_id; 509 u64 dir_id;
497 int new = 0; 510 int new = 0;
498 511
@@ -517,7 +530,7 @@ static struct dentry *get_default_root(struct super_block *sb,
517 * will mount by default if we haven't been given a specific subvolume 530 * will mount by default if we haven't been given a specific subvolume
518 * to mount. 531 * to mount.
519 */ 532 */
520 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 533 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
521 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 534 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
522 if (IS_ERR(di)) { 535 if (IS_ERR(di)) {
523 btrfs_free_path(path); 536 btrfs_free_path(path);
@@ -566,29 +579,7 @@ setup_root:
566 return dget(sb->s_root); 579 return dget(sb->s_root);
567 } 580 }
568 581
569 if (new) { 582 return d_obtain_alias(inode);
570 const struct qstr name = { .name = "/", .len = 1 };
571
572 /*
573 * New inode, we need to make the dentry a sibling of s_root so
574 * everything gets cleaned up properly on unmount.
575 */
576 dentry = d_alloc(sb->s_root, &name);
577 if (!dentry) {
578 iput(inode);
579 return ERR_PTR(-ENOMEM);
580 }
581 d_splice_alias(inode, dentry);
582 } else {
583 /*
584 * We found the inode in cache, just find a dentry for it and
585 * put the reference to the inode we just got.
586 */
587 dentry = d_find_alias(inode);
588 iput(inode);
589 }
590
591 return dentry;
592} 583}
593 584
594static int btrfs_fill_super(struct super_block *sb, 585static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +710,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
719 seq_puts(seq, ",noacl"); 710 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE)) 711 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache"); 712 seq_puts(seq, ",space_cache");
713 else
714 seq_puts(seq, ",nospace_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE)) 715 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache"); 716 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 717 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +746,111 @@ static int btrfs_set_super(struct super_block *s, void *data)
753 return set_anon_super(s, data); 746 return set_anon_super(s, data);
754} 747}
755 748
749/*
750 * subvolumes are identified by ino 256
751 */
752static inline int is_subvolume_inode(struct inode *inode)
753{
754 if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
755 return 1;
756 return 0;
757}
758
759/*
760 * This will strip out the subvol=%s argument for an argument string and add
761 * subvolid=0 to make sure we get the actual tree root for path walking to the
762 * subvol we want.
763 */
764static char *setup_root_args(char *args)
765{
766 unsigned copied = 0;
767 unsigned len = strlen(args) + 2;
768 char *pos;
769 char *ret;
770
771 /*
772 * We need the same args as before, but minus
773 *
774 * subvol=a
775 *
776 * and add
777 *
778 * subvolid=0
779 *
780 * which is a difference of 2 characters, so we allocate strlen(args) +
781 * 2 characters.
782 */
783 ret = kzalloc(len * sizeof(char), GFP_NOFS);
784 if (!ret)
785 return NULL;
786 pos = strstr(args, "subvol=");
787
788 /* This shouldn't happen, but just in case.. */
789 if (!pos) {
790 kfree(ret);
791 return NULL;
792 }
793
794 /*
795 * The subvol=<> arg is not at the front of the string, copy everybody
796 * up to that into ret.
797 */
798 if (pos != args) {
799 *pos = '\0';
800 strcpy(ret, args);
801 copied += strlen(args);
802 pos++;
803 }
804
805 strncpy(ret + copied, "subvolid=0", len - copied);
806
807 /* Length of subvolid=0 */
808 copied += 10;
809
810 /*
811 * If there is no , after the subvol= option then we know there's no
812 * other options and we can just return.
813 */
814 pos = strchr(pos, ',');
815 if (!pos)
816 return ret;
817
818 /* Copy the rest of the arguments into our buffer */
819 strncpy(ret + copied, pos, len - copied);
820 copied += strlen(pos);
821
822 return ret;
823}
824
825static struct dentry *mount_subvol(const char *subvol_name, int flags,
826 const char *device_name, char *data)
827{
828 struct dentry *root;
829 struct vfsmount *mnt;
830 char *newargs;
831
832 newargs = setup_root_args(data);
833 if (!newargs)
834 return ERR_PTR(-ENOMEM);
835 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
836 newargs);
837 kfree(newargs);
838 if (IS_ERR(mnt))
839 return ERR_CAST(mnt);
840
841 root = mount_subtree(mnt, subvol_name);
842
843 if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
844 struct super_block *s = root->d_sb;
845 dput(root);
846 root = ERR_PTR(-EINVAL);
847 deactivate_locked_super(s);
848 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
849 subvol_name);
850 }
851
852 return root;
853}
756 854
757/* 855/*
758 * Find a superblock for the given device / mount point. 856 * Find a superblock for the given device / mount point.
@@ -767,7 +865,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
767 struct super_block *s; 865 struct super_block *s;
768 struct dentry *root; 866 struct dentry *root;
769 struct btrfs_fs_devices *fs_devices = NULL; 867 struct btrfs_fs_devices *fs_devices = NULL;
770 struct btrfs_root *tree_root = NULL;
771 struct btrfs_fs_info *fs_info = NULL; 868 struct btrfs_fs_info *fs_info = NULL;
772 fmode_t mode = FMODE_READ; 869 fmode_t mode = FMODE_READ;
773 char *subvol_name = NULL; 870 char *subvol_name = NULL;
@@ -781,21 +878,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
781 error = btrfs_parse_early_options(data, mode, fs_type, 878 error = btrfs_parse_early_options(data, mode, fs_type,
782 &subvol_name, &subvol_objectid, 879 &subvol_name, &subvol_objectid,
783 &subvol_rootid, &fs_devices); 880 &subvol_rootid, &fs_devices);
784 if (error) 881 if (error) {
882 kfree(subvol_name);
785 return ERR_PTR(error); 883 return ERR_PTR(error);
884 }
786 885
787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 886 if (subvol_name) {
788 if (error) 887 root = mount_subvol(subvol_name, flags, device_name, data);
789 goto error_free_subvol_name; 888 kfree(subvol_name);
889 return root;
890 }
790 891
791 error = btrfs_open_devices(fs_devices, mode, fs_type); 892 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
792 if (error) 893 if (error)
793 goto error_free_subvol_name; 894 return ERR_PTR(error);
794
795 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
796 error = -EACCES;
797 goto error_close_devices;
798 }
799 895
800 /* 896 /*
801 * Setup a dummy root and fs_info for test/set super. This is because 897 * Setup a dummy root and fs_info for test/set super. This is because
@@ -804,19 +900,40 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
804 * then open_ctree will properly initialize everything later. 900 * then open_ctree will properly initialize everything later.
805 */ 901 */
806 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); 902 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
807 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 903 if (!fs_info)
808 if (!fs_info || !tree_root) { 904 return ERR_PTR(-ENOMEM);
905
906 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
907 if (!fs_info->tree_root) {
809 error = -ENOMEM; 908 error = -ENOMEM;
810 goto error_close_devices; 909 goto error_fs_info;
811 } 910 }
812 fs_info->tree_root = tree_root; 911 fs_info->tree_root->fs_info = fs_info;
813 fs_info->fs_devices = fs_devices; 912 fs_info->fs_devices = fs_devices;
814 tree_root->fs_info = fs_info; 913
914 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
915 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
916 if (!fs_info->super_copy || !fs_info->super_for_commit) {
917 error = -ENOMEM;
918 goto error_fs_info;
919 }
920
921 error = btrfs_open_devices(fs_devices, mode, fs_type);
922 if (error)
923 goto error_fs_info;
924
925 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
926 error = -EACCES;
927 goto error_close_devices;
928 }
815 929
816 bdev = fs_devices->latest_bdev; 930 bdev = fs_devices->latest_bdev;
817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 931 s = sget(fs_type, btrfs_test_super, btrfs_set_super,
818 if (IS_ERR(s)) 932 fs_info->tree_root);
819 goto error_s; 933 if (IS_ERR(s)) {
934 error = PTR_ERR(s);
935 goto error_close_devices;
936 }
820 937
821 if (s->s_root) { 938 if (s->s_root) {
822 if ((flags ^ s->s_flags) & MS_RDONLY) { 939 if ((flags ^ s->s_flags) & MS_RDONLY) {
@@ -826,75 +943,35 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
826 } 943 }
827 944
828 btrfs_close_devices(fs_devices); 945 btrfs_close_devices(fs_devices);
829 kfree(fs_info); 946 free_fs_info(fs_info);
830 kfree(tree_root);
831 } else { 947 } else {
832 char b[BDEVNAME_SIZE]; 948 char b[BDEVNAME_SIZE];
833 949
834 s->s_flags = flags | MS_NOSEC; 950 s->s_flags = flags | MS_NOSEC;
835 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 951 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
952 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
836 error = btrfs_fill_super(s, fs_devices, data, 953 error = btrfs_fill_super(s, fs_devices, data,
837 flags & MS_SILENT ? 1 : 0); 954 flags & MS_SILENT ? 1 : 0);
838 if (error) { 955 if (error) {
839 deactivate_locked_super(s); 956 deactivate_locked_super(s);
840 goto error_free_subvol_name; 957 return ERR_PTR(error);
841 } 958 }
842 959
843 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
844 s->s_flags |= MS_ACTIVE; 960 s->s_flags |= MS_ACTIVE;
845 } 961 }
846 962
847 /* if they gave us a subvolume name bind mount into that */ 963 root = get_default_root(s, subvol_objectid);
848 if (strcmp(subvol_name, ".")) { 964 if (IS_ERR(root)) {
849 struct dentry *new_root; 965 deactivate_locked_super(s);
850 966 return root;
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
858 mutex_lock(&root->d_inode->i_mutex);
859 new_root = lookup_one_len(subvol_name, root,
860 strlen(subvol_name));
861 mutex_unlock(&root->d_inode->i_mutex);
862
863 if (IS_ERR(new_root)) {
864 dput(root);
865 deactivate_locked_super(s);
866 error = PTR_ERR(new_root);
867 goto error_free_subvol_name;
868 }
869 if (!new_root->d_inode) {
870 dput(root);
871 dput(new_root);
872 deactivate_locked_super(s);
873 error = -ENXIO;
874 goto error_free_subvol_name;
875 }
876 dput(root);
877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
885 } 967 }
886 968
887 kfree(subvol_name);
888 return root; 969 return root;
889 970
890error_s:
891 error = PTR_ERR(s);
892error_close_devices: 971error_close_devices:
893 btrfs_close_devices(fs_devices); 972 btrfs_close_devices(fs_devices);
894 kfree(fs_info); 973error_fs_info:
895 kfree(tree_root); 974 free_fs_info(fs_info);
896error_free_subvol_name:
897 kfree(subvol_name);
898 return ERR_PTR(error); 975 return ERR_PTR(error);
899} 976}
900 977
@@ -919,7 +996,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
919 if (root->fs_info->fs_devices->rw_devices == 0) 996 if (root->fs_info->fs_devices->rw_devices == 0)
920 return -EACCES; 997 return -EACCES;
921 998
922 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 999 if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
923 return -EINVAL; 1000 return -EINVAL;
924 1001
925 ret = btrfs_cleanup_fs_roots(root->fs_info); 1002 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -1085,7 +1162,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1085static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1162static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1086{ 1163{
1087 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1164 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
1088 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1165 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1089 struct list_head *head = &root->fs_info->space_info; 1166 struct list_head *head = &root->fs_info->space_info;
1090 struct btrfs_space_info *found; 1167 struct btrfs_space_info *found;
1091 u64 total_used = 0; 1168 u64 total_used = 0;
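setup_root_args() in the super.c hunks above rewrites the mount option string so that "subvol=<name>" is replaced by "subvolid=0" before mount_subvol() re-enters the filesystem via vfs_kern_mount(). A standalone userspace sketch of the same rewrite, using only libc; the function name rewrite_subvol_args() and the sample option string are invented for illustration and are not part of the commit:

/* Userspace sketch of the subvol= -> subvolid=0 rewrite; not kernel code. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *rewrite_subvol_args(const char *args)
{
        /* worst case: "subvol=" replaced by the longer "subvolid=0" */
        size_t len = strlen(args) + strlen("subvolid=0") + 1;
        char *out = calloc(1, len);
        const char *pos;

        if (!out)
                return NULL;
        pos = strstr(args, "subvol=");
        if (!pos) {                      /* nothing to strip */
                free(out);
                return NULL;
        }
        strncat(out, args, pos - args);  /* everything before "subvol=" */
        strcat(out, "subvolid=0");
        pos = strchr(pos, ',');          /* options after the stripped one */
        if (pos)
                strcat(out, pos);
        return out;
}

int main(void)
{
        char *s = rewrite_subvol_args("device=/dev/sdb,subvol=snap1,noatime");
        printf("%s\n", s ? s : "(no subvol= option)");
        free(s);
        return 0;
}

For the sample input this prints device=/dev/sdb,subvolid=0,noatime, mirroring how the kernel first mounts the tree root and then walks to the requested subvolume with mount_subtree().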
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a155..81376d94cd3c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
55 struct btrfs_transaction *cur_trans; 55 struct btrfs_transaction *cur_trans;
56 56
57 spin_lock(&root->fs_info->trans_lock); 57 spin_lock(&root->fs_info->trans_lock);
58loop:
58 if (root->fs_info->trans_no_join) { 59 if (root->fs_info->trans_no_join) {
59 if (!nofail) { 60 if (!nofail) {
60 spin_unlock(&root->fs_info->trans_lock); 61 spin_unlock(&root->fs_info->trans_lock);
@@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
75 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 76 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
76 if (!cur_trans) 77 if (!cur_trans)
77 return -ENOMEM; 78 return -ENOMEM;
79
78 spin_lock(&root->fs_info->trans_lock); 80 spin_lock(&root->fs_info->trans_lock);
79 if (root->fs_info->running_transaction) { 81 if (root->fs_info->running_transaction) {
82 /*
83 * someone started a transaction after we unlocked. Make sure
84 * to redo the trans_no_join checks above
85 */
80 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 86 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
81 cur_trans = root->fs_info->running_transaction; 87 cur_trans = root->fs_info->running_transaction;
82 atomic_inc(&cur_trans->use_count); 88 goto loop;
83 atomic_inc(&cur_trans->num_writers);
84 cur_trans->num_joined++;
85 spin_unlock(&root->fs_info->trans_lock);
86 return 0;
87 } 89 }
90
88 atomic_set(&cur_trans->num_writers, 1); 91 atomic_set(&cur_trans->num_writers, 1);
89 cur_trans->num_joined = 0; 92 cur_trans->num_joined = 0;
90 init_waitqueue_head(&cur_trans->writer_wait); 93 init_waitqueue_head(&cur_trans->writer_wait);
@@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
275 */ 278 */
276 if (num_items > 0 && root != root->fs_info->chunk_root) { 279 if (num_items > 0 && root != root->fs_info->chunk_root) {
277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 280 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278 ret = btrfs_block_rsv_add(NULL, root, 281 ret = btrfs_block_rsv_add(root,
279 &root->fs_info->trans_block_rsv, 282 &root->fs_info->trans_block_rsv,
280 num_bytes); 283 num_bytes);
281 if (ret) 284 if (ret)
@@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 421 struct btrfs_root *root)
419{ 422{
420 int ret; 423 int ret;
421 ret = btrfs_block_rsv_check(trans, root, 424
422 &root->fs_info->global_block_rsv, 0, 5); 425 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
423 return ret ? 1 : 0; 426 return ret ? 1 : 0;
424} 427}
425 428
@@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
427 struct btrfs_root *root) 430 struct btrfs_root *root)
428{ 431{
429 struct btrfs_transaction *cur_trans = trans->transaction; 432 struct btrfs_transaction *cur_trans = trans->transaction;
433 struct btrfs_block_rsv *rsv = trans->block_rsv;
430 int updates; 434 int updates;
431 435
432 smp_mb(); 436 smp_mb();
433 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 437 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
434 return 1; 438 return 1;
435 439
440 /*
441 * We need to do this in case we're deleting csums so the global block
442 * rsv get's used instead of the csum block rsv.
443 */
444 trans->block_rsv = NULL;
445
436 updates = trans->delayed_ref_updates; 446 updates = trans->delayed_ref_updates;
437 trans->delayed_ref_updates = 0; 447 trans->delayed_ref_updates = 0;
438 if (updates) 448 if (updates)
439 btrfs_run_delayed_refs(trans, root, updates); 449 btrfs_run_delayed_refs(trans, root, updates);
440 450
451 trans->block_rsv = rsv;
452
441 return should_end_transaction(trans, root); 453 return should_end_transaction(trans, root);
442} 454}
443 455
@@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
453 return 0; 465 return 0;
454 } 466 }
455 467
468 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL;
456 while (count < 4) { 470 while (count < 4) {
457 unsigned long cur = trans->delayed_ref_updates; 471 unsigned long cur = trans->delayed_ref_updates;
458 trans->delayed_ref_updates = 0; 472 trans->delayed_ref_updates = 0;
@@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
473 count++; 487 count++;
474 } 488 }
475 489
476 btrfs_trans_release_metadata(trans, root);
477
478 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 490 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
479 should_end_transaction(trans, root)) { 491 should_end_transaction(trans, root)) {
480 trans->transaction->blocked = 1; 492 trans->transaction->blocked = 1;
@@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562int btrfs_write_marked_extents(struct btrfs_root *root, 574int btrfs_write_marked_extents(struct btrfs_root *root,
563 struct extent_io_tree *dirty_pages, int mark) 575 struct extent_io_tree *dirty_pages, int mark)
564{ 576{
565 int ret;
566 int err = 0; 577 int err = 0;
567 int werr = 0; 578 int werr = 0;
568 struct page *page; 579 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
569 struct inode *btree_inode = root->fs_info->btree_inode;
570 u64 start = 0; 580 u64 start = 0;
571 u64 end; 581 u64 end;
572 unsigned long index;
573
574 while (1) {
575 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
576 mark);
577 if (ret)
578 break;
579 while (start <= end) {
580 cond_resched();
581
582 index = start >> PAGE_CACHE_SHIFT;
583 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
584 page = find_get_page(btree_inode->i_mapping, index);
585 if (!page)
586 continue;
587
588 btree_lock_page_hook(page);
589 if (!page->mapping) {
590 unlock_page(page);
591 page_cache_release(page);
592 continue;
593 }
594 582
595 if (PageWriteback(page)) { 583 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
596 if (PageDirty(page)) 584 mark)) {
597 wait_on_page_writeback(page); 585 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
598 else { 586 GFP_NOFS);
599 unlock_page(page); 587 err = filemap_fdatawrite_range(mapping, start, end);
600 page_cache_release(page); 588 if (err)
601 continue; 589 werr = err;
602 } 590 cond_resched();
603 } 591 start = end + 1;
604 err = write_one_page(page, 0);
605 if (err)
606 werr = err;
607 page_cache_release(page);
608 }
609 } 592 }
610 if (err) 593 if (err)
611 werr = err; 594 werr = err;
@@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
621int btrfs_wait_marked_extents(struct btrfs_root *root, 604int btrfs_wait_marked_extents(struct btrfs_root *root,
622 struct extent_io_tree *dirty_pages, int mark) 605 struct extent_io_tree *dirty_pages, int mark)
623{ 606{
624 int ret;
625 int err = 0; 607 int err = 0;
626 int werr = 0; 608 int werr = 0;
627 struct page *page; 609 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
628 struct inode *btree_inode = root->fs_info->btree_inode;
629 u64 start = 0; 610 u64 start = 0;
630 u64 end; 611 u64 end;
631 unsigned long index;
632 612
633 while (1) { 613 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
634 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 614 EXTENT_NEED_WAIT)) {
635 mark); 615 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
636 if (ret) 616 err = filemap_fdatawait_range(mapping, start, end);
637 break; 617 if (err)
638 618 werr = err;
639 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 619 cond_resched();
640 while (start <= end) { 620 start = end + 1;
641 index = start >> PAGE_CACHE_SHIFT;
642 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
643 page = find_get_page(btree_inode->i_mapping, index);
644 if (!page)
645 continue;
646 if (PageDirty(page)) {
647 btree_lock_page_hook(page);
648 wait_on_page_writeback(page);
649 err = write_one_page(page, 0);
650 if (err)
651 werr = err;
652 }
653 wait_on_page_writeback(page);
654 page_cache_release(page);
655 cond_resched();
656 }
657 } 621 }
658 if (err) 622 if (err)
659 werr = err; 623 werr = err;
@@ -673,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
673 637
674 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 638 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
675 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 639 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
676 return ret || ret2; 640
641 if (ret)
642 return ret;
643 if (ret2)
644 return ret2;
645 return 0;
677} 646}
678 647
679int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 648int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@@ -816,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
816 785
817 btrfs_save_ino_cache(root, trans); 786 btrfs_save_ino_cache(root, trans);
818 787
788 /* see comments in should_cow_block() */
789 root->force_cow = 0;
790 smp_wmb();
791
819 if (root->commit_root != root->node) { 792 if (root->commit_root != root->node) {
820 mutex_lock(&root->fs_commit_mutex); 793 mutex_lock(&root->fs_commit_mutex);
821 switch_commit_root(root); 794 switch_commit_root(root);
@@ -911,11 +884,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
911 } 884 }
912 885
913 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 886 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
914 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
915 887
916 if (to_reserve > 0) { 888 if (to_reserve > 0) {
917 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 889 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
918 to_reserve); 890 to_reserve);
919 if (ret) { 891 if (ret) {
920 pending->error = ret; 892 pending->error = ret;
921 goto fail; 893 goto fail;
@@ -979,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
979 btrfs_tree_unlock(old); 951 btrfs_tree_unlock(old);
980 free_extent_buffer(old); 952 free_extent_buffer(old);
981 953
954 /* see comments in should_cow_block() */
955 root->force_cow = 1;
956 smp_wmb();
957
982 btrfs_set_root_node(new_root_item, tmp); 958 btrfs_set_root_node(new_root_item, tmp);
983 /* record when the snapshot was created in key.offset */ 959 /* record when the snapshot was created in key.offset */
984 key.offset = trans->transid; 960 key.offset = trans->transid;
@@ -1002,7 +978,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1002 BUG_ON(IS_ERR(pending->snap)); 978 BUG_ON(IS_ERR(pending->snap));
1003 979
1004 btrfs_reloc_post_snapshot(trans, pending); 980 btrfs_reloc_post_snapshot(trans, pending);
1005 btrfs_orphan_post_snapshot(trans, pending);
1006fail: 981fail:
1007 kfree(new_root_item); 982 kfree(new_root_item);
1008 trans->block_rsv = rsv; 983 trans->block_rsv = rsv;
@@ -1032,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root)
1032 struct btrfs_root_item *root_item; 1007 struct btrfs_root_item *root_item;
1033 struct btrfs_super_block *super; 1008 struct btrfs_super_block *super;
1034 1009
1035 super = &root->fs_info->super_copy; 1010 super = root->fs_info->super_copy;
1036 1011
1037 root_item = &root->fs_info->chunk_root->root_item; 1012 root_item = &root->fs_info->chunk_root->root_item;
1038 super->chunk_root = root_item->bytenr; 1013 super->chunk_root = root_item->bytenr;
@@ -1043,7 +1018,7 @@ static void update_super_roots(struct btrfs_root *root)
1043 super->root = root_item->bytenr; 1018 super->root = root_item->bytenr;
1044 super->generation = root_item->generation; 1019 super->generation = root_item->generation;
1045 super->root_level = root_item->level; 1020 super->root_level = root_item->level;
1046 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 1021 if (btrfs_test_opt(root, SPACE_CACHE))
1047 super->cache_generation = root_item->generation; 1022 super->cache_generation = root_item->generation;
1048} 1023}
1049 1024
@@ -1168,14 +1143,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1168 1143
1169 btrfs_run_ordered_operations(root, 0); 1144 btrfs_run_ordered_operations(root, 0);
1170 1145
1146 btrfs_trans_release_metadata(trans, root);
1147 trans->block_rsv = NULL;
1148
1171 /* make a pass through all the delayed refs we have so far 1149 /* make a pass through all the delayed refs we have so far
1172 * any runnings procs may add more while we are here 1150 * any runnings procs may add more while we are here
1173 */ 1151 */
1174 ret = btrfs_run_delayed_refs(trans, root, 0); 1152 ret = btrfs_run_delayed_refs(trans, root, 0);
1175 BUG_ON(ret); 1153 BUG_ON(ret);
1176 1154
1177 btrfs_trans_release_metadata(trans, root);
1178
1179 cur_trans = trans->transaction; 1155 cur_trans = trans->transaction;
1180 /* 1156 /*
1181 * set the flushing flag so procs in this transaction have to 1157 * set the flushing flag so procs in this transaction have to
@@ -1341,12 +1317,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1341 update_super_roots(root); 1317 update_super_roots(root);
1342 1318
1343 if (!root->fs_info->log_root_recovering) { 1319 if (!root->fs_info->log_root_recovering) {
1344 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1320 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
1345 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); 1321 btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1346 } 1322 }
1347 1323
1348 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, 1324 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1349 sizeof(root->fs_info->super_copy)); 1325 sizeof(*root->fs_info->super_copy));
1350 1326
1351 trans->transaction->blocked = 0; 1327 trans->transaction->blocked = 0;
1352 spin_lock(&root->fs_info->trans_lock); 1328 spin_lock(&root->fs_info->trans_lock);
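One detail in the transaction.c hunks above: btrfs_write_and_wait_marked_extents() used to end with "return ret || ret2", which collapses any real error code (say -EIO) into 1; the commit returns the first non-zero value instead. A toy userspace comparison of the two return styles — old_style() and new_style() are names invented for the demo:

/* Why "return ret || ret2" loses the errno value; illustration only. */
#include <stdio.h>
#include <errno.h>

static int old_style(int ret, int ret2)
{
        return ret || ret2;            /* any failure becomes 1 */
}

static int new_style(int ret, int ret2)
{
        if (ret)
                return ret;            /* preserve the first error code */
        if (ret2)
                return ret2;
        return 0;
}

int main(void)
{
        printf("old: %d  new: %d\n", old_style(-EIO, 0), new_style(-EIO, 0));
        return 0;
}

This prints "old: 1  new: -5", so only the new form lets callers see the actual -EIO.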
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 0618aa39740b..3568374d419d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
276 struct walk_control *wc, u64 gen) 276 struct walk_control *wc, u64 gen)
277{ 277{
278 if (wc->pin) 278 if (wc->pin)
279 btrfs_pin_extent(log->fs_info->extent_root, 279 btrfs_pin_extent_for_log_replay(wc->trans,
280 eb->start, eb->len, 0); 280 log->fs_info->extent_root,
281 eb->start, eb->len);
281 282
282 if (btrfs_buffer_uptodate(eb, gen)) { 283 if (btrfs_buffer_uptodate(eb, gen)) {
283 if (wc->write) 284 if (wc->write)
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1760 1761
1761 WARN_ON(root_owner != 1762 WARN_ON(root_owner !=
1762 BTRFS_TREE_LOG_OBJECTID); 1763 BTRFS_TREE_LOG_OBJECTID);
1763 ret = btrfs_free_reserved_extent(root, 1764 ret = btrfs_free_and_pin_reserved_extent(root,
1764 bytenr, blocksize); 1765 bytenr, blocksize);
1765 BUG_ON(ret); 1766 BUG_ON(ret);
1766 } 1767 }
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1828 btrfs_tree_unlock(next); 1829 btrfs_tree_unlock(next);
1829 1830
1830 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1831 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1831 ret = btrfs_free_reserved_extent(root, 1832 ret = btrfs_free_and_pin_reserved_extent(root,
1832 path->nodes[*level]->start, 1833 path->nodes[*level]->start,
1833 path->nodes[*level]->len); 1834 path->nodes[*level]->len);
1834 BUG_ON(ret); 1835 BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1897 1898
1898 WARN_ON(log->root_key.objectid != 1899 WARN_ON(log->root_key.objectid !=
1899 BTRFS_TREE_LOG_OBJECTID); 1900 BTRFS_TREE_LOG_OBJECTID);
1900 ret = btrfs_free_reserved_extent(log, next->start, 1901 ret = btrfs_free_and_pin_reserved_extent(log, next->start,
1901 next->len); 1902 next->len);
1902 BUG_ON(ret); 1903 BUG_ON(ret);
1903 } 1904 }
@@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2013 /* wait for previous tree log sync to complete */ 2014 /* wait for previous tree log sync to complete */
2014 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2015 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2015 wait_log_commit(trans, root, root->log_transid - 1); 2016 wait_log_commit(trans, root, root->log_transid - 1);
2016
2017 while (1) { 2017 while (1) {
2018 unsigned long batch = root->log_batch; 2018 unsigned long batch = root->log_batch;
2019 if (root->log_multiple_pids) { 2019 /* when we're on an ssd, just kick the log commit out */
2020 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2020 mutex_unlock(&root->log_mutex); 2021 mutex_unlock(&root->log_mutex);
2021 schedule_timeout_uninterruptible(1); 2022 schedule_timeout_uninterruptible(1);
2022 mutex_lock(&root->log_mutex); 2023 mutex_lock(&root->log_mutex);
@@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2117 BUG_ON(ret); 2118 BUG_ON(ret);
2118 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2119 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2119 2120
2120 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2121 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2121 log_root_tree->node->start); 2122 log_root_tree->node->start);
2122 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2123 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2123 btrfs_header_level(log_root_tree->node)); 2124 btrfs_header_level(log_root_tree->node));
2124 2125
2125 log_root_tree->log_batch = 0; 2126 log_root_tree->log_batch = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da61..c37433d3cd82 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path,
366 } 366 }
367 INIT_LIST_HEAD(&device->dev_alloc_list); 367 INIT_LIST_HEAD(&device->dev_alloc_list);
368 368
369 /* init readahead state */
370 spin_lock_init(&device->reada_lock);
371 device->reada_curr_zone = NULL;
372 atomic_set(&device->reada_in_flight, 0);
373 device->reada_next = 0;
374 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
375 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
376
369 mutex_lock(&fs_devices->device_list_mutex); 377 mutex_lock(&fs_devices->device_list_mutex);
370 list_add_rcu(&device->dev_list, &fs_devices->devices); 378 list_add_rcu(&device->dev_list, &fs_devices->devices);
371 mutex_unlock(&fs_devices->device_list_mutex); 379 mutex_unlock(&fs_devices->device_list_mutex);
@@ -597,10 +605,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
597 set_blocksize(bdev, 4096); 605 set_blocksize(bdev, 4096);
598 606
599 bh = btrfs_read_dev_super(bdev); 607 bh = btrfs_read_dev_super(bdev);
600 if (!bh) { 608 if (!bh)
601 ret = -EINVAL;
602 goto error_close; 609 goto error_close;
603 }
604 610
605 disk_super = (struct btrfs_super_block *)bh->b_data; 611 disk_super = (struct btrfs_super_block *)bh->b_data;
606 devid = btrfs_stack_device_id(&disk_super->dev_item); 612 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +661,7 @@ error:
655 continue; 661 continue;
656 } 662 }
657 if (fs_devices->open_devices == 0) { 663 if (fs_devices->open_devices == 0) {
658 ret = -EIO; 664 ret = -EINVAL;
659 goto out; 665 goto out;
660 } 666 }
661 fs_devices->seeding = seeding; 667 fs_devices->seeding = seeding;
@@ -993,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
993 key.objectid = device->devid; 999 key.objectid = device->devid;
994 key.offset = start; 1000 key.offset = start;
995 key.type = BTRFS_DEV_EXTENT_KEY; 1001 key.type = BTRFS_DEV_EXTENT_KEY;
996 1002again:
997 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1003 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
998 if (ret > 0) { 1004 if (ret > 0) {
999 ret = btrfs_previous_item(root, path, key.objectid, 1005 ret = btrfs_previous_item(root, path, key.objectid,
@@ -1006,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1006 struct btrfs_dev_extent); 1012 struct btrfs_dev_extent);
1007 BUG_ON(found_key.offset > start || found_key.offset + 1013 BUG_ON(found_key.offset > start || found_key.offset +
1008 btrfs_dev_extent_length(leaf, extent) < start); 1014 btrfs_dev_extent_length(leaf, extent) < start);
1015 key = found_key;
1016 btrfs_release_path(path);
1017 goto again;
1009 } else if (ret == 0) { 1018 } else if (ret == 0) {
1010 leaf = path->nodes[0]; 1019 leaf = path->nodes[0];
1011 extent = btrfs_item_ptr(leaf, path->slots[0], 1020 extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1013,8 +1022,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1013 } 1022 }
1014 BUG_ON(ret); 1023 BUG_ON(ret);
1015 1024
1016 if (device->bytes_used > 0) 1025 if (device->bytes_used > 0) {
1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1026 u64 len = btrfs_dev_extent_length(leaf, extent);
1027 device->bytes_used -= len;
1028 spin_lock(&root->fs_info->free_chunk_lock);
1029 root->fs_info->free_chunk_space += len;
1030 spin_unlock(&root->fs_info->free_chunk_lock);
1031 }
1018 ret = btrfs_del_item(trans, root, path); 1032 ret = btrfs_del_item(trans, root, path);
1019 1033
1020out: 1034out:
@@ -1356,6 +1370,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1356 if (ret) 1370 if (ret)
1357 goto error_undo; 1371 goto error_undo;
1358 1372
1373 spin_lock(&root->fs_info->free_chunk_lock);
1374 root->fs_info->free_chunk_space = device->total_bytes -
1375 device->bytes_used;
1376 spin_unlock(&root->fs_info->free_chunk_lock);
1377
1359 device->in_fs_metadata = 0; 1378 device->in_fs_metadata = 0;
1360 btrfs_scrub_cancel_dev(root, device); 1379 btrfs_scrub_cancel_dev(root, device);
1361 1380
@@ -1387,8 +1406,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1387 call_rcu(&device->rcu, free_device); 1406 call_rcu(&device->rcu, free_device);
1388 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1407 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1389 1408
1390 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1409 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1391 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1410 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1392 1411
1393 if (cur_devices->open_devices == 0) { 1412 if (cur_devices->open_devices == 0) {
1394 struct btrfs_fs_devices *fs_devices; 1413 struct btrfs_fs_devices *fs_devices;
@@ -1450,7 +1469,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1450 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1469 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1451 struct btrfs_fs_devices *old_devices; 1470 struct btrfs_fs_devices *old_devices;
1452 struct btrfs_fs_devices *seed_devices; 1471 struct btrfs_fs_devices *seed_devices;
1453 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1472 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1454 struct btrfs_device *device; 1473 struct btrfs_device *device;
1455 u64 super_flags; 1474 u64 super_flags;
1456 1475
@@ -1691,15 +1710,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1691 root->fs_info->fs_devices->num_can_discard++; 1710 root->fs_info->fs_devices->num_can_discard++;
1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1711 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 1712
1713 spin_lock(&root->fs_info->free_chunk_lock);
1714 root->fs_info->free_chunk_space += device->total_bytes;
1715 spin_unlock(&root->fs_info->free_chunk_lock);
1716
1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1717 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 root->fs_info->fs_devices->rotating = 1; 1718 root->fs_info->fs_devices->rotating = 1;
1696 1719
1697 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1720 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1698 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1721 btrfs_set_super_total_bytes(root->fs_info->super_copy,
1699 total_bytes + device->total_bytes); 1722 total_bytes + device->total_bytes);
1700 1723
1701 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1724 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1702 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1725 btrfs_set_super_num_devices(root->fs_info->super_copy,
1703 total_bytes + 1); 1726 total_bytes + 1);
1704 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1727 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1705 1728
@@ -1790,7 +1813,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1790 struct btrfs_device *device, u64 new_size) 1813 struct btrfs_device *device, u64 new_size)
1791{ 1814{
1792 struct btrfs_super_block *super_copy = 1815 struct btrfs_super_block *super_copy =
1793 &device->dev_root->fs_info->super_copy; 1816 device->dev_root->fs_info->super_copy;
1794 u64 old_total = btrfs_super_total_bytes(super_copy); 1817 u64 old_total = btrfs_super_total_bytes(super_copy);
1795 u64 diff = new_size - device->total_bytes; 1818 u64 diff = new_size - device->total_bytes;
1796 1819
@@ -1849,7 +1872,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1872static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1850 chunk_offset) 1873 chunk_offset)
1851{ 1874{
1852 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1875 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1853 struct btrfs_disk_key *disk_key; 1876 struct btrfs_disk_key *disk_key;
1854 struct btrfs_chunk *chunk; 1877 struct btrfs_chunk *chunk;
1855 u8 *ptr; 1878 u8 *ptr;
@@ -2175,7 +2198,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2175 bool retried = false; 2198 bool retried = false;
2176 struct extent_buffer *l; 2199 struct extent_buffer *l;
2177 struct btrfs_key key; 2200 struct btrfs_key key;
2178 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2201 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2179 u64 old_total = btrfs_super_total_bytes(super_copy); 2202 u64 old_total = btrfs_super_total_bytes(super_copy);
2180 u64 old_size = device->total_bytes; 2203 u64 old_size = device->total_bytes;
2181 u64 diff = device->total_bytes - new_size; 2204 u64 diff = device->total_bytes - new_size;
@@ -2192,8 +2215,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2192 lock_chunks(root); 2215 lock_chunks(root);
2193 2216
2194 device->total_bytes = new_size; 2217 device->total_bytes = new_size;
2195 if (device->writeable) 2218 if (device->writeable) {
2196 device->fs_devices->total_rw_bytes -= diff; 2219 device->fs_devices->total_rw_bytes -= diff;
2220 spin_lock(&root->fs_info->free_chunk_lock);
2221 root->fs_info->free_chunk_space -= diff;
2222 spin_unlock(&root->fs_info->free_chunk_lock);
2223 }
2197 unlock_chunks(root); 2224 unlock_chunks(root);
2198 2225
2199again: 2226again:
@@ -2257,6 +2284,9 @@ again:
2257 device->total_bytes = old_size; 2284 device->total_bytes = old_size;
2258 if (device->writeable) 2285 if (device->writeable)
2259 device->fs_devices->total_rw_bytes += diff; 2286 device->fs_devices->total_rw_bytes += diff;
2287 spin_lock(&root->fs_info->free_chunk_lock);
2288 root->fs_info->free_chunk_space += diff;
2289 spin_unlock(&root->fs_info->free_chunk_lock);
2260 unlock_chunks(root); 2290 unlock_chunks(root);
2261 goto done; 2291 goto done;
2262 } 2292 }
@@ -2292,7 +2322,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2292 struct btrfs_key *key, 2322 struct btrfs_key *key,
2293 struct btrfs_chunk *chunk, int item_size) 2323 struct btrfs_chunk *chunk, int item_size)
2294{ 2324{
2295 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2325 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2296 struct btrfs_disk_key disk_key; 2326 struct btrfs_disk_key disk_key;
2297 u32 array_size; 2327 u32 array_size;
2298 u8 *ptr; 2328 u8 *ptr;
@@ -2615,6 +2645,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2615 index++; 2645 index++;
2616 } 2646 }
2617 2647
2648 spin_lock(&extent_root->fs_info->free_chunk_lock);
2649 extent_root->fs_info->free_chunk_space -= (stripe_size *
2650 map->num_stripes);
2651 spin_unlock(&extent_root->fs_info->free_chunk_lock);
2652
2618 index = 0; 2653 index = 0;
2619 stripe = &chunk->stripe; 2654 stripe = &chunk->stripe;
2620 while (index < map->num_stripes) { 2655 while (index < map->num_stripes) {
@@ -2848,7 +2883,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2848 2883
2849static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2884static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2850 u64 logical, u64 *length, 2885 u64 logical, u64 *length,
2851 struct btrfs_multi_bio **multi_ret, 2886 struct btrfs_bio **bbio_ret,
2852 int mirror_num) 2887 int mirror_num)
2853{ 2888{
2854 struct extent_map *em; 2889 struct extent_map *em;
@@ -2866,18 +2901,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2866 int i; 2901 int i;
2867 int num_stripes; 2902 int num_stripes;
2868 int max_errors = 0; 2903 int max_errors = 0;
2869 struct btrfs_multi_bio *multi = NULL; 2904 struct btrfs_bio *bbio = NULL;
2870 2905
2871 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2906 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2872 stripes_allocated = 1; 2907 stripes_allocated = 1;
2873again: 2908again:
2874 if (multi_ret) { 2909 if (bbio_ret) {
2875 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2910 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2876 GFP_NOFS); 2911 GFP_NOFS);
2877 if (!multi) 2912 if (!bbio)
2878 return -ENOMEM; 2913 return -ENOMEM;
2879 2914
2880 atomic_set(&multi->error, 0); 2915 atomic_set(&bbio->error, 0);
2881 } 2916 }
2882 2917
2883 read_lock(&em_tree->lock); 2918 read_lock(&em_tree->lock);
@@ -2898,7 +2933,7 @@ again:
2898 if (mirror_num > map->num_stripes) 2933 if (mirror_num > map->num_stripes)
2899 mirror_num = 0; 2934 mirror_num = 0;
2900 2935
2901 /* if our multi bio struct is too small, back off and try again */ 2936 /* if our btrfs_bio struct is too small, back off and try again */
2902 if (rw & REQ_WRITE) { 2937 if (rw & REQ_WRITE) {
2903 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2938 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2904 BTRFS_BLOCK_GROUP_DUP)) { 2939 BTRFS_BLOCK_GROUP_DUP)) {
@@ -2917,11 +2952,11 @@ again:
2917 stripes_required = map->num_stripes; 2952 stripes_required = map->num_stripes;
2918 } 2953 }
2919 } 2954 }
2920 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2955 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2921 stripes_allocated < stripes_required) { 2956 stripes_allocated < stripes_required) {
2922 stripes_allocated = map->num_stripes; 2957 stripes_allocated = map->num_stripes;
2923 free_extent_map(em); 2958 free_extent_map(em);
2924 kfree(multi); 2959 kfree(bbio);
2925 goto again; 2960 goto again;
2926 } 2961 }
2927 stripe_nr = offset; 2962 stripe_nr = offset;
@@ -2950,7 +2985,7 @@ again:
2950 *length = em->len - offset; 2985 *length = em->len - offset;
2951 } 2986 }
2952 2987
2953 if (!multi_ret) 2988 if (!bbio_ret)
2954 goto out; 2989 goto out;
2955 2990
2956 num_stripes = 1; 2991 num_stripes = 1;
@@ -2975,13 +3010,17 @@ again:
2975 stripe_index = find_live_mirror(map, 0, 3010 stripe_index = find_live_mirror(map, 0,
2976 map->num_stripes, 3011 map->num_stripes,
2977 current->pid % map->num_stripes); 3012 current->pid % map->num_stripes);
3013 mirror_num = stripe_index + 1;
2978 } 3014 }
2979 3015
2980 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3016 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2981 if (rw & (REQ_WRITE | REQ_DISCARD)) 3017 if (rw & (REQ_WRITE | REQ_DISCARD)) {
2982 num_stripes = map->num_stripes; 3018 num_stripes = map->num_stripes;
2983 else if (mirror_num) 3019 } else if (mirror_num) {
2984 stripe_index = mirror_num - 1; 3020 stripe_index = mirror_num - 1;
3021 } else {
3022 mirror_num = 1;
3023 }
2985 3024
2986 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3025 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2987 int factor = map->num_stripes / map->sub_stripes; 3026 int factor = map->num_stripes / map->sub_stripes;
@@ -3001,6 +3040,7 @@ again:
3001 stripe_index = find_live_mirror(map, stripe_index, 3040 stripe_index = find_live_mirror(map, stripe_index,
3002 map->sub_stripes, stripe_index + 3041 map->sub_stripes, stripe_index +
3003 current->pid % map->sub_stripes); 3042 current->pid % map->sub_stripes);
3043 mirror_num = stripe_index + 1;
3004 } 3044 }
3005 } else { 3045 } else {
3006 /* 3046 /*
@@ -3009,15 +3049,16 @@ again:
3009 * stripe_index is the number of our device in the stripe array 3049 * stripe_index is the number of our device in the stripe array
3010 */ 3050 */
3011 stripe_index = do_div(stripe_nr, map->num_stripes); 3051 stripe_index = do_div(stripe_nr, map->num_stripes);
3052 mirror_num = stripe_index + 1;
3012 } 3053 }
3013 BUG_ON(stripe_index >= map->num_stripes); 3054 BUG_ON(stripe_index >= map->num_stripes);
3014 3055
3015 if (rw & REQ_DISCARD) { 3056 if (rw & REQ_DISCARD) {
3016 for (i = 0; i < num_stripes; i++) { 3057 for (i = 0; i < num_stripes; i++) {
3017 multi->stripes[i].physical = 3058 bbio->stripes[i].physical =
3018 map->stripes[stripe_index].physical + 3059 map->stripes[stripe_index].physical +
3019 stripe_offset + stripe_nr * map->stripe_len; 3060 stripe_offset + stripe_nr * map->stripe_len;
3020 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3061 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3021 3062
3022 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3063 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3023 u64 stripes; 3064 u64 stripes;
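The __btrfs_map_block() hunks above start recording which copy of the data was chosen: each branch now sets mirror_num (generally stripe_index + 1, or simply 1 in the DUP read path), and a later hunk stores it in the btrfs_bio. A tiny userspace demo of the striping arithmetic that produces stripe_index; stripe_len, num_stripes and offset are made-up values:

/* Toy version of the stripe math behind stripe_index; values invented. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t stripe_len = 65536;       /* bytes per stripe */
        int num_stripes = 3;               /* stripes in the chunk */
        uint64_t offset = 300 * 1024;      /* logical offset into the chunk */

        uint64_t stripe_nr = offset / stripe_len;            /* nth stripe overall */
        uint64_t stripe_offset = offset - stripe_nr * stripe_len;
        int stripe_index = (int)(stripe_nr % num_stripes);   /* which device/copy */
        int mirror_num = stripe_index + 1;                   /* 1-based, as in the hunk */

        printf("stripe_nr=%llu stripe_index=%d stripe_offset=%llu mirror_num=%d\n",
               (unsigned long long)stripe_nr, stripe_index,
               (unsigned long long)stripe_offset, mirror_num);
        return 0;
}

Keeping mirror_num 1-based presumably leaves 0 free to mean "no specific copy requested", which is how the existing mirror_num parameter is treated earlier in the function.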
@@ -3038,16 +3079,16 @@ again:
3038 } 3079 }
3039 stripes = stripe_nr_end - 1 - j; 3080 stripes = stripe_nr_end - 1 - j;
3040 do_div(stripes, map->num_stripes); 3081 do_div(stripes, map->num_stripes);
3041 multi->stripes[i].length = map->stripe_len * 3082 bbio->stripes[i].length = map->stripe_len *
3042 (stripes - stripe_nr + 1); 3083 (stripes - stripe_nr + 1);
3043 3084
3044 if (i == 0) { 3085 if (i == 0) {
3045 multi->stripes[i].length -= 3086 bbio->stripes[i].length -=
3046 stripe_offset; 3087 stripe_offset;
3047 stripe_offset = 0; 3088 stripe_offset = 0;
3048 } 3089 }
3049 if (stripe_index == last_stripe) 3090 if (stripe_index == last_stripe)
3050 multi->stripes[i].length -= 3091 bbio->stripes[i].length -=
3051 stripe_end_offset; 3092 stripe_end_offset;
3052 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3093 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3053 u64 stripes; 3094 u64 stripes;
@@ -3072,11 +3113,11 @@ again:
3072 } 3113 }
3073 stripes = stripe_nr_end - 1 - j; 3114 stripes = stripe_nr_end - 1 - j;
3074 do_div(stripes, factor); 3115 do_div(stripes, factor);
3075 multi->stripes[i].length = map->stripe_len * 3116 bbio->stripes[i].length = map->stripe_len *
3076 (stripes - stripe_nr + 1); 3117 (stripes - stripe_nr + 1);
3077 3118
3078 if (i < map->sub_stripes) { 3119 if (i < map->sub_stripes) {
3079 multi->stripes[i].length -= 3120 bbio->stripes[i].length -=
3080 stripe_offset; 3121 stripe_offset;
3081 if (i == map->sub_stripes - 1) 3122 if (i == map->sub_stripes - 1)
3082 stripe_offset = 0; 3123 stripe_offset = 0;
@@ -3084,11 +3125,11 @@ again:
3084 if (stripe_index >= last_stripe && 3125 if (stripe_index >= last_stripe &&
3085 stripe_index <= (last_stripe + 3126 stripe_index <= (last_stripe +
3086 map->sub_stripes - 1)) { 3127 map->sub_stripes - 1)) {
3087 multi->stripes[i].length -= 3128 bbio->stripes[i].length -=
3088 stripe_end_offset; 3129 stripe_end_offset;
3089 } 3130 }
3090 } else 3131 } else
3091 multi->stripes[i].length = *length; 3132 bbio->stripes[i].length = *length;
3092 3133
3093 stripe_index++; 3134 stripe_index++;
3094 if (stripe_index == map->num_stripes) { 3135 if (stripe_index == map->num_stripes) {
@@ -3099,19 +3140,20 @@ again:
3099 } 3140 }
3100 } else { 3141 } else {
3101 for (i = 0; i < num_stripes; i++) { 3142 for (i = 0; i < num_stripes; i++) {
3102 multi->stripes[i].physical = 3143 bbio->stripes[i].physical =
3103 map->stripes[stripe_index].physical + 3144 map->stripes[stripe_index].physical +
3104 stripe_offset + 3145 stripe_offset +
3105 stripe_nr * map->stripe_len; 3146 stripe_nr * map->stripe_len;
3106 multi->stripes[i].dev = 3147 bbio->stripes[i].dev =
3107 map->stripes[stripe_index].dev; 3148 map->stripes[stripe_index].dev;
3108 stripe_index++; 3149 stripe_index++;
3109 } 3150 }
3110 } 3151 }
3111 if (multi_ret) { 3152 if (bbio_ret) {
3112 *multi_ret = multi; 3153 *bbio_ret = bbio;
3113 multi->num_stripes = num_stripes; 3154 bbio->num_stripes = num_stripes;
3114 multi->max_errors = max_errors; 3155 bbio->max_errors = max_errors;
3156 bbio->mirror_num = mirror_num;
3115 } 3157 }
3116out: 3158out:
3117 free_extent_map(em); 3159 free_extent_map(em);
@@ -3120,9 +3162,9 @@ out:
3120 3162
3121int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3163int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3122 u64 logical, u64 *length, 3164 u64 logical, u64 *length,
3123 struct btrfs_multi_bio **multi_ret, int mirror_num) 3165 struct btrfs_bio **bbio_ret, int mirror_num)
3124{ 3166{
3125 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3167 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3126 mirror_num); 3168 mirror_num);
3127} 3169}
3128 3170
@@ -3191,28 +3233,30 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3191 return 0; 3233 return 0;
3192} 3234}
3193 3235
3194static void end_bio_multi_stripe(struct bio *bio, int err) 3236static void btrfs_end_bio(struct bio *bio, int err)
3195{ 3237{
3196 struct btrfs_multi_bio *multi = bio->bi_private; 3238 struct btrfs_bio *bbio = bio->bi_private;
3197 int is_orig_bio = 0; 3239 int is_orig_bio = 0;
3198 3240
3199 if (err) 3241 if (err)
3200 atomic_inc(&multi->error); 3242 atomic_inc(&bbio->error);
3201 3243
3202 if (bio == multi->orig_bio) 3244 if (bio == bbio->orig_bio)
3203 is_orig_bio = 1; 3245 is_orig_bio = 1;
3204 3246
3205 if (atomic_dec_and_test(&multi->stripes_pending)) { 3247 if (atomic_dec_and_test(&bbio->stripes_pending)) {
3206 if (!is_orig_bio) { 3248 if (!is_orig_bio) {
3207 bio_put(bio); 3249 bio_put(bio);
3208 bio = multi->orig_bio; 3250 bio = bbio->orig_bio;
3209 } 3251 }
3210 bio->bi_private = multi->private; 3252 bio->bi_private = bbio->private;
3211 bio->bi_end_io = multi->end_io; 3253 bio->bi_end_io = bbio->end_io;
3254 bio->bi_bdev = (struct block_device *)
3255 (unsigned long)bbio->mirror_num;
3212 /* only send an error to the higher layers if it is 3256 /* only send an error to the higher layers if it is
3213 * beyond the tolerance of the multi-bio 3257 * beyond the tolerance of the multi-bio
3214 */ 3258 */
3215 if (atomic_read(&multi->error) > multi->max_errors) { 3259 if (atomic_read(&bbio->error) > bbio->max_errors) {
3216 err = -EIO; 3260 err = -EIO;
3217 } else if (err) { 3261 } else if (err) {
3218 /* 3262 /*
@@ -3222,7 +3266,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
3222 set_bit(BIO_UPTODATE, &bio->bi_flags); 3266 set_bit(BIO_UPTODATE, &bio->bi_flags);
3223 err = 0; 3267 err = 0;
3224 } 3268 }
3225 kfree(multi); 3269 kfree(bbio);
3226 3270
3227 bio_endio(bio, err); 3271 bio_endio(bio, err);
3228 } else if (!is_orig_bio) { 3272 } else if (!is_orig_bio) {
@@ -3302,20 +3346,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3302 u64 logical = (u64)bio->bi_sector << 9; 3346 u64 logical = (u64)bio->bi_sector << 9;
3303 u64 length = 0; 3347 u64 length = 0;
3304 u64 map_length; 3348 u64 map_length;
3305 struct btrfs_multi_bio *multi = NULL;
3306 int ret; 3349 int ret;
3307 int dev_nr = 0; 3350 int dev_nr = 0;
3308 int total_devs = 1; 3351 int total_devs = 1;
3352 struct btrfs_bio *bbio = NULL;
3309 3353
3310 length = bio->bi_size; 3354 length = bio->bi_size;
3311 map_tree = &root->fs_info->mapping_tree; 3355 map_tree = &root->fs_info->mapping_tree;
3312 map_length = length; 3356 map_length = length;
3313 3357
3314 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 3358 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
3315 mirror_num); 3359 mirror_num);
3316 BUG_ON(ret); 3360 BUG_ON(ret);
3317 3361
3318 total_devs = multi->num_stripes; 3362 total_devs = bbio->num_stripes;
3319 if (map_length < length) { 3363 if (map_length < length) {
3320 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3364 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3321 "len %llu\n", (unsigned long long)logical, 3365 "len %llu\n", (unsigned long long)logical,
@@ -3323,25 +3367,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3323 (unsigned long long)map_length); 3367 (unsigned long long)map_length);
3324 BUG(); 3368 BUG();
3325 } 3369 }
3326 multi->end_io = first_bio->bi_end_io; 3370
3327 multi->private = first_bio->bi_private; 3371 bbio->orig_bio = first_bio;
3328 multi->orig_bio = first_bio; 3372 bbio->private = first_bio->bi_private;
3329 atomic_set(&multi->stripes_pending, multi->num_stripes); 3373 bbio->end_io = first_bio->bi_end_io;
3374 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
3330 3375
3331 while (dev_nr < total_devs) { 3376 while (dev_nr < total_devs) {
3332 if (total_devs > 1) { 3377 if (dev_nr < total_devs - 1) {
3333 if (dev_nr < total_devs - 1) { 3378 bio = bio_clone(first_bio, GFP_NOFS);
3334 bio = bio_clone(first_bio, GFP_NOFS); 3379 BUG_ON(!bio);
3335 BUG_ON(!bio); 3380 } else {
3336 } else { 3381 bio = first_bio;
3337 bio = first_bio;
3338 }
3339 bio->bi_private = multi;
3340 bio->bi_end_io = end_bio_multi_stripe;
3341 } 3382 }
3342 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3383 bio->bi_private = bbio;
3343 dev = multi->stripes[dev_nr].dev; 3384 bio->bi_end_io = btrfs_end_bio;
3385 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
3386 dev = bbio->stripes[dev_nr].dev;
3344 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3387 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
 3388 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
3389 "(%s id %llu), size=%u\n", rw,
3390 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
3391 dev->name, dev->devid, bio->bi_size);
3345 bio->bi_bdev = dev->bdev; 3392 bio->bi_bdev = dev->bdev;
3346 if (async_submit) 3393 if (async_submit)
3347 schedule_bio(root, dev, rw, bio); 3394 schedule_bio(root, dev, rw, bio);
@@ -3354,8 +3401,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3354 } 3401 }
3355 dev_nr++; 3402 dev_nr++;
3356 } 3403 }
3357 if (total_devs == 1)
3358 kfree(multi);
3359 return 0; 3404 return 0;
3360} 3405}
3361 3406
@@ -3616,15 +3661,20 @@ static int read_one_dev(struct btrfs_root *root,
3616 fill_device_from_item(leaf, dev_item, device); 3661 fill_device_from_item(leaf, dev_item, device);
3617 device->dev_root = root->fs_info->dev_root; 3662 device->dev_root = root->fs_info->dev_root;
3618 device->in_fs_metadata = 1; 3663 device->in_fs_metadata = 1;
3619 if (device->writeable) 3664 if (device->writeable) {
3620 device->fs_devices->total_rw_bytes += device->total_bytes; 3665 device->fs_devices->total_rw_bytes += device->total_bytes;
3666 spin_lock(&root->fs_info->free_chunk_lock);
3667 root->fs_info->free_chunk_space += device->total_bytes -
3668 device->bytes_used;
3669 spin_unlock(&root->fs_info->free_chunk_lock);
3670 }
3621 ret = 0; 3671 ret = 0;
3622 return ret; 3672 return ret;
3623} 3673}
3624 3674
3625int btrfs_read_sys_array(struct btrfs_root *root) 3675int btrfs_read_sys_array(struct btrfs_root *root)
3626{ 3676{
3627 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3677 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3628 struct extent_buffer *sb; 3678 struct extent_buffer *sb;
3629 struct btrfs_disk_key *disk_key; 3679 struct btrfs_disk_key *disk_key;
3630 struct btrfs_chunk *chunk; 3680 struct btrfs_chunk *chunk;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6d866db4e177..78f2d4d4f37f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,20 @@ struct btrfs_device {
92 struct btrfs_work work; 92 struct btrfs_work work;
93 struct rcu_head rcu; 93 struct rcu_head rcu;
94 struct work_struct rcu_work; 94 struct work_struct rcu_work;
95
96 /* readahead state */
97 spinlock_t reada_lock;
98 atomic_t reada_in_flight;
99 u64 reada_next;
100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents;
103
104 /* for sending down flush barriers */
105 struct bio *flush_bio;
106 struct completion flush_wait;
107 int nobarriers;
108
95}; 109};
96 110
97struct btrfs_fs_devices { 111struct btrfs_fs_devices {
@@ -136,7 +150,10 @@ struct btrfs_bio_stripe {
136 u64 length; /* only used for discard mappings */ 150 u64 length; /* only used for discard mappings */
137}; 151};
138 152
139struct btrfs_multi_bio { 153struct btrfs_bio;
154typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
155
156struct btrfs_bio {
140 atomic_t stripes_pending; 157 atomic_t stripes_pending;
141 bio_end_io_t *end_io; 158 bio_end_io_t *end_io;
142 struct bio *orig_bio; 159 struct bio *orig_bio;
@@ -144,6 +161,7 @@ struct btrfs_multi_bio {
144 atomic_t error; 161 atomic_t error;
145 int max_errors; 162 int max_errors;
146 int num_stripes; 163 int num_stripes;
164 int mirror_num;
147 struct btrfs_bio_stripe stripes[]; 165 struct btrfs_bio_stripe stripes[];
148}; 166};
149 167
@@ -171,7 +189,7 @@ struct map_lookup {
171int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 189int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
172 u64 end, u64 *length); 190 u64 end, u64 *length);
173 191
174#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 192#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
175 (sizeof(struct btrfs_bio_stripe) * (n))) 193 (sizeof(struct btrfs_bio_stripe) * (n)))
176 194
177int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 195int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -180,7 +198,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
180 u64 chunk_offset, u64 start, u64 num_bytes); 198 u64 chunk_offset, u64 start, u64 num_bytes);
181int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 199int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
182 u64 logical, u64 *length, 200 u64 logical, u64 *length,
183 struct btrfs_multi_bio **multi_ret, int mirror_num); 201 struct btrfs_bio **bbio_ret, int mirror_num);
184int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 202int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
185 u64 chunk_start, u64 physical, u64 devid, 203 u64 chunk_start, u64 physical, u64 devid,
186 u64 **logical, int *naddrs, int *stripe_len); 204 u64 **logical, int *naddrs, int *stripe_len);
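For orientation, a minimal sketch (not part of the patch; the function name, variable names and call context are assumptions) of how a caller sees the renamed structure after this change: btrfs_map_block() hands back an allocated struct btrfs_bio, now carrying the chosen copy in the new mirror_num field, and a mapping-only caller frees it itself.

    /* illustrative only */
    static int show_mapping(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
    {
            struct btrfs_bio *bbio = NULL;
            int ret;

            ret = btrfs_map_block(map_tree, READ, logical, &len, &bbio, 0);
            if (ret)
                    return ret;

            pr_debug("logical %llu spans %d stripe(s), mirror %d\n",
                     (unsigned long long)logical,
                     bbio->num_stripes, bbio->mirror_num);

            kfree(bbio);    /* callers that only asked for the mapping free the btrfs_bio */
            return 0;
    }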
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 426aa464f1af..3848b04e310e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
127again: 127again:
128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), 128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
129 name, name_len, value, size); 129 name, name_len, value, size);
130 /*
131 * If we're setting an xattr to a new value but the new value is say
132 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
133 * back from split_leaf. This is because it thinks we'll be extending
134 * the existing item size, but we're asking for enough space to add the
135 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
136 * the rest of the function figure it out.
137 */
138 if (ret == -EOVERFLOW)
139 ret = -EEXIST;
140
130 if (ret == -EEXIST) { 141 if (ret == -EEXIST) {
131 if (flags & XATTR_CREATE) 142 if (flags & XATTR_CREATE)
132 goto out; 143 goto out;
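For context, a minimal user-space reproduction of the situation the new comment describes might look like the sketch below; the path and the 3900-byte size are assumptions (the size only stands in for something close to BTRFS_MAX_XATTR_SIZE on a 4K-leaf filesystem).

    #include <string.h>
    #include <sys/xattr.h>

    int main(void)
    {
            char value[3900];       /* assumed to approximate BTRFS_MAX_XATTR_SIZE */

            memset(value, 'a', sizeof(value));
            /* create, then replace with a maximum-sized value; before this fix
             * the replace could fail with EOVERFLOW from split_leaf instead of
             * taking the -EEXIST replacement path */
            setxattr("/mnt/btrfs/file", "user.big", value, sizeof(value), 0);
            return setxattr("/mnt/btrfs/file", "user.big", value, sizeof(value), 0);
    }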
diff --git a/fs/buffer.c b/fs/buffer.c
index 70a19745cb61..19d8eb7fdc81 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -288,7 +288,7 @@ static void free_more_memory(void)
288 struct zone *zone; 288 struct zone *zone;
289 int nid; 289 int nid;
290 290
291 wakeup_flusher_threads(1024); 291 wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
292 yield(); 292 yield();
293 293
294 for_each_online_node(nid) { 294 for_each_online_node(nid) {
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 15b21e35078a..0f327c6c9679 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -487,17 +487,15 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
487 ci->i_rdcache_gen++; 487 ci->i_rdcache_gen++;
488 488
489 /* 489 /*
490 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we 490 * if we are newly issued FILE_SHARED, clear D_COMPLETE; we
491 * don't know what happened to this directory while we didn't 491 * don't know what happened to this directory while we didn't
492 * have the cap. 492 * have the cap.
493 */ 493 */
494 if ((issued & CEPH_CAP_FILE_SHARED) && 494 if ((issued & CEPH_CAP_FILE_SHARED) &&
495 (had & CEPH_CAP_FILE_SHARED) == 0) { 495 (had & CEPH_CAP_FILE_SHARED) == 0) {
496 ci->i_shared_gen++; 496 ci->i_shared_gen++;
497 if (S_ISDIR(ci->vfs_inode.i_mode)) { 497 if (S_ISDIR(ci->vfs_inode.i_mode))
498 dout(" marking %p NOT complete\n", &ci->vfs_inode); 498 ceph_dir_clear_complete(&ci->vfs_inode);
499 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
500 }
501 } 499 }
502} 500}
503 501
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 382abc9a6a54..bca3948e9dbf 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -108,7 +108,7 @@ static unsigned fpos_off(loff_t p)
108 * falling back to a "normal" sync readdir if any dentries in the dir 108 * falling back to a "normal" sync readdir if any dentries in the dir
109 * are dropped. 109 * are dropped.
110 * 110 *
 111 * I_COMPLETE indicates we have all dentries in the dir. It is 111 * D_COMPLETE indicates we have all dentries in the dir. It is
112 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 112 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
113 * the MDS if/when the directory is modified). 113 * the MDS if/when the directory is modified).
114 */ 114 */
@@ -199,8 +199,8 @@ more:
199 filp->f_pos++; 199 filp->f_pos++;
200 200
201 /* make sure a dentry wasn't dropped while we didn't have parent lock */ 201 /* make sure a dentry wasn't dropped while we didn't have parent lock */
202 if (!ceph_i_test(dir, CEPH_I_COMPLETE)) { 202 if (!ceph_dir_test_complete(dir)) {
203 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); 203 dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
204 err = -EAGAIN; 204 err = -EAGAIN;
205 goto out; 205 goto out;
206 } 206 }
@@ -285,7 +285,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
285 if ((filp->f_pos == 2 || fi->dentry) && 285 if ((filp->f_pos == 2 || fi->dentry) &&
286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 286 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
287 ceph_snap(inode) != CEPH_SNAPDIR && 287 ceph_snap(inode) != CEPH_SNAPDIR &&
288 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 288 ceph_dir_test_complete(inode) &&
289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 289 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
290 spin_unlock(&inode->i_lock); 290 spin_unlock(&inode->i_lock);
291 err = __dcache_readdir(filp, dirent, filldir); 291 err = __dcache_readdir(filp, dirent, filldir);
@@ -351,7 +351,7 @@ more:
351 351
352 if (!req->r_did_prepopulate) { 352 if (!req->r_did_prepopulate) {
353 dout("readdir !did_prepopulate"); 353 dout("readdir !did_prepopulate");
354 fi->dir_release_count--; /* preclude I_COMPLETE */ 354 fi->dir_release_count--; /* preclude D_COMPLETE */
355 } 355 }
356 356
357 /* note next offset and last dentry name */ 357 /* note next offset and last dentry name */
@@ -430,8 +430,7 @@ more:
430 */ 430 */
431 spin_lock(&inode->i_lock); 431 spin_lock(&inode->i_lock);
432 if (ci->i_release_count == fi->dir_release_count) { 432 if (ci->i_release_count == fi->dir_release_count) {
433 dout(" marking %p complete\n", inode); 433 ceph_dir_set_complete(inode);
434 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
435 ci->i_max_offset = filp->f_pos; 434 ci->i_max_offset = filp->f_pos;
436 } 435 }
437 spin_unlock(&inode->i_lock); 436 spin_unlock(&inode->i_lock);
@@ -614,7 +613,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
614 fsc->mount_options->snapdir_name, 613 fsc->mount_options->snapdir_name,
615 dentry->d_name.len) && 614 dentry->d_name.len) &&
616 !is_root_ceph_dentry(dir, dentry) && 615 !is_root_ceph_dentry(dir, dentry) &&
617 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 616 ceph_dir_test_complete(dir) &&
618 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 617 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
619 spin_unlock(&dir->i_lock); 618 spin_unlock(&dir->i_lock);
620 dout(" dir %p complete, -ENOENT\n", dir); 619 dout(" dir %p complete, -ENOENT\n", dir);
@@ -934,7 +933,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
934 */ 933 */
935 934
936 /* d_move screws up d_subdirs order */ 935 /* d_move screws up d_subdirs order */
937 ceph_i_clear(new_dir, CEPH_I_COMPLETE); 936 ceph_dir_clear_complete(new_dir);
938 937
939 d_move(old_dentry, new_dentry); 938 d_move(old_dentry, new_dentry);
940 939
@@ -1092,7 +1091,75 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1092 return 1; 1091 return 1;
1093} 1092}
1094 1093
1094/*
1095 * Set/clear/test dir complete flag on the dir's dentry.
1096 */
1097static struct dentry * __d_find_any_alias(struct inode *inode)
1098{
1099 struct dentry *alias;
1100
1101 if (list_empty(&inode->i_dentry))
1102 return NULL;
1103 alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
1104 return alias;
1105}
1106
1107void ceph_dir_set_complete(struct inode *inode)
1108{
1109 struct dentry *dentry = __d_find_any_alias(inode);
1110
1111 if (dentry && ceph_dentry(dentry)) {
1112 dout(" marking %p (%p) complete\n", inode, dentry);
1113 set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1114 }
1115}
1116
1117void ceph_dir_clear_complete(struct inode *inode)
1118{
1119 struct dentry *dentry = __d_find_any_alias(inode);
1095 1120
1121 if (dentry && ceph_dentry(dentry)) {
1122 dout(" marking %p (%p) NOT complete\n", inode, dentry);
1123 clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1124 }
1125}
1126
1127bool ceph_dir_test_complete(struct inode *inode)
1128{
1129 struct dentry *dentry = __d_find_any_alias(inode);
1130
1131 if (dentry && ceph_dentry(dentry))
1132 return test_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
1133 return false;
1134}
1135
1136/*
1137 * When the VFS prunes a dentry from the cache, we need to clear the
1138 * complete flag on the parent directory.
1139 *
1140 * Called under dentry->d_lock.
1141 */
1142static void ceph_d_prune(struct dentry *dentry)
1143{
1144 struct ceph_dentry_info *di;
1145
1146 dout("ceph_d_prune %p\n", dentry);
1147
1148 /* do we have a valid parent? */
1149 if (!dentry->d_parent || IS_ROOT(dentry))
1150 return;
1151
1152 /* if we are not hashed, we don't affect D_COMPLETE */
1153 if (d_unhashed(dentry))
1154 return;
1155
1156 /*
1157 * we hold d_lock, so d_parent is stable, and d_fsdata is never
1158 * cleared until d_release
1159 */
1160 di = ceph_dentry(dentry->d_parent);
1161 clear_bit(CEPH_D_COMPLETE, &di->flags);
1162}
1096 1163
1097/* 1164/*
1098 * read() on a dir. This weird interface hack only works if mounted 1165 * read() on a dir. This weird interface hack only works if mounted
@@ -1306,6 +1373,7 @@ const struct inode_operations ceph_dir_iops = {
1306const struct dentry_operations ceph_dentry_ops = { 1373const struct dentry_operations ceph_dentry_ops = {
1307 .d_revalidate = ceph_d_revalidate, 1374 .d_revalidate = ceph_d_revalidate,
1308 .d_release = ceph_d_release, 1375 .d_release = ceph_d_release,
1376 .d_prune = ceph_d_prune,
1309}; 1377};
1310 1378
1311const struct dentry_operations ceph_snapdir_dentry_ops = { 1379const struct dentry_operations ceph_snapdir_dentry_ops = {
@@ -1315,4 +1383,5 @@ const struct dentry_operations ceph_snapdir_dentry_ops = {
1315 1383
1316const struct dentry_operations ceph_snap_dentry_ops = { 1384const struct dentry_operations ceph_snap_dentry_ops = {
1317 .d_release = ceph_d_release, 1385 .d_release = ceph_d_release,
1386 .d_prune = ceph_d_prune,
1318}; 1387};
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1616a0d37cbd..116f36502f17 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -771,9 +771,9 @@ no_change:
771 ceph_snap(inode) == CEPH_NOSNAP && 771 ceph_snap(inode) == CEPH_NOSNAP &&
772 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && 772 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
773 (issued & CEPH_CAP_FILE_EXCL) == 0 && 773 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
774 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 774 !ceph_dir_test_complete(inode)) {
775 dout(" marking %p complete (empty)\n", inode); 775 dout(" marking %p complete (empty)\n", inode);
776 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ 776 ceph_dir_set_complete(inode);
777 ci->i_max_offset = 2; 777 ci->i_max_offset = 2;
778 } 778 }
779 779
@@ -856,7 +856,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
856 di = ceph_dentry(dn); 856 di = ceph_dentry(dn);
857 857
858 spin_lock(&inode->i_lock); 858 spin_lock(&inode->i_lock);
859 if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 859 if (!ceph_dir_test_complete(inode)) {
860 spin_unlock(&inode->i_lock); 860 spin_unlock(&inode->i_lock);
861 return; 861 return;
862 } 862 }
@@ -1056,7 +1056,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1056 * d_move() puts the renamed dentry at the end of 1056 * d_move() puts the renamed dentry at the end of
1057 * d_subdirs. We need to assign it an appropriate 1057 * d_subdirs. We need to assign it an appropriate
1058 * directory offset so we can behave when holding 1058 * directory offset so we can behave when holding
1059 * I_COMPLETE. 1059 * D_COMPLETE.
1060 */ 1060 */
1061 ceph_set_dentry_offset(req->r_old_dentry); 1061 ceph_set_dentry_offset(req->r_old_dentry);
1062 dout("dn %p gets new offset %lld\n", req->r_old_dentry, 1062 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
@@ -1328,12 +1328,13 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
1328 */ 1328 */
1329void ceph_queue_writeback(struct inode *inode) 1329void ceph_queue_writeback(struct inode *inode)
1330{ 1330{
1331 ihold(inode);
1331 if (queue_work(ceph_inode_to_client(inode)->wb_wq, 1332 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1332 &ceph_inode(inode)->i_wb_work)) { 1333 &ceph_inode(inode)->i_wb_work)) {
1333 dout("ceph_queue_writeback %p\n", inode); 1334 dout("ceph_queue_writeback %p\n", inode);
1334 ihold(inode);
1335 } else { 1335 } else {
1336 dout("ceph_queue_writeback %p failed\n", inode); 1336 dout("ceph_queue_writeback %p failed\n", inode);
1337 iput(inode);
1337 } 1338 }
1338} 1339}
1339 1340
@@ -1353,12 +1354,13 @@ static void ceph_writeback_work(struct work_struct *work)
1353 */ 1354 */
1354void ceph_queue_invalidate(struct inode *inode) 1355void ceph_queue_invalidate(struct inode *inode)
1355{ 1356{
1357 ihold(inode);
1356 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, 1358 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1357 &ceph_inode(inode)->i_pg_inv_work)) { 1359 &ceph_inode(inode)->i_pg_inv_work)) {
1358 dout("ceph_queue_invalidate %p\n", inode); 1360 dout("ceph_queue_invalidate %p\n", inode);
1359 ihold(inode);
1360 } else { 1361 } else {
1361 dout("ceph_queue_invalidate %p failed\n", inode); 1362 dout("ceph_queue_invalidate %p failed\n", inode);
1363 iput(inode);
1362 } 1364 }
1363} 1365}
1364 1366
@@ -1434,13 +1436,14 @@ void ceph_queue_vmtruncate(struct inode *inode)
1434{ 1436{
1435 struct ceph_inode_info *ci = ceph_inode(inode); 1437 struct ceph_inode_info *ci = ceph_inode(inode);
1436 1438
1439 ihold(inode);
1437 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, 1440 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1438 &ci->i_vmtruncate_work)) { 1441 &ci->i_vmtruncate_work)) {
1439 dout("ceph_queue_vmtruncate %p\n", inode); 1442 dout("ceph_queue_vmtruncate %p\n", inode);
1440 ihold(inode);
1441 } else { 1443 } else {
1442 dout("ceph_queue_vmtruncate %p failed, pending=%d\n", 1444 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1443 inode, ci->i_truncate_pending); 1445 inode, ci->i_truncate_pending);
1446 iput(inode);
1444 } 1447 }
1445} 1448}
1446 1449
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1d72f15fe9f4..264ab701154f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -619,7 +619,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
619 * 619 *
620 * Called under mdsc->mutex. 620 * Called under mdsc->mutex.
621 */ 621 */
622struct dentry *get_nonsnap_parent(struct dentry *dentry) 622static struct dentry *get_nonsnap_parent(struct dentry *dentry)
623{ 623{
624 /* 624 /*
625 * we don't need to worry about protecting the d_parent access 625 * we don't need to worry about protecting the d_parent access
@@ -2002,7 +2002,7 @@ out:
2002} 2002}
2003 2003
2004/* 2004/*
2005 * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS 2005 * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS
2006 * namespace request. 2006 * namespace request.
2007 */ 2007 */
2008void ceph_invalidate_dir_request(struct ceph_mds_request *req) 2008void ceph_invalidate_dir_request(struct ceph_mds_request *req)
@@ -2010,9 +2010,9 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
2010 struct inode *inode = req->r_locked_dir; 2010 struct inode *inode = req->r_locked_dir;
2011 struct ceph_inode_info *ci = ceph_inode(inode); 2011 struct ceph_inode_info *ci = ceph_inode(inode);
2012 2012
2013 dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); 2013 dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode);
2014 spin_lock(&inode->i_lock); 2014 spin_lock(&inode->i_lock);
2015 ci->i_ceph_flags &= ~CEPH_I_COMPLETE; 2015 ceph_dir_clear_complete(inode);
2016 ci->i_release_count++; 2016 ci->i_release_count++;
2017 spin_unlock(&inode->i_lock); 2017 spin_unlock(&inode->i_lock);
2018 2018
@@ -3154,7 +3154,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3154/* 3154/*
3155 * true if all sessions are closed, or we force unmount 3155 * true if all sessions are closed, or we force unmount
3156 */ 3156 */
3157bool done_closing_sessions(struct ceph_mds_client *mdsc) 3157static bool done_closing_sessions(struct ceph_mds_client *mdsc)
3158{ 3158{
3159 int i, n = 0; 3159 int i, n = 0;
3160 3160
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 788f5ad8e66d..8dc73a594a90 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -426,7 +426,7 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
426/* 426/*
427 * create a new fs client 427 * create a new fs client
428 */ 428 */
429struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, 429static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
430 struct ceph_options *opt) 430 struct ceph_options *opt)
431{ 431{
432 struct ceph_fs_client *fsc; 432 struct ceph_fs_client *fsc;
@@ -502,7 +502,7 @@ fail:
502 return ERR_PTR(err); 502 return ERR_PTR(err);
503} 503}
504 504
505void destroy_fs_client(struct ceph_fs_client *fsc) 505static void destroy_fs_client(struct ceph_fs_client *fsc)
506{ 506{
507 dout("destroy_fs_client %p\n", fsc); 507 dout("destroy_fs_client %p\n", fsc);
508 508
@@ -638,10 +638,12 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
638 if (err == 0) { 638 if (err == 0) {
639 dout("open_root_inode success\n"); 639 dout("open_root_inode success\n");
640 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && 640 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
641 fsc->sb->s_root == NULL) 641 fsc->sb->s_root == NULL) {
642 root = d_alloc_root(req->r_target_inode); 642 root = d_alloc_root(req->r_target_inode);
643 else 643 ceph_init_dentry(root);
644 } else {
644 root = d_obtain_alias(req->r_target_inode); 645 root = d_obtain_alias(req->r_target_inode);
646 }
645 req->r_target_inode = NULL; 647 req->r_target_inode = NULL;
646 dout("open_root_inode success, root dentry is %p\n", root); 648 dout("open_root_inode success, root dentry is %p\n", root);
647 } else { 649 } else {
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b01442aaf278..01bf189e08a9 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -203,6 +203,7 @@ struct ceph_inode_xattr {
203 * Ceph dentry state 203 * Ceph dentry state
204 */ 204 */
205struct ceph_dentry_info { 205struct ceph_dentry_info {
206 unsigned long flags;
206 struct ceph_mds_session *lease_session; 207 struct ceph_mds_session *lease_session;
207 u32 lease_gen, lease_shared_gen; 208 u32 lease_gen, lease_shared_gen;
208 u32 lease_seq; 209 u32 lease_seq;
@@ -213,6 +214,18 @@ struct ceph_dentry_info {
213 u64 offset; 214 u64 offset;
214}; 215};
215 216
217/*
218 * dentry flags
219 *
220 * The locking for D_COMPLETE is a bit odd:
221 * - we can clear it at almost any time (see ceph_d_prune)
222 * - it is only meaningful if:
223 * - we hold dir inode i_lock
224 * - we hold dir FILE_SHARED caps
225 * - the dentry D_COMPLETE is set
226 */
227#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */
228
216struct ceph_inode_xattrs_info { 229struct ceph_inode_xattrs_info {
217 /* 230 /*
218 * (still encoded) xattr blob. we avoid the overhead of parsing 231 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -251,7 +264,7 @@ struct ceph_inode_info {
251 struct timespec i_rctime; 264 struct timespec i_rctime;
252 u64 i_rbytes, i_rfiles, i_rsubdirs; 265 u64 i_rbytes, i_rfiles, i_rsubdirs;
253 u64 i_files, i_subdirs; 266 u64 i_files, i_subdirs;
254 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */ 267 u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */
255 268
256 struct rb_root i_fragtree; 269 struct rb_root i_fragtree;
257 struct mutex i_fragtree_mutex; 270 struct mutex i_fragtree_mutex;
@@ -416,7 +429,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
416/* 429/*
417 * Ceph inode. 430 * Ceph inode.
418 */ 431 */
419#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
420#define CEPH_I_NODELAY 4 /* do not delay cap release */ 432#define CEPH_I_NODELAY 4 /* do not delay cap release */
421#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ 433#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
422#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ 434#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
@@ -474,6 +486,13 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
474} 486}
475 487
476/* 488/*
489 * set/clear directory D_COMPLETE flag
490 */
491void ceph_dir_set_complete(struct inode *inode);
492void ceph_dir_clear_complete(struct inode *inode);
493bool ceph_dir_test_complete(struct inode *inode);
494
495/*
477 * caps helpers 496 * caps helpers
478 */ 497 */
479static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) 498static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
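Putting the rules from the D_COMPLETE comment above together, a hypothetical helper (illustrative only; ceph_dir_test_complete() and __ceph_caps_issued_mask() are from the patch, the rest is assumed) that decides whether a cached readdir can be trusted would look roughly like this:

    static bool dir_is_fully_cached(struct inode *dir)
    {
            struct ceph_inode_info *ci = ceph_inode(dir);
            bool ok;

            spin_lock(&dir->i_lock);                /* hold the dir inode i_lock */
            ok = ceph_dir_test_complete(dir) &&     /* D_COMPLETE is set on the dir dentry */
                 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);  /* and we hold FILE_SHARED caps */
            spin_unlock(&dir->i_lock);
            return ok;
    }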
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c0458c543f17..d6a972df0338 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -37,6 +37,7 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38#include <asm/processor.h> 38#include <asm/processor.h>
39#include <linux/inet.h> 39#include <linux/inet.h>
40#include <linux/module.h>
40#include <net/ipv6.h> 41#include <net/ipv6.h>
41#include "cifspdu.h" 42#include "cifspdu.h"
42#include "cifsglob.h" 43#include "cifsglob.h"
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c1f063cd1b0c..cf0b1539b321 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -645,20 +645,20 @@ int cifs_closedir(struct inode *inode, struct file *file)
645} 645}
646 646
647static struct cifsLockInfo * 647static struct cifsLockInfo *
648cifs_lock_init(__u64 len, __u64 offset, __u8 type, __u16 netfid) 648cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid)
649{ 649{
650 struct cifsLockInfo *li = 650 struct cifsLockInfo *lock =
651 kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); 651 kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
652 if (!li) 652 if (!lock)
653 return li; 653 return lock;
654 li->netfid = netfid; 654 lock->offset = offset;
655 li->offset = offset; 655 lock->length = length;
656 li->length = len; 656 lock->type = type;
657 li->type = type; 657 lock->netfid = netfid;
658 li->pid = current->tgid; 658 lock->pid = current->tgid;
659 INIT_LIST_HEAD(&li->blist); 659 INIT_LIST_HEAD(&lock->blist);
660 init_waitqueue_head(&li->block_q); 660 init_waitqueue_head(&lock->block_q);
661 return li; 661 return lock;
662} 662}
663 663
664static void 664static void
@@ -672,7 +672,7 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock)
672} 672}
673 673
674static bool 674static bool
675cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, 675__cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
676 __u64 length, __u8 type, __u16 netfid, 676 __u64 length, __u8 type, __u16 netfid,
677 struct cifsLockInfo **conf_lock) 677 struct cifsLockInfo **conf_lock)
678{ 678{
@@ -694,6 +694,14 @@ cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
694 return false; 694 return false;
695} 695}
696 696
697static bool
698cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
699 struct cifsLockInfo **conf_lock)
700{
701 return __cifs_find_lock_conflict(cinode, lock->offset, lock->length,
702 lock->type, lock->netfid, conf_lock);
703}
704
697static int 705static int
698cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, 706cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
699 __u8 type, __u16 netfid, struct file_lock *flock) 707 __u8 type, __u16 netfid, struct file_lock *flock)
@@ -704,8 +712,8 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
704 712
705 mutex_lock(&cinode->lock_mutex); 713 mutex_lock(&cinode->lock_mutex);
706 714
707 exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, 715 exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid,
708 &conf_lock); 716 &conf_lock);
709 if (exist) { 717 if (exist) {
710 flock->fl_start = conf_lock->offset; 718 flock->fl_start = conf_lock->offset;
711 flock->fl_end = conf_lock->offset + conf_lock->length - 1; 719 flock->fl_end = conf_lock->offset + conf_lock->length - 1;
@@ -723,40 +731,27 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
723 return rc; 731 return rc;
724} 732}
725 733
726static int 734static void
727cifs_lock_add(struct cifsInodeInfo *cinode, __u64 len, __u64 offset, 735cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock)
728 __u8 type, __u16 netfid)
729{ 736{
730 struct cifsLockInfo *li;
731
732 li = cifs_lock_init(len, offset, type, netfid);
733 if (!li)
734 return -ENOMEM;
735
736 mutex_lock(&cinode->lock_mutex); 737 mutex_lock(&cinode->lock_mutex);
737 list_add_tail(&li->llist, &cinode->llist); 738 list_add_tail(&lock->llist, &cinode->llist);
738 mutex_unlock(&cinode->lock_mutex); 739 mutex_unlock(&cinode->lock_mutex);
739 return 0;
740} 740}
741 741
742static int 742static int
743cifs_lock_add_if(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, 743cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
744 __u8 type, __u16 netfid, bool wait) 744 bool wait)
745{ 745{
746 struct cifsLockInfo *lock, *conf_lock; 746 struct cifsLockInfo *conf_lock;
747 bool exist; 747 bool exist;
748 int rc = 0; 748 int rc = 0;
749 749
750 lock = cifs_lock_init(length, offset, type, netfid);
751 if (!lock)
752 return -ENOMEM;
753
754try_again: 750try_again:
755 exist = false; 751 exist = false;
756 mutex_lock(&cinode->lock_mutex); 752 mutex_lock(&cinode->lock_mutex);
757 753
758 exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, 754 exist = cifs_find_lock_conflict(cinode, lock, &conf_lock);
759 &conf_lock);
760 if (!exist && cinode->can_cache_brlcks) { 755 if (!exist && cinode->can_cache_brlcks) {
761 list_add_tail(&lock->llist, &cinode->llist); 756 list_add_tail(&lock->llist, &cinode->llist);
762 mutex_unlock(&cinode->lock_mutex); 757 mutex_unlock(&cinode->lock_mutex);
@@ -775,13 +770,10 @@ try_again:
775 (lock->blist.next == &lock->blist)); 770 (lock->blist.next == &lock->blist));
776 if (!rc) 771 if (!rc)
777 goto try_again; 772 goto try_again;
778 else { 773 mutex_lock(&cinode->lock_mutex);
779 mutex_lock(&cinode->lock_mutex); 774 list_del_init(&lock->blist);
780 list_del_init(&lock->blist);
781 }
782 } 775 }
783 776
784 kfree(lock);
785 mutex_unlock(&cinode->lock_mutex); 777 mutex_unlock(&cinode->lock_mutex);
786 return rc; 778 return rc;
787} 779}
@@ -933,7 +925,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
933 else 925 else
934 type = CIFS_WRLCK; 926 type = CIFS_WRLCK;
935 927
936 lck = cifs_lock_init(length, flock->fl_start, type, 928 lck = cifs_lock_init(flock->fl_start, length, type,
937 cfile->netfid); 929 cfile->netfid);
938 if (!lck) { 930 if (!lck) {
939 rc = -ENOMEM; 931 rc = -ENOMEM;
@@ -1070,14 +1062,12 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
1070 if (rc != 0) 1062 if (rc != 0)
1071 cERROR(1, "Error unlocking previously locked " 1063 cERROR(1, "Error unlocking previously locked "
1072 "range %d during test of lock", rc); 1064 "range %d during test of lock", rc);
1073 rc = 0; 1065 return 0;
1074 return rc;
1075 } 1066 }
1076 1067
1077 if (type & LOCKING_ANDX_SHARED_LOCK) { 1068 if (type & LOCKING_ANDX_SHARED_LOCK) {
1078 flock->fl_type = F_WRLCK; 1069 flock->fl_type = F_WRLCK;
1079 rc = 0; 1070 return 0;
1080 return rc;
1081 } 1071 }
1082 1072
1083 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, 1073 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
@@ -1095,8 +1085,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
1095 } else 1085 } else
1096 flock->fl_type = F_WRLCK; 1086 flock->fl_type = F_WRLCK;
1097 1087
1098 rc = 0; 1088 return 0;
1099 return rc;
1100} 1089}
1101 1090
1102static void 1091static void
@@ -1254,20 +1243,26 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type,
1254 } 1243 }
1255 1244
1256 if (lock) { 1245 if (lock) {
1257 rc = cifs_lock_add_if(cinode, flock->fl_start, length, 1246 struct cifsLockInfo *lock;
1258 type, netfid, wait_flag); 1247
1248 lock = cifs_lock_init(flock->fl_start, length, type, netfid);
1249 if (!lock)
1250 return -ENOMEM;
1251
1252 rc = cifs_lock_add_if(cinode, lock, wait_flag);
1259 if (rc < 0) 1253 if (rc < 0)
1260 return rc; 1254 kfree(lock);
1261 else if (!rc) 1255 if (rc <= 0)
1262 goto out; 1256 goto out;
1263 1257
1264 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, 1258 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
1265 flock->fl_start, 0, 1, type, wait_flag, 0); 1259 flock->fl_start, 0, 1, type, wait_flag, 0);
1266 if (rc == 0) { 1260 if (rc) {
1267 /* For Windows locks we must store them. */ 1261 kfree(lock);
1268 rc = cifs_lock_add(cinode, length, flock->fl_start, 1262 goto out;
1269 type, netfid);
1270 } 1263 }
1264
1265 cifs_lock_add(cinode, lock);
1271 } else if (unlock) 1266 } else if (unlock)
1272 rc = cifs_unlock_range(cfile, flock, xid); 1267 rc = cifs_unlock_range(cfile, flock, xid);
1273 1268
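In short, after this refactor the caller in cifs_setlk() owns the cifsLockInfo allocation end to end; a condensed sketch of the new flow (the comments are this reader's interpretation of the hunk, not part of the patch):

    lock = cifs_lock_init(flock->fl_start, length, type, netfid);
    if (!lock)
            return -ENOMEM;

    rc = cifs_lock_add_if(cinode, lock, wait_flag);
    if (rc < 0)
            kfree(lock);            /* conflict/error: the caller frees, not the helper */
    if (rc <= 0)
            goto out;               /* rc == 0: handled locally, nothing to send to the server */

    rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
                     flock->fl_start, 0, 1, type, wait_flag, 0);
    if (rc) {
            kfree(lock);            /* server refused: drop the allocation */
            goto out;
    }
    cifs_lock_add(cinode, lock);    /* server granted: remember the lock for this inode */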
diff --git a/fs/dcache.c b/fs/dcache.c
index 274f13e2f094..10ba92def3f6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -36,6 +36,7 @@
36#include <linux/bit_spinlock.h> 36#include <linux/bit_spinlock.h>
37#include <linux/rculist_bl.h> 37#include <linux/rculist_bl.h>
38#include <linux/prefetch.h> 38#include <linux/prefetch.h>
39#include <linux/ratelimit.h>
39#include "internal.h" 40#include "internal.h"
40 41
41/* 42/*
@@ -546,9 +547,11 @@ int d_invalidate(struct dentry * dentry)
546 * would make it unreachable from the root, 547 * would make it unreachable from the root,
547 * we might still populate it if it was a 548 * we might still populate it if it was a
548 * working directory or similar). 549 * working directory or similar).
550 * We also need to leave mountpoints alone,
551 * directory or not.
549 */ 552 */
550 if (dentry->d_count > 1) { 553 if (dentry->d_count > 1 && dentry->d_inode) {
551 if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { 554 if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
552 spin_unlock(&dentry->d_lock); 555 spin_unlock(&dentry->d_lock);
553 return -EBUSY; 556 return -EBUSY;
554 } 557 }
@@ -2381,8 +2384,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2381 actual = __d_unalias(inode, dentry, alias); 2384 actual = __d_unalias(inode, dentry, alias);
2382 } 2385 }
2383 write_sequnlock(&rename_lock); 2386 write_sequnlock(&rename_lock);
2384 if (IS_ERR(actual)) 2387 if (IS_ERR(actual)) {
2388 if (PTR_ERR(actual) == -ELOOP)
2389 pr_warn_ratelimited(
2390 "VFS: Lookup of '%s' in %s %s"
2391 " would have caused loop\n",
2392 dentry->d_name.name,
2393 inode->i_sb->s_type->name,
2394 inode->i_sb->s_id);
2385 dput(alias); 2395 dput(alias);
2396 }
2386 goto out_nolock; 2397 goto out_nolock;
2387 } 2398 }
2388 } 2399 }
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index fcfa86ae6faf..d271ad837202 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/module.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27#include <linux/lcm.h> 28#include <linux/lcm.h>
28 29
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 057b237b8b69..e6085ec192d6 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -35,6 +35,7 @@
35#include <linux/parser.h> 35#include <linux/parser.h>
36#include <linux/vfs.h> 36#include <linux/vfs.h>
37#include <linux/random.h> 37#include <linux/random.h>
38#include <linux/module.h>
38#include <linux/exportfs.h> 39#include <linux/exportfs.h>
39#include <linux/slab.h> 40#include <linux/slab.h>
40 41
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f6dba4505f1c..12ccacda44e0 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -565,7 +565,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
565 brelse(bitmap_bh); 565 brelse(bitmap_bh);
566 printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" 566 printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
567 ", computed = %llu, %llu\n", 567 ", computed = %llu, %llu\n",
568 EXT4_B2C(sbi, ext4_free_blocks_count(es)), 568 EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
569 desc_count, bitmap_count); 569 desc_count, bitmap_count);
570 return bitmap_count; 570 return bitmap_count;
571#else 571#else
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cc5a6da030a1..fffec40d5996 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2270,6 +2270,7 @@ retry:
2270 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2270 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2271 "%ld pages, ino %lu; err %d", __func__, 2271 "%ld pages, ino %lu; err %d", __func__,
2272 wbc->nr_to_write, inode->i_ino, ret); 2272 wbc->nr_to_write, inode->i_ino, ret);
2273 blk_finish_plug(&plug);
2273 goto out_writepages; 2274 goto out_writepages;
2274 } 2275 }
2275 2276
@@ -2372,7 +2373,7 @@ static int ext4_nonda_switch(struct super_block *sb)
2372 * start pushing delalloc when 1/2 of free blocks are dirty. 2373 * start pushing delalloc when 1/2 of free blocks are dirty.
2373 */ 2374 */
2374 if (free_blocks < 2 * dirty_blocks) 2375 if (free_blocks < 2 * dirty_blocks)
2375 writeback_inodes_sb_if_idle(sb); 2376 writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
2376 2377
2377 return 0; 2378 return 0;
2378} 2379}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9953d80145ad..3858767ec672 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1683,7 +1683,9 @@ static int parse_options(char *options, struct super_block *sb,
1683 data_opt = EXT4_MOUNT_WRITEBACK_DATA; 1683 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1684 datacheck: 1684 datacheck:
1685 if (is_remount) { 1685 if (is_remount) {
1686 if (test_opt(sb, DATA_FLAGS) != data_opt) { 1686 if (!sbi->s_journal)
1687 ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
1688 else if (test_opt(sb, DATA_FLAGS) != data_opt) {
1687 ext4_msg(sb, KERN_ERR, 1689 ext4_msg(sb, KERN_ERR,
1688 "Cannot change data mode on remount"); 1690 "Cannot change data mode on remount");
1689 return 0; 1691 return 0;
@@ -3099,8 +3101,6 @@ static void ext4_destroy_lazyinit_thread(void)
3099} 3101}
3100 3102
3101static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3103static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3102 __releases(kernel_lock)
3103 __acquires(kernel_lock)
3104{ 3104{
3105 char *orig_data = kstrdup(data, GFP_KERNEL); 3105 char *orig_data = kstrdup(data, GFP_KERNEL);
3106 struct buffer_head *bh; 3106 struct buffer_head *bh;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 04cf3b91e501..73c3992b2bb4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -41,11 +41,23 @@ struct wb_writeback_work {
41 unsigned int for_kupdate:1; 41 unsigned int for_kupdate:1;
42 unsigned int range_cyclic:1; 42 unsigned int range_cyclic:1;
43 unsigned int for_background:1; 43 unsigned int for_background:1;
44 enum wb_reason reason; /* why was writeback initiated? */
44 45
45 struct list_head list; /* pending work list */ 46 struct list_head list; /* pending work list */
46 struct completion *done; /* set if the caller waits */ 47 struct completion *done; /* set if the caller waits */
47}; 48};
48 49
50const char *wb_reason_name[] = {
51 [WB_REASON_BACKGROUND] = "background",
52 [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages",
53 [WB_REASON_SYNC] = "sync",
54 [WB_REASON_PERIODIC] = "periodic",
55 [WB_REASON_LAPTOP_TIMER] = "laptop_timer",
56 [WB_REASON_FREE_MORE_MEM] = "free_more_memory",
57 [WB_REASON_FS_FREE_SPACE] = "fs_free_space",
58 [WB_REASON_FORKER_THREAD] = "forker_thread"
59};
60
49/* 61/*
50 * Include the creation of the trace points after defining the 62 * Include the creation of the trace points after defining the
51 * wb_writeback_work structure so that the definition remains local to this 63 * wb_writeback_work structure so that the definition remains local to this
@@ -115,7 +127,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
115 127
116static void 128static void
117__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 129__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
118 bool range_cyclic) 130 bool range_cyclic, enum wb_reason reason)
119{ 131{
120 struct wb_writeback_work *work; 132 struct wb_writeback_work *work;
121 133
@@ -135,6 +147,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
135 work->sync_mode = WB_SYNC_NONE; 147 work->sync_mode = WB_SYNC_NONE;
136 work->nr_pages = nr_pages; 148 work->nr_pages = nr_pages;
137 work->range_cyclic = range_cyclic; 149 work->range_cyclic = range_cyclic;
150 work->reason = reason;
138 151
139 bdi_queue_work(bdi, work); 152 bdi_queue_work(bdi, work);
140} 153}
@@ -150,9 +163,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
150 * completion. Caller need not hold sb s_umount semaphore. 163 * completion. Caller need not hold sb s_umount semaphore.
151 * 164 *
152 */ 165 */
153void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 166void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
167 enum wb_reason reason)
154{ 168{
155 __bdi_start_writeback(bdi, nr_pages, true); 169 __bdi_start_writeback(bdi, nr_pages, true, reason);
156} 170}
157 171
158/** 172/**
@@ -251,7 +265,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
251 */ 265 */
252static int move_expired_inodes(struct list_head *delaying_queue, 266static int move_expired_inodes(struct list_head *delaying_queue,
253 struct list_head *dispatch_queue, 267 struct list_head *dispatch_queue,
254 unsigned long *older_than_this) 268 struct wb_writeback_work *work)
255{ 269{
256 LIST_HEAD(tmp); 270 LIST_HEAD(tmp);
257 struct list_head *pos, *node; 271 struct list_head *pos, *node;
@@ -262,8 +276,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
262 276
263 while (!list_empty(delaying_queue)) { 277 while (!list_empty(delaying_queue)) {
264 inode = wb_inode(delaying_queue->prev); 278 inode = wb_inode(delaying_queue->prev);
265 if (older_than_this && 279 if (work->older_than_this &&
266 inode_dirtied_after(inode, *older_than_this)) 280 inode_dirtied_after(inode, *work->older_than_this))
267 break; 281 break;
268 if (sb && sb != inode->i_sb) 282 if (sb && sb != inode->i_sb)
269 do_sb_sort = 1; 283 do_sb_sort = 1;
@@ -302,13 +316,13 @@ out:
302 * | 316 * |
303 * +--> dequeue for IO 317 * +--> dequeue for IO
304 */ 318 */
305static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 319static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
306{ 320{
307 int moved; 321 int moved;
308 assert_spin_locked(&wb->list_lock); 322 assert_spin_locked(&wb->list_lock);
309 list_splice_init(&wb->b_more_io, &wb->b_io); 323 list_splice_init(&wb->b_more_io, &wb->b_io);
310 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 324 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
311 trace_writeback_queue_io(wb, older_than_this, moved); 325 trace_writeback_queue_io(wb, work, moved);
312} 326}
313 327
314static int write_inode(struct inode *inode, struct writeback_control *wbc) 328static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -641,31 +655,40 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
641 return wrote; 655 return wrote;
642} 656}
643 657
644long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) 658long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
659 enum wb_reason reason)
645{ 660{
646 struct wb_writeback_work work = { 661 struct wb_writeback_work work = {
647 .nr_pages = nr_pages, 662 .nr_pages = nr_pages,
648 .sync_mode = WB_SYNC_NONE, 663 .sync_mode = WB_SYNC_NONE,
649 .range_cyclic = 1, 664 .range_cyclic = 1,
665 .reason = reason,
650 }; 666 };
651 667
652 spin_lock(&wb->list_lock); 668 spin_lock(&wb->list_lock);
653 if (list_empty(&wb->b_io)) 669 if (list_empty(&wb->b_io))
654 queue_io(wb, NULL); 670 queue_io(wb, &work);
655 __writeback_inodes_wb(wb, &work); 671 __writeback_inodes_wb(wb, &work);
656 spin_unlock(&wb->list_lock); 672 spin_unlock(&wb->list_lock);
657 673
658 return nr_pages - work.nr_pages; 674 return nr_pages - work.nr_pages;
659} 675}
660 676
661static inline bool over_bground_thresh(void) 677static bool over_bground_thresh(struct backing_dev_info *bdi)
662{ 678{
663 unsigned long background_thresh, dirty_thresh; 679 unsigned long background_thresh, dirty_thresh;
664 680
665 global_dirty_limits(&background_thresh, &dirty_thresh); 681 global_dirty_limits(&background_thresh, &dirty_thresh);
666 682
667 return (global_page_state(NR_FILE_DIRTY) + 683 if (global_page_state(NR_FILE_DIRTY) +
668 global_page_state(NR_UNSTABLE_NFS) > background_thresh); 684 global_page_state(NR_UNSTABLE_NFS) > background_thresh)
685 return true;
686
687 if (bdi_stat(bdi, BDI_RECLAIMABLE) >
688 bdi_dirty_limit(bdi, background_thresh))
689 return true;
690
691 return false;
669} 692}
670 693
671/* 694/*
@@ -675,7 +698,7 @@ static inline bool over_bground_thresh(void)
675static void wb_update_bandwidth(struct bdi_writeback *wb, 698static void wb_update_bandwidth(struct bdi_writeback *wb,
676 unsigned long start_time) 699 unsigned long start_time)
677{ 700{
678 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); 701 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
679} 702}
680 703
681/* 704/*
@@ -727,7 +750,7 @@ static long wb_writeback(struct bdi_writeback *wb,
727 * For background writeout, stop when we are below the 750 * For background writeout, stop when we are below the
728 * background dirty threshold 751 * background dirty threshold
729 */ 752 */
730 if (work->for_background && !over_bground_thresh()) 753 if (work->for_background && !over_bground_thresh(wb->bdi))
731 break; 754 break;
732 755
733 if (work->for_kupdate) { 756 if (work->for_kupdate) {
@@ -738,7 +761,7 @@ static long wb_writeback(struct bdi_writeback *wb,
738 761
739 trace_writeback_start(wb->bdi, work); 762 trace_writeback_start(wb->bdi, work);
740 if (list_empty(&wb->b_io)) 763 if (list_empty(&wb->b_io))
741 queue_io(wb, work->older_than_this); 764 queue_io(wb, work);
742 if (work->sb) 765 if (work->sb)
743 progress = writeback_sb_inodes(work->sb, wb, work); 766 progress = writeback_sb_inodes(work->sb, wb, work);
744 else 767 else
@@ -811,13 +834,14 @@ static unsigned long get_nr_dirty_pages(void)
811 834
812static long wb_check_background_flush(struct bdi_writeback *wb) 835static long wb_check_background_flush(struct bdi_writeback *wb)
813{ 836{
814 if (over_bground_thresh()) { 837 if (over_bground_thresh(wb->bdi)) {
815 838
816 struct wb_writeback_work work = { 839 struct wb_writeback_work work = {
817 .nr_pages = LONG_MAX, 840 .nr_pages = LONG_MAX,
818 .sync_mode = WB_SYNC_NONE, 841 .sync_mode = WB_SYNC_NONE,
819 .for_background = 1, 842 .for_background = 1,
820 .range_cyclic = 1, 843 .range_cyclic = 1,
844 .reason = WB_REASON_BACKGROUND,
821 }; 845 };
822 846
823 return wb_writeback(wb, &work); 847 return wb_writeback(wb, &work);
@@ -851,6 +875,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
851 .sync_mode = WB_SYNC_NONE, 875 .sync_mode = WB_SYNC_NONE,
852 .for_kupdate = 1, 876 .for_kupdate = 1,
853 .range_cyclic = 1, 877 .range_cyclic = 1,
878 .reason = WB_REASON_PERIODIC,
854 }; 879 };
855 880
856 return wb_writeback(wb, &work); 881 return wb_writeback(wb, &work);
@@ -969,7 +994,7 @@ int bdi_writeback_thread(void *data)
969 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back 994 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
970 * the whole world. 995 * the whole world.
971 */ 996 */
972void wakeup_flusher_threads(long nr_pages) 997void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
973{ 998{
974 struct backing_dev_info *bdi; 999 struct backing_dev_info *bdi;
975 1000
@@ -982,7 +1007,7 @@ void wakeup_flusher_threads(long nr_pages)
982 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 1007 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
983 if (!bdi_has_dirty_io(bdi)) 1008 if (!bdi_has_dirty_io(bdi))
984 continue; 1009 continue;
985 __bdi_start_writeback(bdi, nr_pages, false); 1010 __bdi_start_writeback(bdi, nr_pages, false, reason);
986 } 1011 }
987 rcu_read_unlock(); 1012 rcu_read_unlock();
988} 1013}
@@ -1203,7 +1228,9 @@ static void wait_sb_inodes(struct super_block *sb)
1203 * on how many (if any) will be written, and this function does not wait 1228 * on how many (if any) will be written, and this function does not wait
1204 * for IO completion of submitted IO. 1229 * for IO completion of submitted IO.
1205 */ 1230 */
1206void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) 1231void writeback_inodes_sb_nr(struct super_block *sb,
1232 unsigned long nr,
1233 enum wb_reason reason)
1207{ 1234{
1208 DECLARE_COMPLETION_ONSTACK(done); 1235 DECLARE_COMPLETION_ONSTACK(done);
1209 struct wb_writeback_work work = { 1236 struct wb_writeback_work work = {
@@ -1212,6 +1239,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
1212 .tagged_writepages = 1, 1239 .tagged_writepages = 1,
1213 .done = &done, 1240 .done = &done,
1214 .nr_pages = nr, 1241 .nr_pages = nr,
1242 .reason = reason,
1215 }; 1243 };
1216 1244
1217 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1245 WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@ -1228,9 +1256,9 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
1228 * on how many (if any) will be written, and this function does not wait 1256 * on how many (if any) will be written, and this function does not wait
1229 * for IO completion of submitted IO. 1257 * for IO completion of submitted IO.
1230 */ 1258 */
1231void writeback_inodes_sb(struct super_block *sb) 1259void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
1232{ 1260{
1233 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); 1261 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
1234} 1262}
1235EXPORT_SYMBOL(writeback_inodes_sb); 1263EXPORT_SYMBOL(writeback_inodes_sb);
1236 1264
@@ -1241,11 +1269,11 @@ EXPORT_SYMBOL(writeback_inodes_sb);
1241 * Invoke writeback_inodes_sb if no writeback is currently underway. 1269 * Invoke writeback_inodes_sb if no writeback is currently underway.
1242 * Returns 1 if writeback was started, 0 if not. 1270 * Returns 1 if writeback was started, 0 if not.
1243 */ 1271 */
1244int writeback_inodes_sb_if_idle(struct super_block *sb) 1272int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
1245{ 1273{
1246 if (!writeback_in_progress(sb->s_bdi)) { 1274 if (!writeback_in_progress(sb->s_bdi)) {
1247 down_read(&sb->s_umount); 1275 down_read(&sb->s_umount);
1248 writeback_inodes_sb(sb); 1276 writeback_inodes_sb(sb, reason);
1249 up_read(&sb->s_umount); 1277 up_read(&sb->s_umount);
1250 return 1; 1278 return 1;
1251 } else 1279 } else
@@ -1262,11 +1290,12 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1262 * Returns 1 if writeback was started, 0 if not. 1290 * Returns 1 if writeback was started, 0 if not.
1263 */ 1291 */
1264int writeback_inodes_sb_nr_if_idle(struct super_block *sb, 1292int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
1265 unsigned long nr) 1293 unsigned long nr,
1294 enum wb_reason reason)
1266{ 1295{
1267 if (!writeback_in_progress(sb->s_bdi)) { 1296 if (!writeback_in_progress(sb->s_bdi)) {
1268 down_read(&sb->s_umount); 1297 down_read(&sb->s_umount);
1269 writeback_inodes_sb_nr(sb, nr); 1298 writeback_inodes_sb_nr(sb, nr, reason);
1270 up_read(&sb->s_umount); 1299 up_read(&sb->s_umount);
1271 return 1; 1300 return 1;
1272 } else 1301 } else
@@ -1290,6 +1319,7 @@ void sync_inodes_sb(struct super_block *sb)
1290 .nr_pages = LONG_MAX, 1319 .nr_pages = LONG_MAX,
1291 .range_cyclic = 0, 1320 .range_cyclic = 0,
1292 .done = &done, 1321 .done = &done,
1322 .reason = WB_REASON_SYNC,
1293 }; 1323 };
1294 1324
1295 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1325 WARN_ON(!rwsem_is_locked(&sb->s_umount));
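Note: the fs-writeback.c hunks above thread a new "enum wb_reason reason" argument through writeback_inodes_sb_nr(), writeback_inodes_sb() and the *_if_idle() variants, and stash it in struct wb_writeback_work so the flusher can record why writeback was kicked. A minimal caller sketch, assuming only the signatures shown in this diff (the helper name is made up for illustration):

#include <linux/fs.h>
#include <linux/writeback.h>

/* Illustrative helper, not part of the patch: kick writeback for one
 * superblock and tag the work item with a reason. */
static void example_kick_writeback(struct super_block *sb)
{
        /* writeback_inodes_sb() warns unless s_umount is held */
        down_read(&sb->s_umount);
        writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
        up_read(&sb->s_umount);
}

WB_REASON_FS_FREE_SPACE is the value the ubifs/budget.c hunk further down passes; the sync paths in fs/sync.c and fs/quota/quota.c below use WB_REASON_SYNC instead.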
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b6cca47f7b07..3426521f3205 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -47,6 +47,7 @@
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/spinlock.h> 48#include <linux/spinlock.h>
49#include <linux/stat.h> 49#include <linux/stat.h>
50#include <linux/module.h>
50 51
51#include "fuse_i.h" 52#include "fuse_i.h"
52 53
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7e823bbd2453..cb23c2be731a 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -14,6 +14,7 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/kthread.h> 16#include <linux/kthread.h>
17#include <linux/export.h>
17#include <linux/namei.h> 18#include <linux/namei.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c
index e673a88b8ae7..b1ce4c7ad3fb 100644
--- a/fs/hfs/trans.c
+++ b/fs/hfs/trans.c
@@ -40,6 +40,8 @@ int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in)
40 40
41 src = in->name; 41 src = in->name;
42 srclen = in->len; 42 srclen = in->len;
43 if (srclen > HFS_NAMELEN)
44 srclen = HFS_NAMELEN;
43 dst = out; 45 dst = out;
44 dstlen = HFS_MAX_NAMELEN; 46 dstlen = HFS_MAX_NAMELEN;
45 if (nls_io) { 47 if (nls_io) {
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 7da2a06508e5..f79dab83e17b 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -21,6 +21,7 @@
21 */ 21 */
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/export.h>
24#include <linux/ioprio.h> 25#include <linux/ioprio.h>
25#include <linux/blkdev.h> 26#include <linux/blkdev.h>
26#include <linux/capability.h> 27#include <linux/capability.h>
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index de4247021d25..5b6c9d1a2fb9 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -53,6 +53,78 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this,
53 return 0; 53 return 0;
54} 54}
55 55
56/*
57 * jffs2_selected_compress:
58 * @compr: Explicit compression type to use (ie, JFFS2_COMPR_ZLIB).
59 * If 0, just take the first available compression mode.
60 * @data_in: Pointer to uncompressed data
61 * @cpage_out: Pointer to returned pointer to buffer for compressed data
62 * @datalen: On entry, holds the amount of data available for compression.
63 * On exit, expected to hold the amount of data actually compressed.
64 * @cdatalen: On entry, holds the amount of space available for compressed
65 * data. On exit, expected to hold the actual size of the compressed
66 * data.
67 *
68 * Returns: the compression type used. Zero is used to show that the data
69 * could not be compressed; probably because we couldn't find the requested
70 * compression mode.
71 */
72static int jffs2_selected_compress(u8 compr, unsigned char *data_in,
73 unsigned char **cpage_out, u32 *datalen, u32 *cdatalen)
74{
75 struct jffs2_compressor *this;
76 int err, ret = JFFS2_COMPR_NONE;
77 uint32_t orig_slen, orig_dlen;
78 char *output_buf;
79
80 output_buf = kmalloc(*cdatalen, GFP_KERNEL);
81 if (!output_buf) {
82 printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n");
83 return ret;
84 }
85 orig_slen = *datalen;
86 orig_dlen = *cdatalen;
87 spin_lock(&jffs2_compressor_list_lock);
88 list_for_each_entry(this, &jffs2_compressor_list, list) {
89 /* Skip decompress-only and disabled modules */
90 if (!this->compress || this->disabled)
91 continue;
92
93 /* Skip if not the desired compression type */
94 if (compr && (compr != this->compr))
95 continue;
96
97 /*
98 * Either compression type was unspecified, or we found our
99 * compressor; either way, we're good to go.
100 */
101 this->usecount++;
102 spin_unlock(&jffs2_compressor_list_lock);
103
104 *datalen = orig_slen;
105 *cdatalen = orig_dlen;
106 err = this->compress(data_in, output_buf, datalen, cdatalen);
107
108 spin_lock(&jffs2_compressor_list_lock);
109 this->usecount--;
110 if (!err) {
111 /* Success */
112 ret = this->compr;
113 this->stat_compr_blocks++;
114 this->stat_compr_orig_size += *datalen;
115 this->stat_compr_new_size += *cdatalen;
116 break;
117 }
118 }
119 spin_unlock(&jffs2_compressor_list_lock);
120 if (ret == JFFS2_COMPR_NONE)
121 kfree(output_buf);
122 else
123 *cpage_out = output_buf;
124
125 return ret;
126}
127
56/* jffs2_compress: 128/* jffs2_compress:
57 * @data_in: Pointer to uncompressed data 129 * @data_in: Pointer to uncompressed data
58 * @cpage_out: Pointer to returned pointer to buffer for compressed data 130 * @cpage_out: Pointer to returned pointer to buffer for compressed data
@@ -76,47 +148,23 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
76 uint32_t *datalen, uint32_t *cdatalen) 148 uint32_t *datalen, uint32_t *cdatalen)
77{ 149{
78 int ret = JFFS2_COMPR_NONE; 150 int ret = JFFS2_COMPR_NONE;
79 int compr_ret; 151 int mode, compr_ret;
80 struct jffs2_compressor *this, *best=NULL; 152 struct jffs2_compressor *this, *best=NULL;
81 unsigned char *output_buf = NULL, *tmp_buf; 153 unsigned char *output_buf = NULL, *tmp_buf;
82 uint32_t orig_slen, orig_dlen; 154 uint32_t orig_slen, orig_dlen;
83 uint32_t best_slen=0, best_dlen=0; 155 uint32_t best_slen=0, best_dlen=0;
84 156
85 switch (jffs2_compression_mode) { 157 if (c->mount_opts.override_compr)
158 mode = c->mount_opts.compr;
159 else
160 mode = jffs2_compression_mode;
161
162 switch (mode) {
86 case JFFS2_COMPR_MODE_NONE: 163 case JFFS2_COMPR_MODE_NONE:
87 break; 164 break;
88 case JFFS2_COMPR_MODE_PRIORITY: 165 case JFFS2_COMPR_MODE_PRIORITY:
89 output_buf = kmalloc(*cdatalen,GFP_KERNEL); 166 ret = jffs2_selected_compress(0, data_in, cpage_out, datalen,
90 if (!output_buf) { 167 cdatalen);
91 printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n");
92 goto out;
93 }
94 orig_slen = *datalen;
95 orig_dlen = *cdatalen;
96 spin_lock(&jffs2_compressor_list_lock);
97 list_for_each_entry(this, &jffs2_compressor_list, list) {
98 /* Skip decompress-only backwards-compatibility and disabled modules */
99 if ((!this->compress)||(this->disabled))
100 continue;
101
102 this->usecount++;
103 spin_unlock(&jffs2_compressor_list_lock);
104 *datalen = orig_slen;
105 *cdatalen = orig_dlen;
106 compr_ret = this->compress(data_in, output_buf, datalen, cdatalen);
107 spin_lock(&jffs2_compressor_list_lock);
108 this->usecount--;
109 if (!compr_ret) {
110 ret = this->compr;
111 this->stat_compr_blocks++;
112 this->stat_compr_orig_size += *datalen;
113 this->stat_compr_new_size += *cdatalen;
114 break;
115 }
116 }
117 spin_unlock(&jffs2_compressor_list_lock);
118 if (ret == JFFS2_COMPR_NONE)
119 kfree(output_buf);
120 break; 168 break;
121 case JFFS2_COMPR_MODE_SIZE: 169 case JFFS2_COMPR_MODE_SIZE:
122 case JFFS2_COMPR_MODE_FAVOURLZO: 170 case JFFS2_COMPR_MODE_FAVOURLZO:
@@ -174,22 +222,28 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
174 best->stat_compr_orig_size += best_slen; 222 best->stat_compr_orig_size += best_slen;
175 best->stat_compr_new_size += best_dlen; 223 best->stat_compr_new_size += best_dlen;
176 ret = best->compr; 224 ret = best->compr;
225 *cpage_out = output_buf;
177 } 226 }
178 spin_unlock(&jffs2_compressor_list_lock); 227 spin_unlock(&jffs2_compressor_list_lock);
179 break; 228 break;
229 case JFFS2_COMPR_MODE_FORCELZO:
230 ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in,
231 cpage_out, datalen, cdatalen);
232 break;
233 case JFFS2_COMPR_MODE_FORCEZLIB:
234 ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in,
235 cpage_out, datalen, cdatalen);
236 break;
180 default: 237 default:
181 printk(KERN_ERR "JFFS2: unknown compression mode.\n"); 238 printk(KERN_ERR "JFFS2: unknown compression mode.\n");
182 } 239 }
183 out: 240
184 if (ret == JFFS2_COMPR_NONE) { 241 if (ret == JFFS2_COMPR_NONE) {
185 *cpage_out = data_in; 242 *cpage_out = data_in;
186 *datalen = *cdatalen; 243 *datalen = *cdatalen;
187 none_stat_compr_blocks++; 244 none_stat_compr_blocks++;
188 none_stat_compr_size += *datalen; 245 none_stat_compr_size += *datalen;
189 } 246 }
190 else {
191 *cpage_out = output_buf;
192 }
193 return ret; 247 return ret;
194} 248}
195 249
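Note: the refactor above moves the body of the old PRIORITY loop into jffs2_selected_compress(), which the new FORCELZO/FORCEZLIB modes reuse with an explicit compressor type. A sketch of how a write path consumes the result, assuming the existing jffs2_free_comprbuf() helper from this file; the buffer names are illustrative only:

        /* Illustrative only: compress one buffer and release the temporary
         * output buffer afterwards. */
        uint32_t datalen = PAGE_CACHE_SIZE, cdatalen = PAGE_CACHE_SIZE;
        unsigned char *comprbuf;
        uint16_t comprtype;

        comprtype = jffs2_compress(c, f, page_buf, &comprbuf, &datalen, &cdatalen);
        /* ... write cdatalen bytes of comprbuf, tagged with comprtype ... */
        jffs2_free_comprbuf(comprbuf, page_buf); /* frees only if comprbuf != page_buf */

When no compressor succeeds, the return value is JFFS2_COMPR_NONE and *cpage_out points back at data_in (see the tail of jffs2_compress() above), so the free step is a no-op.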
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 13bb7597ab39..5e91d578f4ed 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -40,6 +40,8 @@
40#define JFFS2_COMPR_MODE_PRIORITY 1 40#define JFFS2_COMPR_MODE_PRIORITY 1
41#define JFFS2_COMPR_MODE_SIZE 2 41#define JFFS2_COMPR_MODE_SIZE 2
42#define JFFS2_COMPR_MODE_FAVOURLZO 3 42#define JFFS2_COMPR_MODE_FAVOURLZO 3
43#define JFFS2_COMPR_MODE_FORCELZO 4
44#define JFFS2_COMPR_MODE_FORCEZLIB 5
43 45
44#define FAVOUR_LZO_PERCENT 80 46#define FAVOUR_LZO_PERCENT 80
45 47
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 7286e44ac665..4b8afe39a87f 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -379,7 +379,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
379 jffs2_do_setattr(inode, &iattr); 379 jffs2_do_setattr(inode, &iattr);
380} 380}
381 381
382int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) 382int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data)
383{ 383{
384 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 384 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
385 385
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 0bc6a6c80a56..55a0c1dceadf 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -29,6 +29,11 @@
29 29
30struct jffs2_inodirty; 30struct jffs2_inodirty;
31 31
32struct jffs2_mount_opts {
33 bool override_compr;
34 unsigned int compr;
35};
36
32/* A struct for the overall file system control. Pointers to 37/* A struct for the overall file system control. Pointers to
33 jffs2_sb_info structs are named `c' in the source code. 38 jffs2_sb_info structs are named `c' in the source code.
34 Nee jffs_control 39 Nee jffs_control
@@ -126,6 +131,7 @@ struct jffs2_sb_info {
126#endif 131#endif
127 132
128 struct jffs2_summary *summary; /* Summary information */ 133 struct jffs2_summary *summary; /* Summary information */
134 struct jffs2_mount_opts mount_opts;
129 135
130#ifdef CONFIG_JFFS2_FS_XATTR 136#ifdef CONFIG_JFFS2_FS_XATTR
131#define XATTRINDEX_HASHSIZE (57) 137#define XATTRINDEX_HASHSIZE (57)
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 6c1755c59c0f..ab65ee3ec858 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -176,7 +176,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags);
176struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, 176struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode,
177 struct jffs2_raw_inode *ri); 177 struct jffs2_raw_inode *ri);
178int jffs2_statfs (struct dentry *, struct kstatfs *); 178int jffs2_statfs (struct dentry *, struct kstatfs *);
179int jffs2_remount_fs (struct super_block *, int *, char *); 179int jffs2_do_remount_fs(struct super_block *, int *, char *);
180int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); 180int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
181void jffs2_gc_release_inode(struct jffs2_sb_info *c, 181void jffs2_gc_release_inode(struct jffs2_sb_info *c,
182 struct jffs2_inode_info *f); 182 struct jffs2_inode_info *f);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 8d8cd3419d02..28107ca136e4 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -275,9 +275,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
275 else 275 else
276 c->mtd->unpoint(c->mtd, 0, c->mtd->size); 276 c->mtd->unpoint(c->mtd, 0, c->mtd->size);
277#endif 277#endif
278 if (s) 278 kfree(s);
279 kfree(s);
280
281 return ret; 279 return ret;
282} 280}
283 281
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 853b8e300084..e7e974454115 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -17,11 +17,13 @@
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/err.h> 18#include <linux/err.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/parser.h>
20#include <linux/jffs2.h> 21#include <linux/jffs2.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include <linux/mtd/super.h> 23#include <linux/mtd/super.h>
23#include <linux/ctype.h> 24#include <linux/ctype.h>
24#include <linux/namei.h> 25#include <linux/namei.h>
26#include <linux/seq_file.h>
25#include <linux/exportfs.h> 27#include <linux/exportfs.h>
26#include "compr.h" 28#include "compr.h"
27#include "nodelist.h" 29#include "nodelist.h"
@@ -75,6 +77,37 @@ static void jffs2_write_super(struct super_block *sb)
75 unlock_super(sb); 77 unlock_super(sb);
76} 78}
77 79
80static const char *jffs2_compr_name(unsigned int compr)
81{
82 switch (compr) {
83 case JFFS2_COMPR_MODE_NONE:
84 return "none";
85#ifdef CONFIG_JFFS2_LZO
86 case JFFS2_COMPR_MODE_FORCELZO:
87 return "lzo";
88#endif
89#ifdef CONFIG_JFFS2_ZLIB
90 case JFFS2_COMPR_MODE_FORCEZLIB:
91 return "zlib";
92#endif
93 default:
94 /* should never happen; programmer error */
95 WARN_ON(1);
96 return "";
97 }
98}
99
100static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt)
101{
102 struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb);
103 struct jffs2_mount_opts *opts = &c->mount_opts;
104
105 if (opts->override_compr)
106 seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr));
107
108 return 0;
109}
110
78static int jffs2_sync_fs(struct super_block *sb, int wait) 111static int jffs2_sync_fs(struct super_block *sb, int wait)
79{ 112{
80 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 113 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
@@ -133,6 +166,85 @@ static const struct export_operations jffs2_export_ops = {
133 .fh_to_parent = jffs2_fh_to_parent, 166 .fh_to_parent = jffs2_fh_to_parent,
134}; 167};
135 168
169/*
170 * JFFS2 mount options.
171 *
172 * Opt_override_compr: override default compressor
173 * Opt_err: just end of array marker
174 */
175enum {
176 Opt_override_compr,
177 Opt_err,
178};
179
180static const match_table_t tokens = {
181 {Opt_override_compr, "compr=%s"},
182 {Opt_err, NULL},
183};
184
185static int jffs2_parse_options(struct jffs2_sb_info *c, char *data)
186{
187 substring_t args[MAX_OPT_ARGS];
188 char *p, *name;
189
190 if (!data)
191 return 0;
192
193 while ((p = strsep(&data, ","))) {
194 int token;
195
196 if (!*p)
197 continue;
198
199 token = match_token(p, tokens, args);
200 switch (token) {
201 case Opt_override_compr:
202 name = match_strdup(&args[0]);
203
204 if (!name)
205 return -ENOMEM;
206 if (!strcmp(name, "none"))
207 c->mount_opts.compr = JFFS2_COMPR_MODE_NONE;
208#ifdef CONFIG_JFFS2_LZO
209 else if (!strcmp(name, "lzo"))
210 c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO;
211#endif
212#ifdef CONFIG_JFFS2_ZLIB
213 else if (!strcmp(name, "zlib"))
214 c->mount_opts.compr =
215 JFFS2_COMPR_MODE_FORCEZLIB;
216#endif
217 else {
218 printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"",
219 name);
220 kfree(name);
221 return -EINVAL;
222 }
223 kfree(name);
224 c->mount_opts.override_compr = true;
225 break;
226 default:
227 printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n",
228 p);
229 return -EINVAL;
230 }
231 }
232
233 return 0;
234}
235
236static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
237{
238 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
239 int err;
240
241 err = jffs2_parse_options(c, data);
242 if (err)
243 return -EINVAL;
244
245 return jffs2_do_remount_fs(sb, flags, data);
246}
247
136static const struct super_operations jffs2_super_operations = 248static const struct super_operations jffs2_super_operations =
137{ 249{
138 .alloc_inode = jffs2_alloc_inode, 250 .alloc_inode = jffs2_alloc_inode,
@@ -143,6 +255,7 @@ static const struct super_operations jffs2_super_operations =
143 .remount_fs = jffs2_remount_fs, 255 .remount_fs = jffs2_remount_fs,
144 .evict_inode = jffs2_evict_inode, 256 .evict_inode = jffs2_evict_inode,
145 .dirty_inode = jffs2_dirty_inode, 257 .dirty_inode = jffs2_dirty_inode,
258 .show_options = jffs2_show_options,
146 .sync_fs = jffs2_sync_fs, 259 .sync_fs = jffs2_sync_fs,
147}; 260};
148 261
@@ -166,6 +279,12 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
166 c->os_priv = sb; 279 c->os_priv = sb;
167 sb->s_fs_info = c; 280 sb->s_fs_info = c;
168 281
282 ret = jffs2_parse_options(c, data);
283 if (ret) {
284 kfree(c);
285 return -EINVAL;
286 }
287
169 /* Initialize JFFS2 superblock locks, the further initialization will 288 /* Initialize JFFS2 superblock locks, the further initialization will
170 * be done later */ 289 * be done later */
171 mutex_init(&c->alloc_sem); 290 mutex_init(&c->alloc_sem);
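Note: with jffs2_parse_options() wired into both jffs2_fill_super() and the new jffs2_remount_fs() wrapper, the compressor override can be given at mount or remount time and is echoed back through jffs2_show_options(). For example (device name is illustrative):

        mount -t jffs2 -o compr=lzo mtd:rootfs /mnt

An unknown compressor name makes the parser return -EINVAL, so the mount fails instead of silently falling back to the build-time default.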
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4515bea0268f..b09e51d2f81f 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -578,8 +578,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
578 if (!jffs2_is_writebuffered(c)) 578 if (!jffs2_is_writebuffered(c))
579 return 0; 579 return 0;
580 580
581 if (mutex_trylock(&c->alloc_sem)) { 581 if (!mutex_is_locked(&c->alloc_sem)) {
582 mutex_unlock(&c->alloc_sem);
583 printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); 582 printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n");
584 BUG(); 583 BUG();
585 } 584 }
@@ -1026,7 +1025,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
1026 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1025 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1027 struct mtd_oob_ops ops; 1026 struct mtd_oob_ops ops;
1028 1027
1029 ops.mode = MTD_OOB_AUTO; 1028 ops.mode = MTD_OPS_AUTO_OOB;
1030 ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; 1029 ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
1031 ops.oobbuf = c->oobbuf; 1030 ops.oobbuf = c->oobbuf;
1032 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1031 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
@@ -1069,7 +1068,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
1069 struct mtd_oob_ops ops; 1068 struct mtd_oob_ops ops;
1070 int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1069 int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1071 1070
1072 ops.mode = MTD_OOB_AUTO; 1071 ops.mode = MTD_OPS_AUTO_OOB;
1073 ops.ooblen = cmlen; 1072 ops.ooblen = cmlen;
1074 ops.oobbuf = c->oobbuf; 1073 ops.oobbuf = c->oobbuf;
1075 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1074 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
@@ -1095,7 +1094,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
1095 struct mtd_oob_ops ops; 1094 struct mtd_oob_ops ops;
1096 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1095 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1097 1096
1098 ops.mode = MTD_OOB_AUTO; 1097 ops.mode = MTD_OPS_AUTO_OOB;
1099 ops.ooblen = cmlen; 1098 ops.ooblen = cmlen;
1100 ops.oobbuf = (uint8_t *)&oob_cleanmarker; 1099 ops.oobbuf = (uint8_t *)&oob_cleanmarker;
1101 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1100 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 583636f745e5..cc5f811ed383 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -67,6 +67,7 @@
67#include <linux/buffer_head.h> /* for sync_blockdev() */ 67#include <linux/buffer_head.h> /* for sync_blockdev() */
68#include <linux/bio.h> 68#include <linux/bio.h>
69#include <linux/freezer.h> 69#include <linux/freezer.h>
70#include <linux/export.h>
70#include <linux/delay.h> 71#include <linux/delay.h>
71#include <linux/mutex.h> 72#include <linux/mutex.h>
72#include <linux/seq_file.h> 73#include <linux/seq_file.h>
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index f2697e4df109..e795c234ea33 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -13,6 +13,7 @@
13#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/module.h>
16#include <linux/mtd/mtd.h> 17#include <linux/mtd/mtd.h>
17#include <linux/statfs.h> 18#include <linux/statfs.h>
18#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 3f32bcb0d9bd..ef175cb8cfd8 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -16,38 +16,26 @@
16#include <linux/bitops.h> 16#include <linux/bitops.h>
17#include <linux/sched.h> 17#include <linux/sched.h>
18 18
19static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 };
20
21static DEFINE_SPINLOCK(bitmap_lock); 19static DEFINE_SPINLOCK(bitmap_lock);
22 20
23static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) 21/*
22 * bitmap consists of blocks filled with 16bit words
23 * bit set == busy, bit clear == free
24 * endianness is a mess, but for counting zero bits it really doesn't matter...
25 */
26static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits)
24{ 27{
25 unsigned i, j, sum = 0; 28 __u32 sum = 0;
26 struct buffer_head *bh; 29 unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8);
27
28 for (i=0; i<numblocks-1; i++) {
29 if (!(bh=map[i]))
30 return(0);
31 for (j=0; j<bh->b_size; j++)
32 sum += nibblemap[bh->b_data[j] & 0xf]
33 + nibblemap[(bh->b_data[j]>>4) & 0xf];
34 }
35 30
36 if (numblocks==0 || !(bh=map[numblocks-1])) 31 while (blocks--) {
37 return(0); 32 unsigned words = blocksize / 2;
38 i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2; 33 __u16 *p = (__u16 *)(*map++)->b_data;
39 for (j=0; j<i; j++) { 34 while (words--)
40 sum += nibblemap[bh->b_data[j] & 0xf] 35 sum += 16 - hweight16(*p++);
41 + nibblemap[(bh->b_data[j]>>4) & 0xf];
42 } 36 }
43 37
44 i = numbits%16; 38 return sum;
45 if (i!=0) {
46 i = *(__u16 *)(&bh->b_data[j]) | ~((1<<i) - 1);
47 sum += nibblemap[i & 0xf] + nibblemap[(i>>4) & 0xf];
48 sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf];
49 }
50 return(sum);
51} 39}
52 40
53void minix_free_block(struct inode *inode, unsigned long block) 41void minix_free_block(struct inode *inode, unsigned long block)
@@ -105,10 +93,12 @@ int minix_new_block(struct inode * inode)
105 return 0; 93 return 0;
106} 94}
107 95
108unsigned long minix_count_free_blocks(struct minix_sb_info *sbi) 96unsigned long minix_count_free_blocks(struct super_block *sb)
109{ 97{
110 return (count_free(sbi->s_zmap, sbi->s_zmap_blocks, 98 struct minix_sb_info *sbi = minix_sb(sb);
111 sbi->s_nzones - sbi->s_firstdatazone + 1) 99 u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1);
100
101 return (count_free(sbi->s_zmap, sb->s_blocksize, bits)
112 << sbi->s_log_zone_size); 102 << sbi->s_log_zone_size);
113} 103}
114 104
@@ -273,7 +263,10 @@ struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
273 return inode; 263 return inode;
274} 264}
275 265
276unsigned long minix_count_free_inodes(struct minix_sb_info *sbi) 266unsigned long minix_count_free_inodes(struct super_block *sb)
277{ 267{
278 return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1); 268 struct minix_sb_info *sbi = minix_sb(sb);
269 u32 bits = sbi->s_ninodes + 1;
270
271 return count_free(sbi->s_imap, sb->s_blocksize, bits);
279} 272}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 64cdcd662ffc..1d9e33966db0 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -279,6 +279,27 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
279 else if (sbi->s_mount_state & MINIX_ERROR_FS) 279 else if (sbi->s_mount_state & MINIX_ERROR_FS)
280 printk("MINIX-fs: mounting file system with errors, " 280 printk("MINIX-fs: mounting file system with errors, "
281 "running fsck is recommended\n"); 281 "running fsck is recommended\n");
282
283 /* Apparently minix can create filesystems that allocate more blocks for
284 * the bitmaps than needed. We simply ignore that, but verify it didn't
285 * create one with not enough blocks and bail out if so.
286 */
287 block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize);
288 if (sbi->s_imap_blocks < block) {
289 printk("MINIX-fs: file system does not have enough "
290 "imap blocks allocated. Refusing to mount\n");
291 goto out_iput;
292 }
293
294 block = minix_blocks_needed(
295 (sbi->s_nzones - (sbi->s_firstdatazone + 1)),
296 s->s_blocksize);
297 if (sbi->s_zmap_blocks < block) {
298 printk("MINIX-fs: file system does not have enough "
299 "zmap blocks allocated. Refusing to mount.\n");
300 goto out_iput;
301 }
302
282 return 0; 303 return 0;
283 304
284out_iput: 305out_iput:
@@ -339,10 +360,10 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
339 buf->f_type = sb->s_magic; 360 buf->f_type = sb->s_magic;
340 buf->f_bsize = sb->s_blocksize; 361 buf->f_bsize = sb->s_blocksize;
341 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; 362 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
342 buf->f_bfree = minix_count_free_blocks(sbi); 363 buf->f_bfree = minix_count_free_blocks(sb);
343 buf->f_bavail = buf->f_bfree; 364 buf->f_bavail = buf->f_bfree;
344 buf->f_files = sbi->s_ninodes; 365 buf->f_files = sbi->s_ninodes;
345 buf->f_ffree = minix_count_free_inodes(sbi); 366 buf->f_ffree = minix_count_free_inodes(sb);
346 buf->f_namelen = sbi->s_namelen; 367 buf->f_namelen = sbi->s_namelen;
347 buf->f_fsid.val[0] = (u32)id; 368 buf->f_fsid.val[0] = (u32)id;
348 buf->f_fsid.val[1] = (u32)(id >> 32); 369 buf->f_fsid.val[1] = (u32)(id >> 32);
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 341e2122879a..26bbd55e82ea 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -48,10 +48,10 @@ extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, stru
48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); 48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
49extern struct inode * minix_new_inode(const struct inode *, int, int *); 49extern struct inode * minix_new_inode(const struct inode *, int, int *);
50extern void minix_free_inode(struct inode * inode); 50extern void minix_free_inode(struct inode * inode);
51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); 51extern unsigned long minix_count_free_inodes(struct super_block *sb);
52extern int minix_new_block(struct inode * inode); 52extern int minix_new_block(struct inode * inode);
53extern void minix_free_block(struct inode *inode, unsigned long block); 53extern void minix_free_block(struct inode *inode, unsigned long block);
54extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); 54extern unsigned long minix_count_free_blocks(struct super_block *sb);
55extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); 55extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *);
56extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); 56extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
57 57
@@ -88,6 +88,11 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
88 return list_entry(inode, struct minix_inode_info, vfs_inode); 88 return list_entry(inode, struct minix_inode_info, vfs_inode);
89} 89}
90 90
91static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize)
92{
93 return DIV_ROUND_UP(bits, blocksize * 8);
94}
95
91#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ 96#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \
92 defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) 97 defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
93 98
@@ -125,7 +130,7 @@ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size)
125 if (!size) 130 if (!size)
126 return 0; 131 return 0;
127 132
128 size = (size >> 4) + ((size & 15) > 0); 133 size >>= 4;
129 while (*p++ == 0xffff) { 134 while (*p++ == 0xffff) {
130 if (--size == 0) 135 if (--size == 0)
131 return (p - addr) << 4; 136 return (p - addr) << 4;
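Note: minix_blocks_needed() above is just the bitmap-block count that the sanity check added to minix_fill_super() earlier in this diff relies on. With a 1024-byte block size each bitmap block covers 8192 bits, so, for instance, an image advertising 65535 inodes needs DIV_ROUND_UP(65535, 8192) = 8 imap blocks, and a superblock claiming fewer s_imap_blocks than that is now refused at mount time.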
diff --git a/fs/namei.c b/fs/namei.c
index ac6d214da827..5008f01787f5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -852,7 +852,7 @@ static int follow_managed(struct path *path, unsigned flags)
852 mntput(path->mnt); 852 mntput(path->mnt);
853 if (ret == -EISDIR) 853 if (ret == -EISDIR)
854 ret = 0; 854 ret = 0;
855 return ret; 855 return ret < 0 ? ret : need_mntput;
856} 856}
857 857
858int follow_down_one(struct path *path) 858int follow_down_one(struct path *path)
@@ -900,6 +900,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
900 break; 900 break;
901 path->mnt = mounted; 901 path->mnt = mounted;
902 path->dentry = mounted->mnt_root; 902 path->dentry = mounted->mnt_root;
903 nd->flags |= LOOKUP_JUMPED;
903 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 904 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
904 /* 905 /*
905 * Update the inode too. We don't need to re-check the 906 * Update the inode too. We don't need to re-check the
@@ -1213,6 +1214,8 @@ retry:
1213 path_put_conditional(path, nd); 1214 path_put_conditional(path, nd);
1214 return err; 1215 return err;
1215 } 1216 }
1217 if (err)
1218 nd->flags |= LOOKUP_JUMPED;
1216 *inode = path->dentry->d_inode; 1219 *inode = path->dentry->d_inode;
1217 return 0; 1220 return 0;
1218} 1221}
@@ -2146,6 +2149,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2146 } 2149 }
2147 2150
2148 /* create side of things */ 2151 /* create side of things */
2152 /*
2153 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been
2154 * cleared when we got to the last component we are about to look up
2155 */
2149 error = complete_walk(nd); 2156 error = complete_walk(nd);
2150 if (error) 2157 if (error)
2151 return ERR_PTR(error); 2158 return ERR_PTR(error);
@@ -2214,6 +2221,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2214 if (error < 0) 2221 if (error < 0)
2215 goto exit_dput; 2222 goto exit_dput;
2216 2223
2224 if (error)
2225 nd->flags |= LOOKUP_JUMPED;
2226
2217 error = -ENOENT; 2227 error = -ENOENT;
2218 if (!path->dentry->d_inode) 2228 if (!path->dentry->d_inode)
2219 goto exit_dput; 2229 goto exit_dput;
@@ -2223,6 +2233,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2223 2233
2224 path_to_nameidata(path, nd); 2234 path_to_nameidata(path, nd);
2225 nd->inode = path->dentry->d_inode; 2235 nd->inode = path->dentry->d_inode;
2236 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
2237 error = complete_walk(nd);
2238 if (error)
2239 goto exit;
2226 error = -EISDIR; 2240 error = -EISDIR;
2227 if (S_ISDIR(nd->inode->i_mode)) 2241 if (S_ISDIR(nd->inode->i_mode))
2228 goto exit; 2242 goto exit;
diff --git a/fs/namespace.c b/fs/namespace.c
index e5e1c7d1839b..50ee30345b4f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2483,11 +2483,41 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
2483 __mnt_make_longterm(mnt); 2483 __mnt_make_longterm(mnt);
2484 new_ns->root = mnt; 2484 new_ns->root = mnt;
2485 list_add(&new_ns->list, &new_ns->root->mnt_list); 2485 list_add(&new_ns->list, &new_ns->root->mnt_list);
2486 } else {
2487 mntput(mnt);
2486 } 2488 }
2487 return new_ns; 2489 return new_ns;
2488} 2490}
2489EXPORT_SYMBOL(create_mnt_ns); 2491EXPORT_SYMBOL(create_mnt_ns);
2490 2492
2493struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
2494{
2495 struct mnt_namespace *ns;
2496 struct path path;
2497 int err;
2498
2499 ns = create_mnt_ns(mnt);
2500 if (IS_ERR(ns))
2501 return ERR_CAST(ns);
2502
2503 err = vfs_path_lookup(mnt->mnt_root, mnt,
2504 name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
2505
2506 put_mnt_ns(ns);
2507
2508 if (err)
2509 return ERR_PTR(err);
2510
2511 /* trade a vfsmount reference for active sb one */
2512 atomic_inc(&path.mnt->mnt_sb->s_active);
2513 mntput(path.mnt);
2514 /* lock the sucker */
2515 down_write(&path.mnt->mnt_sb->s_umount);
2516 /* ... and return the root of (sub)tree on it */
2517 return path.dentry;
2518}
2519EXPORT_SYMBOL(mount_subtree);
2520
2491SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, 2521SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2492 char __user *, type, unsigned long, flags, void __user *, data) 2522 char __user *, type, unsigned long, flags, void __user *, data)
2493{ 2523{
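Note: mount_subtree() takes over the vfsmount reference it is given and, on success, hands back the dentry of the named sub-path with the superblock's s_active raised and s_umount held, as the comments in the hunk spell out. A hedged sketch of a caller, modelled on the nfs_follow_remote_path() conversion further down (names are illustrative):

/* Illustrative only: internally mount a filesystem and return the root
 * dentry of one of its subdirectories. */
static struct dentry *example_mount_sub(struct file_system_type *fs_type,
                                        int flags, const char *dev_name,
                                        void *data, const char *subpath)
{
        struct vfsmount *mnt = vfs_kern_mount(fs_type, flags, dev_name, data);

        if (IS_ERR(mnt))
                return ERR_CAST(mnt);

        /* consumes mnt; errors come back as ERR_PTR() */
        return mount_subtree(mnt, subpath);
}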
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 12185aadb349..a62d36b9a99e 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -31,6 +31,7 @@
31 31
32#include <linux/nfs_fs.h> 32#include <linux/nfs_fs.h>
33#include <linux/nfs_page.h> 33#include <linux/nfs_page.h>
34#include <linux/module.h>
34 35
35#include "internal.h" 36#include "internal.h"
36#include "nfs4filelayout.h" 37#include "nfs4filelayout.h"
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 0a5ff5c19511..5668f7c54c41 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -18,6 +18,7 @@
18#include <linux/nfs_page.h> 18#include <linux/nfs_page.h>
19#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21#include <linux/export.h>
21 22
22#include "internal.h" 23#include "internal.h"
23#include "pnfs.h" 24#include "pnfs.h"
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e40a3ca22eaf..8e672a2b2d69 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -29,6 +29,7 @@
29 29
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include <linux/nfs_page.h> 31#include <linux/nfs_page.h>
32#include <linux/module.h>
32#include "internal.h" 33#include "internal.h"
33#include "pnfs.h" 34#include "pnfs.h"
34#include "iostat.h" 35#include "iostat.h"
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6fda5228ef56..4f359d2a26eb 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -28,6 +28,7 @@
28 * such damages. 28 * such damages.
29 */ 29 */
30 30
31#include <linux/export.h>
31#include "pnfs.h" 32#include "pnfs.h"
32 33
33#define NFSDBG_FACILITY NFSDBG_PNFS 34#define NFSDBG_FACILITY NFSDBG_PNFS
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 480b3b6bf71e..134777406ee3 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2787,43 +2787,18 @@ static void nfs_referral_loop_unprotect(void)
2787static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, 2787static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2788 const char *export_path) 2788 const char *export_path)
2789{ 2789{
2790 struct mnt_namespace *ns_private;
2791 struct super_block *s;
2792 struct dentry *dentry; 2790 struct dentry *dentry;
2793 struct path path; 2791 int ret = nfs_referral_loop_protect();
2794 int ret;
2795
2796 ns_private = create_mnt_ns(root_mnt);
2797 ret = PTR_ERR(ns_private);
2798 if (IS_ERR(ns_private))
2799 goto out_mntput;
2800
2801 ret = nfs_referral_loop_protect();
2802 if (ret != 0)
2803 goto out_put_mnt_ns;
2804 2792
2805 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, 2793 if (ret) {
2806 export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); 2794 mntput(root_mnt);
2795 return ERR_PTR(ret);
2796 }
2807 2797
2798 dentry = mount_subtree(root_mnt, export_path);
2808 nfs_referral_loop_unprotect(); 2799 nfs_referral_loop_unprotect();
2809 put_mnt_ns(ns_private);
2810
2811 if (ret != 0)
2812 goto out_err;
2813
2814 s = path.mnt->mnt_sb;
2815 atomic_inc(&s->s_active);
2816 dentry = dget(path.dentry);
2817 2800
2818 path_put(&path);
2819 down_write(&s->s_umount);
2820 return dentry; 2801 return dentry;
2821out_put_mnt_ns:
2822 put_mnt_ns(ns_private);
2823out_mntput:
2824 mntput(root_mnt);
2825out_err:
2826 return ERR_PTR(ret);
2827} 2802}
2828 2803
2829static struct dentry *nfs4_try_mount(int flags, const char *dev_name, 2804static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b016b8a36399..1dda78db6a73 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -20,6 +20,7 @@
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21#include <linux/nfs_page.h> 21#include <linux/nfs_page.h>
22#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
23#include <linux/export.h>
23 24
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25 26
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index ad88f1c0a4c3..9c51aff02ae2 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,6 +36,7 @@
36 36
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
39#include <linux/export.h>
39#include "acl.h" 40#include "acl.h"
40 41
41 42
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index db34a585e112..c45a2ea4a090 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -13,6 +13,7 @@
13#include <linux/sunrpc/clnt.h> 13#include <linux/sunrpc/clnt.h>
14#include <linux/sunrpc/gss_api.h> 14#include <linux/sunrpc/gss_api.h>
15#include <linux/sunrpc/gss_krb5_enctypes.h> 15#include <linux/sunrpc/gss_krb5_enctypes.h>
16#include <linux/module.h>
16 17
17#include "idmap.h" 18#include "idmap.h"
18#include "nfsd.h" 19#include "nfsd.h"
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 52cd976b6099..eda7d7e55e05 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/freezer.h> 10#include <linux/freezer.h>
11#include <linux/module.h>
11#include <linux/fs_struct.h> 12#include <linux/fs_struct.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13 14
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index db5ee4b4f47a..ad7d0c155de4 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -59,6 +59,7 @@
59#include <linux/idr.h> 59#include <linux/idr.h>
60#include <linux/kref.h> 60#include <linux/kref.h>
61#include <linux/net.h> 61#include <linux/net.h>
62#include <linux/export.h>
62#include <net/tcp.h> 63#include <net/tcp.h>
63 64
64#include <asm/uaccess.h> 65#include <asm/uaccess.h>
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 56f82cb912e3..0e28e242226d 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -30,6 +30,7 @@
30#include <linux/sysctl.h> 30#include <linux/sysctl.h>
31#include <linux/spinlock.h> 31#include <linux/spinlock.h>
32#include <linux/debugfs.h> 32#include <linux/debugfs.h>
33#include <linux/export.h>
33 34
34#include "cluster/heartbeat.h" 35#include "cluster/heartbeat.h"
35#include "cluster/nodemanager.h" 36#include "cluster/nodemanager.h"
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2db1bd3173b2..851ba3dcdc29 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1652,46 +1652,12 @@ out:
1652 return error; 1652 return error;
1653} 1653}
1654 1654
1655static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
1656 struct kstat *stat)
1657{
1658 struct inode *inode = dentry->d_inode;
1659 struct task_struct *task = get_proc_task(inode);
1660 int rc;
1661
1662 if (task == NULL)
1663 return -ESRCH;
1664
1665 rc = -EACCES;
1666 if (lock_trace(task))
1667 goto out_task;
1668
1669 generic_fillattr(inode, stat);
1670 unlock_trace(task);
1671 rc = 0;
1672out_task:
1673 put_task_struct(task);
1674 return rc;
1675}
1676
1677static const struct inode_operations proc_pid_link_inode_operations = { 1655static const struct inode_operations proc_pid_link_inode_operations = {
1678 .readlink = proc_pid_readlink, 1656 .readlink = proc_pid_readlink,
1679 .follow_link = proc_pid_follow_link, 1657 .follow_link = proc_pid_follow_link,
1680 .setattr = proc_setattr, 1658 .setattr = proc_setattr,
1681}; 1659};
1682 1660
1683static const struct inode_operations proc_fdinfo_link_inode_operations = {
1684 .setattr = proc_setattr,
1685 .getattr = proc_pid_fd_link_getattr,
1686};
1687
1688static const struct inode_operations proc_fd_link_inode_operations = {
1689 .readlink = proc_pid_readlink,
1690 .follow_link = proc_pid_follow_link,
1691 .setattr = proc_setattr,
1692 .getattr = proc_pid_fd_link_getattr,
1693};
1694
1695 1661
1696/* building an inode */ 1662/* building an inode */
1697 1663
@@ -1923,61 +1889,49 @@ out:
1923 1889
1924static int proc_fd_info(struct inode *inode, struct path *path, char *info) 1890static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1925{ 1891{
1926 struct task_struct *task; 1892 struct task_struct *task = get_proc_task(inode);
1927 struct files_struct *files; 1893 struct files_struct *files = NULL;
1928 struct file *file; 1894 struct file *file;
1929 int fd = proc_fd(inode); 1895 int fd = proc_fd(inode);
1930 int rc;
1931
1932 task = get_proc_task(inode);
1933 if (!task)
1934 return -ENOENT;
1935
1936 rc = -EACCES;
1937 if (lock_trace(task))
1938 goto out_task;
1939
1940 rc = -ENOENT;
1941 files = get_files_struct(task);
1942 if (files == NULL)
1943 goto out_unlock;
1944 1896
1945 /* 1897 if (task) {
1946 * We are not taking a ref to the file structure, so we must 1898 files = get_files_struct(task);
1947 * hold ->file_lock. 1899 put_task_struct(task);
1948 */ 1900 }
1949 spin_lock(&files->file_lock); 1901 if (files) {
1950 file = fcheck_files(files, fd); 1902 /*
1951 if (file) { 1903 * We are not taking a ref to the file structure, so we must
1952 unsigned int f_flags; 1904 * hold ->file_lock.
1953 struct fdtable *fdt; 1905 */
1954 1906 spin_lock(&files->file_lock);
1955 fdt = files_fdtable(files); 1907 file = fcheck_files(files, fd);
1956 f_flags = file->f_flags & ~O_CLOEXEC; 1908 if (file) {
1957 if (FD_ISSET(fd, fdt->close_on_exec)) 1909 unsigned int f_flags;
1958 f_flags |= O_CLOEXEC; 1910 struct fdtable *fdt;
1959 1911
1960 if (path) { 1912 fdt = files_fdtable(files);
1961 *path = file->f_path; 1913 f_flags = file->f_flags & ~O_CLOEXEC;
1962 path_get(&file->f_path); 1914 if (FD_ISSET(fd, fdt->close_on_exec))
1915 f_flags |= O_CLOEXEC;
1916
1917 if (path) {
1918 *path = file->f_path;
1919 path_get(&file->f_path);
1920 }
1921 if (info)
1922 snprintf(info, PROC_FDINFO_MAX,
1923 "pos:\t%lli\n"
1924 "flags:\t0%o\n",
1925 (long long) file->f_pos,
1926 f_flags);
1927 spin_unlock(&files->file_lock);
1928 put_files_struct(files);
1929 return 0;
1963 } 1930 }
1964 if (info) 1931 spin_unlock(&files->file_lock);
1965 snprintf(info, PROC_FDINFO_MAX, 1932 put_files_struct(files);
1966 "pos:\t%lli\n" 1933 }
1967 "flags:\t0%o\n", 1934 return -ENOENT;
1968 (long long) file->f_pos,
1969 f_flags);
1970 rc = 0;
1971 } else
1972 rc = -ENOENT;
1973 spin_unlock(&files->file_lock);
1974 put_files_struct(files);
1975
1976out_unlock:
1977 unlock_trace(task);
1978out_task:
1979 put_task_struct(task);
1980 return rc;
1981} 1935}
1982 1936
1983static int proc_fd_link(struct inode *inode, struct path *path) 1937static int proc_fd_link(struct inode *inode, struct path *path)
@@ -2072,7 +2026,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
2072 spin_unlock(&files->file_lock); 2026 spin_unlock(&files->file_lock);
2073 put_files_struct(files); 2027 put_files_struct(files);
2074 2028
2075 inode->i_op = &proc_fd_link_inode_operations; 2029 inode->i_op = &proc_pid_link_inode_operations;
2076 inode->i_size = 64; 2030 inode->i_size = 64;
2077 ei->op.proc_get_link = proc_fd_link; 2031 ei->op.proc_get_link = proc_fd_link;
2078 d_set_d_op(dentry, &tid_fd_dentry_operations); 2032 d_set_d_op(dentry, &tid_fd_dentry_operations);
@@ -2104,12 +2058,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
2104 if (fd == ~0U) 2058 if (fd == ~0U)
2105 goto out; 2059 goto out;
2106 2060
2107 result = ERR_PTR(-EACCES);
2108 if (lock_trace(task))
2109 goto out;
2110
2111 result = instantiate(dir, dentry, task, &fd); 2061 result = instantiate(dir, dentry, task, &fd);
2112 unlock_trace(task);
2113out: 2062out:
2114 put_task_struct(task); 2063 put_task_struct(task);
2115out_no_task: 2064out_no_task:
@@ -2129,28 +2078,23 @@ static int proc_readfd_common(struct file * filp, void * dirent,
2129 retval = -ENOENT; 2078 retval = -ENOENT;
2130 if (!p) 2079 if (!p)
2131 goto out_no_task; 2080 goto out_no_task;
2132
2133 retval = -EACCES;
2134 if (lock_trace(p))
2135 goto out;
2136
2137 retval = 0; 2081 retval = 0;
2138 2082
2139 fd = filp->f_pos; 2083 fd = filp->f_pos;
2140 switch (fd) { 2084 switch (fd) {
2141 case 0: 2085 case 0:
2142 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) 2086 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
2143 goto out_unlock; 2087 goto out;
2144 filp->f_pos++; 2088 filp->f_pos++;
2145 case 1: 2089 case 1:
2146 ino = parent_ino(dentry); 2090 ino = parent_ino(dentry);
2147 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) 2091 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
2148 goto out_unlock; 2092 goto out;
2149 filp->f_pos++; 2093 filp->f_pos++;
2150 default: 2094 default:
2151 files = get_files_struct(p); 2095 files = get_files_struct(p);
2152 if (!files) 2096 if (!files)
2153 goto out_unlock; 2097 goto out;
2154 rcu_read_lock(); 2098 rcu_read_lock();
2155 for (fd = filp->f_pos-2; 2099 for (fd = filp->f_pos-2;
2156 fd < files_fdtable(files)->max_fds; 2100 fd < files_fdtable(files)->max_fds;
@@ -2174,9 +2118,6 @@ static int proc_readfd_common(struct file * filp, void * dirent,
2174 rcu_read_unlock(); 2118 rcu_read_unlock();
2175 put_files_struct(files); 2119 put_files_struct(files);
2176 } 2120 }
2177
2178out_unlock:
2179 unlock_trace(p);
2180out: 2121out:
2181 put_task_struct(p); 2122 put_task_struct(p);
2182out_no_task: 2123out_no_task:
@@ -2254,7 +2195,6 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
2254 ei->fd = fd; 2195 ei->fd = fd;
2255 inode->i_mode = S_IFREG | S_IRUSR; 2196 inode->i_mode = S_IFREG | S_IRUSR;
2256 inode->i_fop = &proc_fdinfo_file_operations; 2197 inode->i_fop = &proc_fdinfo_file_operations;
2257 inode->i_op = &proc_fdinfo_link_inode_operations;
2258 d_set_d_op(dentry, &tid_fd_dentry_operations); 2198 d_set_d_op(dentry, &tid_fd_dentry_operations);
2259 d_add(dentry, inode); 2199 d_add(dentry, inode);
2260 /* Close the race of the process dying before we return the dentry */ 2200 /* Close the race of the process dying before we return the dentry */
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cd99bf557650..b0f450a2bb7c 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
12#include <linux/user.h> 12#include <linux/user.h>
13#include <linux/elf.h> 13#include <linux/elf.h>
14#include <linux/elfcore.h> 14#include <linux/elfcore.h>
15#include <linux/export.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/highmem.h> 17#include <linux/highmem.h>
17#include <linux/bootmem.h> 18#include <linux/bootmem.h>
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index aae0edb95c6c..35f4b0ecdeb3 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
286 /* caller already holds s_umount */ 286 /* caller already holds s_umount */
287 if (sb->s_flags & MS_RDONLY) 287 if (sb->s_flags & MS_RDONLY)
288 return -EROFS; 288 return -EROFS;
289 writeback_inodes_sb(sb); 289 writeback_inodes_sb(sb, WB_REASON_SYNC);
290 return 0; 290 return 0;
291 default: 291 default:
292 return -EINVAL; 292 return -EINVAL;
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 048b59d5b2f0..c70111ebefd4 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -78,6 +78,28 @@ config SQUASHFS_XZ
78 78
79 If unsure, say N. 79 If unsure, say N.
80 80
81config SQUASHFS_4K_DEVBLK_SIZE
82 bool "Use 4K device block size?"
83 depends on SQUASHFS
84 help
85 By default Squashfs sets the dev block size (sb_min_blocksize)
86 to 1K or the smallest block size supported by the block device
87 (if larger). This, because blocks are packed together and
88 unaligned in Squashfs, should reduce latency.
89
90 This, however, gives poor performance on MTD NAND devices where
91 the optimal I/O size is 4K (even though the devices can support
92 smaller block sizes).
93
94 Using a 4K device block size may also improve overall I/O
95 performance for some file access patterns (e.g. sequential
96 accesses of files in filesystem order) on all media.
97
98 Setting this option will force Squashfs to use a 4K device block
99 size by default.
100
101 If unsure, say N.
102
81config SQUASHFS_EMBEDDED 103config SQUASHFS_EMBEDDED
82 bool "Additional option for memory-constrained systems" 104 bool "Additional option for memory-constrained systems"
83 depends on SQUASHFS 105 depends on SQUASHFS
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index b4a4e539a08c..e8e14645de9a 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -36,6 +36,13 @@
36#define SQUASHFS_FILE_SIZE 131072 36#define SQUASHFS_FILE_SIZE 131072
37#define SQUASHFS_FILE_LOG 17 37#define SQUASHFS_FILE_LOG 17
38 38
39/* default size of block device I/O */
40#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE
41#define SQUASHFS_DEVBLK_SIZE 4096
42#else
43#define SQUASHFS_DEVBLK_SIZE 1024
44#endif
45
39#define SQUASHFS_FILE_MAX_SIZE 1048576 46#define SQUASHFS_FILE_MAX_SIZE 1048576
40#define SQUASHFS_FILE_MAX_LOG 20 47#define SQUASHFS_FILE_MAX_LOG 20
41 48
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 7438850c62d0..2da1715452ac 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -95,7 +95,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
95 } 95 }
96 msblk = sb->s_fs_info; 96 msblk = sb->s_fs_info;
97 97
98 msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); 98 msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
99 msblk->devblksize_log2 = ffz(~msblk->devblksize); 99 msblk->devblksize_log2 = ffz(~msblk->devblksize);
100 100
101 mutex_init(&msblk->read_data_mutex); 101 mutex_init(&msblk->read_data_mutex);
diff --git a/fs/statfs.c b/fs/statfs.c
index 8244924dec55..9cf04a118965 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -76,7 +76,7 @@ EXPORT_SYMBOL(vfs_statfs);
76int user_statfs(const char __user *pathname, struct kstatfs *st) 76int user_statfs(const char __user *pathname, struct kstatfs *st)
77{ 77{
78 struct path path; 78 struct path path;
79 int error = user_path(pathname, &path); 79 int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
80 if (!error) { 80 if (!error) {
81 error = vfs_statfs(&path, st); 81 error = vfs_statfs(&path, st);
82 path_put(&path); 82 path_put(&path);
diff --git a/fs/sync.c b/fs/sync.c
index c98a7477edfd..101b8ef901d7 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
43 if (wait) 43 if (wait)
44 sync_inodes_sb(sb); 44 sync_inodes_sb(sb);
45 else 45 else
46 writeback_inodes_sb(sb); 46 writeback_inodes_sb(sb, WB_REASON_SYNC);
47 47
48 if (sb->s_op->sync_fs) 48 if (sb->s_op->sync_fs)
49 sb->s_op->sync_fs(sb, wait); 49 sb->s_op->sync_fs(sb, wait);
@@ -98,7 +98,7 @@ static void sync_filesystems(int wait)
98 */ 98 */
99SYSCALL_DEFINE0(sync) 99SYSCALL_DEFINE0(sync)
100{ 100{
101 wakeup_flusher_threads(0); 101 wakeup_flusher_threads(0, WB_REASON_SYNC);
102 sync_filesystems(0); 102 sync_filesystems(0);
103 sync_filesystems(1); 103 sync_filesystems(1);
104 if (unlikely(laptop_mode)) 104 if (unlikely(laptop_mode))
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 315de66e52b2..bc4f94b28706 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -63,7 +63,7 @@
63static void shrink_liability(struct ubifs_info *c, int nr_to_write) 63static void shrink_liability(struct ubifs_info *c, int nr_to_write)
64{ 64{
65 down_read(&c->vfs_sb->s_umount); 65 down_read(&c->vfs_sb->s_umount);
66 writeback_inodes_sb(c->vfs_sb); 66 writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE);
67 up_read(&c->vfs_sb->s_umount); 67 up_read(&c->vfs_sb->s_umount);
68} 68}
69 69
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index eef109a1a927..b09ba2dd8b62 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -870,6 +870,22 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
870 spin_unlock(&dbg_lock); 870 spin_unlock(&dbg_lock);
871} 871}
872 872
873void dbg_dump_sleb(const struct ubifs_info *c,
874 const struct ubifs_scan_leb *sleb, int offs)
875{
876 struct ubifs_scan_node *snod;
877
878 printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n",
879 current->pid, sleb->lnum, offs);
880
881 list_for_each_entry(snod, &sleb->nodes, list) {
882 cond_resched();
883 printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum,
884 snod->offs, snod->len);
885 dbg_dump_node(c, snod->node);
886 }
887}
888
873void dbg_dump_leb(const struct ubifs_info *c, int lnum) 889void dbg_dump_leb(const struct ubifs_info *c, int lnum)
874{ 890{
875 struct ubifs_scan_leb *sleb; 891 struct ubifs_scan_leb *sleb;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index feb361e252ac..8d9c46810189 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -269,6 +269,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
269void dbg_dump_lprops(struct ubifs_info *c); 269void dbg_dump_lprops(struct ubifs_info *c);
270void dbg_dump_lpt_info(struct ubifs_info *c); 270void dbg_dump_lpt_info(struct ubifs_info *c);
271void dbg_dump_leb(const struct ubifs_info *c, int lnum); 271void dbg_dump_leb(const struct ubifs_info *c, int lnum);
272void dbg_dump_sleb(const struct ubifs_info *c,
273 const struct ubifs_scan_leb *sleb, int offs);
272void dbg_dump_znode(const struct ubifs_info *c, 274void dbg_dump_znode(const struct ubifs_info *c,
273 const struct ubifs_znode *znode); 275 const struct ubifs_znode *znode);
274void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); 276void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat);
@@ -387,6 +389,9 @@ static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; }
387static inline void dbg_dump_leb(const struct ubifs_info *c, 389static inline void dbg_dump_leb(const struct ubifs_info *c,
388 int lnum) { return; } 390 int lnum) { return; }
389static inline void 391static inline void
392dbg_dump_sleb(const struct ubifs_info *c,
393 const struct ubifs_scan_leb *sleb, int offs) { return; }
394static inline void
390dbg_dump_znode(const struct ubifs_info *c, 395dbg_dump_znode(const struct ubifs_info *c,
391 const struct ubifs_znode *znode) { return; } 396 const struct ubifs_znode *znode) { return; }
392static inline void dbg_dump_heap(struct ubifs_info *c, 397static inline void dbg_dump_heap(struct ubifs_info *c,
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index af02790d9328..ee4f43f4bb99 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -983,7 +983,7 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf)
983} 983}
984 984
985/** 985/**
986 * clean_an_unclean_leb - read and write a LEB to remove corruption. 986 * clean_an_unclean_leb - read and write a LEB to remove corruption.
987 * @c: UBIFS file-system description object 987 * @c: UBIFS file-system description object
988 * @ucleb: unclean LEB information 988 * @ucleb: unclean LEB information
989 * @sbuf: LEB-sized buffer to use 989 * @sbuf: LEB-sized buffer to use
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 93d938ad3d2a..6094c5a5d7a8 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c)
247 mst->total_dirty = cpu_to_le64(tmp64); 247 mst->total_dirty = cpu_to_le64(tmp64);
248 248
249 /* The indexing LEB does not contribute to dark space */ 249 /* The indexing LEB does not contribute to dark space */
250 tmp64 = (c->main_lebs - 1) * c->dark_wm; 250 tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm);
251 mst->total_dark = cpu_to_le64(tmp64); 251 mst->total_dark = cpu_to_le64(tmp64);
252 252
253 mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); 253 mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
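Note: the added (long long) cast above matters because main_lebs and dark_wm are 32-bit values, so the old expression multiplied them in 32-bit arithmetic before widening for cpu_to_le64(). As a rough illustration, a hypothetical volume with 300,000 main LEBs and dark_wm = 8192 would put the product at 2,457,600,000, past INT_MAX (2,147,483,647), so total_dark would have wrapped; forcing one operand to long long keeps the multiply in 64 bits.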
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 33b13310ee0c..574d4ee9b625 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -189,7 +189,7 @@ xfs_end_io(
189 int error = 0; 189 int error = 0;
190 190
191 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 191 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
192 error = -EIO; 192 ioend->io_error = -EIO;
193 goto done; 193 goto done;
194 } 194 }
195 if (ioend->io_error) 195 if (ioend->io_error)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1a3513881bce..eac97ef81e2a 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -656,7 +656,7 @@ xfs_buf_item_committing(
 /*
  * This is the ops vector shared by all buf log items.
  */
-static struct xfs_item_ops xfs_buf_item_ops = {
+static const struct xfs_item_ops xfs_buf_item_ops = {
         .iop_size       = xfs_buf_item_size,
         .iop_format     = xfs_buf_item_format,
         .iop_pin        = xfs_buf_item_pin,
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index bb3f71d236d2..0dee0b71029d 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -295,7 +295,7 @@ xfs_qm_dquot_logitem_committing(
 /*
  * This is the ops vector for dquots
  */
-static struct xfs_item_ops xfs_dquot_item_ops = {
+static const struct xfs_item_ops xfs_dquot_item_ops = {
         .iop_size       = xfs_qm_dquot_logitem_size,
         .iop_format     = xfs_qm_dquot_logitem_format,
         .iop_pin        = xfs_qm_dquot_logitem_pin,
@@ -483,7 +483,7 @@ xfs_qm_qoff_logitem_committing(
 {
 }
 
-static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
+static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
         .iop_size       = xfs_qm_qoff_logitem_size,
         .iop_format     = xfs_qm_qoff_logitem_format,
         .iop_pin        = xfs_qm_qoff_logitem_pin,
@@ -498,7 +498,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
 /*
  * This is the ops vector shared by all quotaoff-start log items.
  */
-static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
+static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
         .iop_size       = xfs_qm_qoff_logitem_size,
         .iop_format     = xfs_qm_qoff_logitem_format,
         .iop_pin        = xfs_qm_qoff_logitem_pin,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d22e62623437..35c2aff38b20 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -217,7 +217,7 @@ xfs_efi_item_committing(
 /*
  * This is the ops vector shared by all efi log items.
  */
-static struct xfs_item_ops xfs_efi_item_ops = {
+static const struct xfs_item_ops xfs_efi_item_ops = {
         .iop_size       = xfs_efi_item_size,
         .iop_format     = xfs_efi_item_format,
         .iop_pin        = xfs_efi_item_pin,
@@ -477,7 +477,7 @@ xfs_efd_item_committing(
 /*
  * This is the ops vector shared by all efd log items.
  */
-static struct xfs_item_ops xfs_efd_item_ops = {
+static const struct xfs_item_ops xfs_efd_item_ops = {
         .iop_size       = xfs_efd_item_size,
         .iop_format     = xfs_efd_item_format,
         .iop_pin        = xfs_efd_item_pin,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index b7cf21ba240f..abaafdbb3e65 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -795,7 +795,7 @@ xfs_inode_item_committing(
 /*
  * This is the ops vector shared by all buf log items.
  */
-static struct xfs_item_ops xfs_inode_item_ops = {
+static const struct xfs_item_ops xfs_inode_item_ops = {
         .iop_size       = xfs_inode_item_size,
         .iop_format     = xfs_inode_item_format,
         .iop_pin        = xfs_inode_item_pin,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2758a6277c52..a14cd89fe465 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -626,7 +626,7 @@ xfs_log_item_init(
         struct xfs_mount        *mp,
         struct xfs_log_item     *item,
         int                     type,
-        struct xfs_item_ops     *ops)
+        const struct xfs_item_ops *ops)
 {
         item->li_mountp = mp;
         item->li_ailp = mp->m_ail;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 78c9039994af..3f7bf451c034 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -137,7 +137,7 @@ struct xfs_trans;
 void      xfs_log_item_init(struct xfs_mount    *mp,
                         struct xfs_log_item     *item,
                         int                     type,
-                        struct xfs_item_ops     *ops);
+                        const struct xfs_item_ops *ops);
 
 xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
                         struct xlog_ticket *ticket,
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5cff443f6cdb..0bbb1a41998b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -674,7 +674,8 @@ xfs_qm_dqattach_one(
          * disk and we didn't ask it to allocate;
          * ESRCH if quotas got turned off suddenly.
          */
-        error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
+        error = xfs_qm_dqget(ip->i_mount, ip, id, type,
+                             doalloc | XFS_QMOPT_DOWARN, &dqp);
         if (error)
                 return error;
 
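
The xfs_qm.c change ORs the caller's doalloc flags into the dquot lookup instead of passing only XFS_QMOPT_DOWARN, so a request to allocate a missing dquot is no longer dropped on the way down. A tiny sketch of that "propagate caller flags" fix with hypothetical names (dq_get, attach_one, QOPT_*):

/* Sketch: forward caller-supplied flags instead of dropping them. */
#include <stdio.h>

#define QOPT_DOWARN     0x1     /* always warn on failure */
#define QOPT_DQALLOC    0x2     /* allocate the object if it is missing */

static int dq_get(int id, unsigned int flags)
{
        printf("dq_get(id=%d, flags=%#x)\n", id, flags);
        return 0;
}

/* Before the fix, the call below passed only QOPT_DOWARN, so a caller
 * asking for allocation (doalloc = QOPT_DQALLOC) was silently ignored. */
static int attach_one(int id, unsigned int doalloc)
{
        return dq_get(id, doalloc | QOPT_DOWARN);
}

int main(void)
{
        attach_one(7, QOPT_DQALLOC);    /* allocation intent reaches dq_get() */
        attach_one(8, 0);
        return 0;
}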
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 603f3eb52041..3ae713c0abd9 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -326,7 +326,7 @@ typedef struct xfs_log_item {
                                               struct xfs_log_item *);
                                                         /* buffer item iodone */
                                                         /* callback func */
-        struct xfs_item_ops             *li_ops;        /* function list */
+        const struct xfs_item_ops       *li_ops;        /* function list */
 
         /* delayed logging */
         struct list_head                li_cil;         /* CIL pointers */
@@ -341,7 +341,7 @@ typedef struct xfs_log_item {
         { XFS_LI_IN_AIL,        "IN_AIL" }, \
         { XFS_LI_ABORTED,       "ABORTED" }
 
-typedef struct xfs_item_ops {
+struct xfs_item_ops {
         uint (*iop_size)(xfs_log_item_t *);
         void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
         void (*iop_pin)(xfs_log_item_t *);
@@ -352,7 +352,7 @@ typedef struct xfs_item_ops {
         void (*iop_push)(xfs_log_item_t *);
         bool (*iop_pushbuf)(xfs_log_item_t *);
         void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
-} xfs_item_ops_t;
+};
 
 #define IOP_SIZE(ip)    (*(ip)->li_ops->iop_size)(ip)
 #define IOP_FORMAT(ip,vp)       (*(ip)->li_ops->iop_format)(ip, vp)
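
Taken together, the xfs_item_ops hunks above are one change: every ops vector becomes const (so the function-pointer tables can live in read-only data and cannot be overwritten at run time), every consumer takes a const pointer, and the struct typedef is dropped in favour of the plain struct tag. A minimal sketch of the const ops-vector pattern, with hypothetical names (item_ops, log_item, item_init) rather than the XFS ones:

/* Sketch of a const ops vector attached to a log item. */
#include <stdio.h>

struct log_item;

struct item_ops {
        unsigned int (*iop_size)(struct log_item *);
        void         (*iop_format)(struct log_item *);
};

struct log_item {
        const struct item_ops *li_ops;  /* may only point at read-only tables */
};

static unsigned int buf_item_size(struct log_item *lip)   { (void)lip; return 42; }
static void         buf_item_format(struct log_item *lip) { (void)lip; }

/* The table itself is const, so the compiler can place it in .rodata. */
static const struct item_ops buf_item_ops = {
        .iop_size   = buf_item_size,
        .iop_format = buf_item_format,
};

static void item_init(struct log_item *item, const struct item_ops *ops)
{
        item->li_ops = ops;
}

int main(void)
{
        struct log_item item;

        item_init(&item, &buf_item_ops);
        printf("size = %u\n", item.li_ops->iop_size(&item));
        return 0;
}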
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4ecf2a549060..ce9268a2f56b 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -112,7 +112,7 @@ xfs_readlink(
         char            *link)
 {
         xfs_mount_t     *mp = ip->i_mount;
-        int             pathlen;
+        xfs_fsize_t     pathlen;
         int             error = 0;
 
         trace_xfs_readlink(ip);
@@ -122,13 +122,19 @@ xfs_readlink(
 
         xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-        ASSERT(S_ISLNK(ip->i_d.di_mode));
-        ASSERT(ip->i_d.di_size <= MAXPATHLEN);
-
         pathlen = ip->i_d.di_size;
         if (!pathlen)
                 goto out;
 
+        if (pathlen < 0 || pathlen > MAXPATHLEN) {
+                xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
+                         __func__, (unsigned long long) ip->i_ino,
+                         (long long) pathlen);
+                ASSERT(0);
+                return XFS_ERROR(EFSCORRUPTED);
+        }
+
+
         if (ip->i_df.if_flags & XFS_IFINLINE) {
                 memcpy(link, ip->i_df.if_u1.if_data, pathlen);
                 link[pathlen] = '\0';
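
The readlink change turns two debug-only ASSERTs into a run-time check: the symlink length read from disk is validated against MAXPATHLEN before it is used as a memcpy length, and a corrupt value now fails with EFSCORRUPTED on production kernels instead of being trusted. A small sketch of validating untrusted on-disk metadata before using it as a size; the names (read_link, struct disk_inode, LINK_MAX_LEN) are hypothetical, and the sketch returns -EIO where the kernel code returns XFS_ERROR(EFSCORRUPTED):

/* Sketch: bounds-check a length read from disk before copying. */
#include <errno.h>
#include <stdio.h>
#include <string.h>

#define LINK_MAX_LEN 1024               /* stand-in for MAXPATHLEN */

struct disk_inode {
        long long size;                 /* length recorded on disk: untrusted */
        const char *data;               /* inline symlink target */
};

static int read_link(const struct disk_inode *ip, char *link, size_t linksz)
{
        long long pathlen = ip->size;

        if (pathlen <= 0 || pathlen > LINK_MAX_LEN || (size_t)pathlen >= linksz) {
                /* Corrupt metadata: fail cleanly instead of copying garbage. */
                fprintf(stderr, "bad symlink length %lld\n", pathlen);
                return -EIO;
        }

        memcpy(link, ip->data, pathlen);
        link[pathlen] = '\0';
        return 0;
}

int main(void)
{
        struct disk_inode good = { .size = 11, .data = "target/path" };
        struct disk_inode bad  = { .size = 1 << 30, .data = "x" };
        char buf[LINK_MAX_LEN + 1];

        printf("good: %d\n", read_link(&good, buf, sizeof(buf)));
        printf("bad:  %d\n", read_link(&bad, buf, sizeof(buf)));
        return 0;
}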