Diffstat (limited to 'fs')
-rw-r--r--  fs/bio-integrity.c | 1
-rw-r--r--  fs/bio.c | 8
-rw-r--r--  fs/block_dev.c | 15
-rw-r--r--  fs/btrfs/Makefile | 3
-rw-r--r--  fs/btrfs/acl.c | 17
-rw-r--r--  fs/btrfs/backref.c | 776
-rw-r--r--  fs/btrfs/backref.h | 62
-rw-r--r--  fs/btrfs/btrfs_inode.h | 21
-rw-r--r--  fs/btrfs/compression.c | 3
-rw-r--r--  fs/btrfs/ctree.c | 27
-rw-r--r--  fs/btrfs/ctree.h | 206
-rw-r--r--  fs/btrfs/delayed-inode.c | 108
-rw-r--r--  fs/btrfs/disk-io.c | 630
-rw-r--r--  fs/btrfs/disk-io.h | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 1108
-rw-r--r--  fs/btrfs/extent_io.c | 640
-rw-r--r--  fs/btrfs/extent_io.h | 23
-rw-r--r--  fs/btrfs/file-item.c | 17
-rw-r--r--  fs/btrfs/file.c | 25
-rw-r--r--  fs/btrfs/free-space-cache.c | 994
-rw-r--r--  fs/btrfs/inode-map.c | 34
-rw-r--r--  fs/btrfs/inode.c | 545
-rw-r--r--  fs/btrfs/ioctl.c | 238
-rw-r--r--  fs/btrfs/ioctl.h | 29
-rw-r--r--  fs/btrfs/print-tree.c | 8
-rw-r--r--  fs/btrfs/reada.c | 951
-rw-r--r--  fs/btrfs/relocation.c | 26
-rw-r--r--  fs/btrfs/scrub.c | 660
-rw-r--r--  fs/btrfs/super.c | 315
-rw-r--r--  fs/btrfs/transaction.c | 156
-rw-r--r--  fs/btrfs/tree-log.c | 19
-rw-r--r--  fs/btrfs/volumes.c | 214
-rw-r--r--  fs/btrfs/volumes.h | 24
-rw-r--r--  fs/btrfs/xattr.c | 11
-rw-r--r--  fs/buffer.c | 2
-rw-r--r--  fs/cifs/cifsencrypt.c | 8
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsproto.h | 8
-rw-r--r--  fs/cifs/connect.c | 5
-rw-r--r--  fs/cifs/file.c | 142
-rw-r--r--  fs/cifs/readdir.c | 10
-rw-r--r--  fs/cifs/sess.c | 2
-rw-r--r--  fs/cifs/smbencrypt.c | 65
-rw-r--r--  fs/dcache.c | 88
-rw-r--r--  fs/ecryptfs/crypto.c | 26
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 5
-rw-r--r--  fs/ecryptfs/file.c | 23
-rw-r--r--  fs/ecryptfs/inode.c | 52
-rw-r--r--  fs/exofs/Kconfig | 2
-rw-r--r--  fs/exofs/ore.c | 1
-rw-r--r--  fs/exofs/super.c | 1
-rw-r--r--  fs/ext4/balloc.c | 2
-rw-r--r--  fs/ext4/inode.c | 5
-rw-r--r--  fs/ext4/super.c | 6
-rw-r--r--  fs/fs-writeback.c | 89
-rw-r--r--  fs/fuse/cuse.c | 1
-rw-r--r--  fs/gfs2/ops_fstype.c | 1
-rw-r--r--  fs/hfs/trans.c | 2
-rw-r--r--  fs/ioprio.c | 1
-rw-r--r--  fs/jffs2/compr.c | 128
-rw-r--r--  fs/jffs2/compr.h | 2
-rw-r--r--  fs/jffs2/fs.c | 2
-rw-r--r--  fs/jffs2/jffs2_fs_sb.h | 6
-rw-r--r--  fs/jffs2/os-linux.h | 2
-rw-r--r--  fs/jffs2/scan.c | 4
-rw-r--r--  fs/jffs2/super.c | 119
-rw-r--r--  fs/jffs2/wbuf.c | 9
-rw-r--r--  fs/jfs/jfs_logmgr.c | 1
-rw-r--r--  fs/logfs/super.c | 1
-rw-r--r--  fs/minix/bitmap.c | 55
-rw-r--r--  fs/minix/inode.c | 25
-rw-r--r--  fs/minix/minix.h | 11
-rw-r--r--  fs/namei.c | 16
-rw-r--r--  fs/namespace.c | 52
-rw-r--r--  fs/nfs/callback_xdr.c | 12
-rw-r--r--  fs/nfs/dir.c | 2
-rw-r--r--  fs/nfs/file.c | 100
-rw-r--r--  fs/nfs/inode.c | 2
-rw-r--r--  fs/nfs/internal.h | 2
-rw-r--r--  fs/nfs/nfs3proc.c | 1
-rw-r--r--  fs/nfs/nfs4filelayout.c | 8
-rw-r--r--  fs/nfs/nfs4proc.c | 10
-rw-r--r--  fs/nfs/nfs4xdr.c | 2
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 872
-rw-r--r--  fs/nfs/objlayout/objlayout.c | 209
-rw-r--r--  fs/nfs/objlayout/objlayout.h | 48
-rw-r--r--  fs/nfs/pagelist.c | 3
-rw-r--r--  fs/nfs/pnfs.c | 52
-rw-r--r--  fs/nfs/pnfs_dev.c | 1
-rw-r--r--  fs/nfs/proc.c | 1
-rw-r--r--  fs/nfs/read.c | 14
-rw-r--r--  fs/nfs/super.c | 37
-rw-r--r--  fs/nfs/write.c | 4
-rw-r--r--  fs/nfsd/nfs4acl.c | 1
-rw-r--r--  fs/nfsd/nfsctl.c | 1
-rw-r--r--  fs/nfsd/nfssvc.c | 3
-rw-r--r--  fs/ocfs2/alloc.c | 2
-rw-r--r--  fs/ocfs2/aops.c | 69
-rw-r--r--  fs/ocfs2/aops.h | 14
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 194
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 102
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 139
-rw-r--r--  fs/ocfs2/cluster/tcp.h | 2
-rw-r--r--  fs/ocfs2/dir.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 56
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 44
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 54
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 175
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 164
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 16
-rw-r--r--  fs/ocfs2/dlmglue.c | 21
-rw-r--r--  fs/ocfs2/extent_map.c | 96
-rw-r--r--  fs/ocfs2/extent_map.h | 2
-rw-r--r--  fs/ocfs2/file.c | 96
-rw-r--r--  fs/ocfs2/inode.c | 2
-rw-r--r--  fs/ocfs2/inode.h | 3
-rw-r--r--  fs/ocfs2/ioctl.c | 11
-rw-r--r--  fs/ocfs2/journal.c | 23
-rw-r--r--  fs/ocfs2/journal.h | 5
-rw-r--r--  fs/ocfs2/mmap.c | 53
-rw-r--r--  fs/ocfs2/move_extents.c | 2
-rw-r--r--  fs/ocfs2/ocfs2.h | 51
-rw-r--r--  fs/ocfs2/quota_local.c | 23
-rw-r--r--  fs/ocfs2/slot_map.c | 4
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 71
-rw-r--r--  fs/ocfs2/super.c | 25
-rw-r--r--  fs/ocfs2/xattr.c | 10
-rw-r--r--  fs/proc/base.c | 146
-rw-r--r--  fs/proc/meminfo.c | 7
-rw-r--r--  fs/proc/stat.c | 4
-rw-r--r--  fs/proc/vmcore.c | 1
-rw-r--r--  fs/pstore/platform.c | 13
-rw-r--r--  fs/quota/quota.c | 2
-rw-r--r--  fs/seq_file.c | 6
-rw-r--r--  fs/squashfs/Kconfig | 22
-rw-r--r--  fs/squashfs/squashfs_fs.h | 7
-rw-r--r--  fs/squashfs/super.c | 2
-rw-r--r--  fs/statfs.c | 2
-rw-r--r--  fs/sync.c | 4
-rw-r--r--  fs/ubifs/budget.c | 2
-rw-r--r--  fs/ubifs/debug.c | 16
-rw-r--r--  fs/ubifs/debug.h | 5
-rw-r--r--  fs/ubifs/recovery.c | 2
-rw-r--r--  fs/ubifs/sb.c | 2
-rw-r--r--  fs/xfs/xfs_acl.c | 2
-rw-r--r--  fs/xfs/xfs_aops.c | 2
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 64
-rw-r--r--  fs/xfs/xfs_bmap.c | 20
-rw-r--r--  fs/xfs/xfs_buf_item.c | 2
-rw-r--r--  fs/xfs/xfs_dquot_item.c | 6
-rw-r--r--  fs/xfs/xfs_export.c | 8
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 4
-rw-r--r--  fs/xfs/xfs_inode.c | 21
-rw-r--r--  fs/xfs/xfs_inode.h | 1
-rw-r--r--  fs/xfs/xfs_inode_item.c | 2
-rw-r--r--  fs/xfs/xfs_log.c | 350
-rw-r--r--  fs/xfs/xfs_log.h | 2
-rw-r--r--  fs/xfs/xfs_qm.c | 3
-rw-r--r--  fs/xfs/xfs_sync.c | 11
-rw-r--r--  fs/xfs/xfs_trace.h | 12
-rw-r--r--  fs/xfs/xfs_trans.h | 6
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 14
163 files changed, 8550 insertions, 4041 deletions
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 9c5e6b2cd11..c2183f3917c 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -22,6 +22,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/mempool.h>
+#include <linux/export.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
diff --git a/fs/bio.c b/fs/bio.c
index 9bfade8a609..b1fe82cf88c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -255,7 +255,6 @@ void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
-	bio->bi_comp_cpu = -1;
 	atomic_set(&bio->bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
@@ -338,7 +337,7 @@ static void bio_fs_destructor(struct bio *bio)
  * RETURNS:
  * Pointer to new bio on success, NULL on failure.
  */
-struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
 	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
 
@@ -366,7 +365,7 @@ static void bio_kmalloc_destructor(struct bio *bio)
  * %__GFP_WAIT, the allocation is guaranteed to succeed.
  *
  **/
-struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
 	struct bio *bio;
 
@@ -697,7 +696,8 @@ static void bio_free_map_data(struct bio_map_data *bmd)
 	kfree(bmd);
 }
 
-static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
+static struct bio_map_data *bio_alloc_map_data(int nr_segs,
+					       unsigned int iov_count,
 					       gfp_t gfp_mask)
 {
 	struct bio_map_data *bmd;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 95f786ec7f0..b07f1da1de4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -971,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
 
 	if (!bdev->bd_disk)
 		return;
-	if (disk_partitionable(bdev->bd_disk))
+	if (disk_part_scan_enabled(bdev->bd_disk))
 		bdev->bd_invalidated = 1;
 }
 
@@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
+	struct module *owner;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk)
 		goto out;
+	owner = disk->fops->owner;
 
 	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				bdev->bd_disk = NULL;
 				mutex_unlock(&bdev->bd_mutex);
 				disk_unblock_events(disk);
-				module_put(disk->fops->owner);
 				put_disk(disk);
+				module_put(owner);
 				goto restart;
 			}
 		}
@@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_unlock_bdev;
 		}
 		/* only one opener holds refs to the module and disk */
-		module_put(disk->fops->owner);
 		put_disk(disk);
+		module_put(owner);
 	}
 	bdev->bd_openers++;
 	if (for_part)
@@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 	disk_unblock_events(disk);
-	module_put(disk->fops->owner);
 	put_disk(disk);
+	module_put(owner);
  out:
 	bdput(bdev);
 
@@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
 
-		put_disk(disk);
-		module_put(owner);
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
 		if (bdev != bdev->bd_contains)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
+
+		put_disk(disk);
+		module_put(owner);
 	}
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
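In all of the __blkdev_get()/__blkdev_put() hunks above, disk->fops->owner is copied into a local variable before put_disk() is called, and module_put() is moved after put_disk(). A plausible reading of the pattern (sketch only, not taken from the commit message) is that put_disk() may drop the last reference to the gendisk, after which disk->fops must no longer be dereferenced:

	struct module *owner;

	owner = disk->fops->owner;	/* take the copy while 'disk' is still valid */
	...
	put_disk(disk);			/* may free 'disk' and the structure holding its fops */
	module_put(owner);		/* uses the saved pointer, not disk->fops->owner */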
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 40e6ac08c21..c0ddfd29c5e 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
-	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+	   reada.o backref.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a1..89b156d85d6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		if (!value)
 			return ERR_PTR(-ENOMEM);
 		size = __btrfs_getxattr(inode, name, value, size);
-		if (size > 0) {
-			acl = posix_acl_from_xattr(value, size);
-			if (IS_ERR(acl)) {
-				kfree(value);
-				return acl;
-			}
-			set_cached_acl(inode, type, acl);
-		}
-		kfree(value);
+	}
+	if (size > 0) {
+		acl = posix_acl_from_xattr(value, size);
 	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
 		/* FIXME, who returns -ENOENT? I think nobody */
 		acl = NULL;
-		set_cached_acl(inode, type, acl);
 	} else {
 		acl = ERR_PTR(-EIO);
 	}
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
 	return acl;
 }
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 00000000000..22c64fff1bd
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,776 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "backref.h"
22
23struct __data_ref {
24 struct list_head list;
25 u64 inum;
26 u64 root;
27 u64 extent_data_item_offset;
28};
29
30struct __shared_ref {
31 struct list_head list;
32 u64 disk_byte;
33};
34
35static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path,
37 struct btrfs_key *found_key)
38{
39 int ret;
40 struct btrfs_key key;
41 struct extent_buffer *eb;
42
43 key.type = key_type;
44 key.objectid = inum;
45 key.offset = ioff;
46
47 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
48 if (ret < 0)
49 return ret;
50
51 eb = path->nodes[0];
52 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
53 ret = btrfs_next_leaf(fs_root, path);
54 if (ret)
55 return ret;
56 eb = path->nodes[0];
57 }
58
59 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
60 if (found_key->type != key.type || found_key->objectid != key.objectid)
61 return 1;
62
63 return 0;
64}
65
66/*
67 * this makes the path point to (inum INODE_ITEM ioff)
68 */
69int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
70 struct btrfs_path *path)
71{
72 struct btrfs_key key;
73 return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
74 &key);
75}
76
77static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
78 struct btrfs_path *path,
79 struct btrfs_key *found_key)
80{
81 return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
82 found_key);
83}
84
85/*
86 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
87 * of the path are separated by '/' and the path is guaranteed to be
88 * 0-terminated. the path is only given within the current file system.
89 * Therefore, it never starts with a '/'. the caller is responsible to provide
90 * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
91 * the start point of the resulting string is returned. this pointer is within
92 * dest, normally.
93 * in case the path buffer would overflow, the pointer is decremented further
94 * as if output was written to the buffer, though no more output is actually
95 * generated. that way, the caller can determine how much space would be
96 * required for the path to fit into the buffer. in that case, the returned
97 * value will be smaller than dest. callers must check this!
98 */
99static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
100 struct btrfs_inode_ref *iref,
101 struct extent_buffer *eb_in, u64 parent,
102 char *dest, u32 size)
103{
104 u32 len;
105 int slot;
106 u64 next_inum;
107 int ret;
108 s64 bytes_left = size - 1;
109 struct extent_buffer *eb = eb_in;
110 struct btrfs_key found_key;
111
112 if (bytes_left >= 0)
113 dest[bytes_left] = '\0';
114
115 while (1) {
116 len = btrfs_inode_ref_name_len(eb, iref);
117 bytes_left -= len;
118 if (bytes_left >= 0)
119 read_extent_buffer(eb, dest + bytes_left,
120 (unsigned long)(iref + 1), len);
121 if (eb != eb_in)
122 free_extent_buffer(eb);
123 ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
124 if (ret)
125 break;
126 next_inum = found_key.offset;
127
128 /* regular exit ahead */
129 if (parent == next_inum)
130 break;
131
132 slot = path->slots[0];
133 eb = path->nodes[0];
134 /* make sure we can use eb after releasing the path */
135 if (eb != eb_in)
136 atomic_inc(&eb->refs);
137 btrfs_release_path(path);
138
139 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
140 parent = next_inum;
141 --bytes_left;
142 if (bytes_left >= 0)
143 dest[bytes_left] = '/';
144 }
145
146 btrfs_release_path(path);
147
148 if (ret)
149 return ERR_PTR(ret);
150
151 return dest + bytes_left;
152}
153
154/*
155 * this makes the path point to (logical EXTENT_ITEM *)
156 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
157 * tree blocks and <0 on error.
158 */
159int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
160 struct btrfs_path *path, struct btrfs_key *found_key)
161{
162 int ret;
163 u64 flags;
164 u32 item_size;
165 struct extent_buffer *eb;
166 struct btrfs_extent_item *ei;
167 struct btrfs_key key;
168
169 key.type = BTRFS_EXTENT_ITEM_KEY;
170 key.objectid = logical;
171 key.offset = (u64)-1;
172
173 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
174 if (ret < 0)
175 return ret;
176 ret = btrfs_previous_item(fs_info->extent_root, path,
177 0, BTRFS_EXTENT_ITEM_KEY);
178 if (ret < 0)
179 return ret;
180
181 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
182 if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
183 found_key->objectid > logical ||
184 found_key->objectid + found_key->offset <= logical)
185 return -ENOENT;
186
187 eb = path->nodes[0];
188 item_size = btrfs_item_size_nr(eb, path->slots[0]);
189 BUG_ON(item_size < sizeof(*ei));
190
191 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
192 flags = btrfs_extent_flags(eb, ei);
193
194 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
195 return BTRFS_EXTENT_FLAG_TREE_BLOCK;
196 if (flags & BTRFS_EXTENT_FLAG_DATA)
197 return BTRFS_EXTENT_FLAG_DATA;
198
199 return -EIO;
200}
201
202/*
203 * helper function to iterate extent inline refs. ptr must point to a 0 value
204 * for the first call and may be modified. it is used to track state.
205 * if more refs exist, 0 is returned and the next call to
206 * __get_extent_inline_ref must pass the modified ptr parameter to get the
207 * next ref. after the last ref was processed, 1 is returned.
208 * returns <0 on error
209 */
210static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
211 struct btrfs_extent_item *ei, u32 item_size,
212 struct btrfs_extent_inline_ref **out_eiref,
213 int *out_type)
214{
215 unsigned long end;
216 u64 flags;
217 struct btrfs_tree_block_info *info;
218
219 if (!*ptr) {
220 /* first call */
221 flags = btrfs_extent_flags(eb, ei);
222 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
223 info = (struct btrfs_tree_block_info *)(ei + 1);
224 *out_eiref =
225 (struct btrfs_extent_inline_ref *)(info + 1);
226 } else {
227 *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
228 }
229 *ptr = (unsigned long)*out_eiref;
230 if ((void *)*ptr >= (void *)ei + item_size)
231 return -ENOENT;
232 }
233
234 end = (unsigned long)ei + item_size;
235 *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
236 *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
237
238 *ptr += btrfs_extent_inline_ref_size(*out_type);
239 WARN_ON(*ptr > end);
240 if (*ptr == end)
241 return 1; /* last */
242
243 return 0;
244}
245
246/*
247 * reads the tree block backref for an extent. tree level and root are returned
248 * through out_level and out_root. ptr must point to a 0 value for the first
249 * call and may be modified (see __get_extent_inline_ref comment).
250 * returns 0 if data was provided, 1 if there was no more data to provide or
251 * <0 on error.
252 */
253int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
254 struct btrfs_extent_item *ei, u32 item_size,
255 u64 *out_root, u8 *out_level)
256{
257 int ret;
258 int type;
259 struct btrfs_tree_block_info *info;
260 struct btrfs_extent_inline_ref *eiref;
261
262 if (*ptr == (unsigned long)-1)
263 return 1;
264
265 while (1) {
266 ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
267 &eiref, &type);
268 if (ret < 0)
269 return ret;
270
271 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
272 type == BTRFS_SHARED_BLOCK_REF_KEY)
273 break;
274
275 if (ret == 1)
276 return 1;
277 }
278
279 /* we can treat both ref types equally here */
280 info = (struct btrfs_tree_block_info *)(ei + 1);
281 *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
282 *out_level = btrfs_tree_block_level(eb, info);
283
284 if (ret == 1)
285 *ptr = (unsigned long)-1;
286
287 return 0;
288}
289
290static int __data_list_add(struct list_head *head, u64 inum,
291 u64 extent_data_item_offset, u64 root)
292{
293 struct __data_ref *ref;
294
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{
413 u64 disk_byte;
414 struct btrfs_key key;
415 struct btrfs_file_extent_item *fi;
416 struct extent_buffer *eb;
417 int slot;
418 int nritems;
419 int ret;
420 int found = 0;
421
422 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0);
424 if (!eb)
425 return -EIO;
426
427 /*
428 * from the shared data ref, we only have the leaf but we need
429 * the key. thus, we must look into all items and see that we
430 * find one (some) with a reference to our extent item.
431 */
432 nritems = btrfs_header_nritems(eb);
433 for (slot = 0; slot < nritems; ++slot) {
434 btrfs_item_key_to_cpu(eb, &key, slot);
435 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) {
439 free_extent_buffer(eb);
440 return -EIO;
441 }
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) {
444 if (found)
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459
460 if (!found) {
461 printk(KERN_ERR "btrfs: failed to follow shared data backref "
462 "to parent %llu\n", logical);
463 WARN_ON(1);
464 ret = -EIO;
465 }
466
467 free_extent_buffer(eb);
468 return ret;
469}
470
471/*
472 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops.
476 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path,
479 u64 extent_item_objectid,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx)
482{
483 unsigned long ptr = 0;
484 int last;
485 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d;
497 struct __shared_ref *ref_s;
498
499 eb = path->nodes[0];
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
501 item_size = btrfs_item_size_nr(eb, path->slots[0]);
502
503 /* first we iterate the inline refs, ... */
504 do {
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
506 &eiref, &type);
507 if (last == -ENOENT) {
508 ret = 0;
509 break;
510 }
511 if (last < 0) {
512 ret = last;
513 break;
514 }
515
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524
525 /* ... then we proceed to in-tree references and ... */
526 while (!ret) {
527 ++path->slots[0];
528 if (path->slots[0] > btrfs_header_nritems(eb)) {
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
541 dref = btrfs_item_ptr(eb, path->slots[0],
542 struct btrfs_extent_data_ref);
543 ret = __data_list_add_eb(&data_refs, eb, dref);
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
545 ret = __shared_list_add(&shared_refs, key.offset);
546 }
547 }
548
549 btrfs_release_path(path);
550
551 /*
552 * ... only at the very end we can process the refs we found. this is
553 * because the iterator function we call is allowed to make tree lookups
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret;
582}
583
584int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
585 struct btrfs_path *path,
586 iterate_extent_inodes_t *iterate, void *ctx)
587{
588 int ret;
589 u64 offset;
590 struct btrfs_key found_key;
591
592 ret = extent_from_logical(fs_info, logical, path,
593 &found_key);
594 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
595 ret = -EINVAL;
596 if (ret < 0)
597 return ret;
598
599 offset = logical - found_key.objectid;
600 ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
601 offset, iterate, ctx);
602
603 return ret;
604}
605
606static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
607 struct btrfs_path *path,
608 iterate_irefs_t *iterate, void *ctx)
609{
610 int ret;
611 int slot;
612 u32 cur;
613 u32 len;
614 u32 name_len;
615 u64 parent = 0;
616 int found = 0;
617 struct extent_buffer *eb;
618 struct btrfs_item *item;
619 struct btrfs_inode_ref *iref;
620 struct btrfs_key found_key;
621
622 while (1) {
623 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
624 &found_key);
625 if (ret < 0)
626 break;
627 if (ret) {
628 ret = found ? 0 : -ENOENT;
629 break;
630 }
631 ++found;
632
633 parent = found_key.offset;
634 slot = path->slots[0];
635 eb = path->nodes[0];
636 /* make sure we can use eb after releasing the path */
637 atomic_inc(&eb->refs);
638 btrfs_release_path(path);
639
640 item = btrfs_item_nr(eb, slot);
641 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
642
643 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
644 name_len = btrfs_inode_ref_name_len(eb, iref);
645 /* path must be released before calling iterate()! */
646 ret = iterate(parent, iref, eb, ctx);
647 if (ret) {
648 free_extent_buffer(eb);
649 break;
650 }
651 len = sizeof(*iref) + name_len;
652 iref = (struct btrfs_inode_ref *)((char *)iref + len);
653 }
654 free_extent_buffer(eb);
655 }
656
657 btrfs_release_path(path);
658
659 return ret;
660}
661
662/*
663 * returns 0 if the path could be dumped (probably truncated)
664 * returns <0 in case of an error
665 */
666static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
667 struct extent_buffer *eb, void *ctx)
668{
669 struct inode_fs_paths *ipath = ctx;
670 char *fspath;
671 char *fspath_min;
672 int i = ipath->fspath->elem_cnt;
673 const int s_ptr = sizeof(char *);
674 u32 bytes_left;
675
676 bytes_left = ipath->fspath->bytes_left > s_ptr ?
677 ipath->fspath->bytes_left - s_ptr : 0;
678
679 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
680 fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
681 inum, fspath_min, bytes_left);
682 if (IS_ERR(fspath))
683 return PTR_ERR(fspath);
684
685 if (fspath > fspath_min) {
686 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
687 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else {
690 ++ipath->fspath->elem_missed;
691 ipath->fspath->bytes_missing += fspath_min - fspath;
692 ipath->fspath->bytes_left = 0;
693 }
694
695 return 0;
696}
697
698/*
699 * this dumps all file system paths to the inode into the ipath struct, provided
700 * is has been created large enough. each path is zero-terminated and accessed
701 * from ipath->fspath->val[i].
702 * when it returns, there are ipath->fspath->elem_cnt number of paths available
703 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
704 * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
705 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
706 * have been needed to return all paths.
707 */
708int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
709{
710 return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
711 inode_to_path, ipath);
712}
713
714/*
715 * allocates space to return multiple file system paths for an inode.
716 * total_bytes to allocate are passed, note that space usable for actual path
717 * information will be total_bytes - sizeof(struct inode_fs_paths).
718 * the returned pointer must be freed with free_ipath() in the end.
719 */
720struct btrfs_data_container *init_data_container(u32 total_bytes)
721{
722 struct btrfs_data_container *data;
723 size_t alloc_bytes;
724
725 alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
726 data = kmalloc(alloc_bytes, GFP_NOFS);
727 if (!data)
728 return ERR_PTR(-ENOMEM);
729
730 if (total_bytes >= sizeof(*data)) {
731 data->bytes_left = total_bytes - sizeof(*data);
732 data->bytes_missing = 0;
733 } else {
734 data->bytes_missing = sizeof(*data) - total_bytes;
735 data->bytes_left = 0;
736 }
737
738 data->elem_cnt = 0;
739 data->elem_missed = 0;
740
741 return data;
742}
743
744/*
745 * allocates space to return multiple file system paths for an inode.
746 * total_bytes to allocate are passed, note that space usable for actual path
747 * information will be total_bytes - sizeof(struct inode_fs_paths).
748 * the returned pointer must be freed with free_ipath() in the end.
749 */
750struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
751 struct btrfs_path *path)
752{
753 struct inode_fs_paths *ifp;
754 struct btrfs_data_container *fspath;
755
756 fspath = init_data_container(total_bytes);
757 if (IS_ERR(fspath))
758 return (void *)fspath;
759
760 ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
761 if (!ifp) {
762 kfree(fspath);
763 return ERR_PTR(-ENOMEM);
764 }
765
766 ifp->btrfs_path = path;
767 ifp->fspath = fspath;
768 ifp->fs_root = fs_root;
769
770 return ifp;
771}
772
773void free_ipath(struct inode_fs_paths *ipath)
774{
775 kfree(ipath);
776}
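The iteration helpers above are deliberately two-phase: references are first collected into lists while the extent-tree path is held, the path is released, and only then is the caller's iterator invoked, so the iterator itself may perform tree lookups. A minimal sketch of an iterate_extent_inodes_t callback and how it could be driven (illustrative only; the callback body and error handling are made up, not part of this commit):

static int note_extent_user(u64 inum, u64 offset, u64 root, void *ctx)
{
	/* called once per inode reference found for the extent */
	printk(KERN_INFO "btrfs: extent referenced by ino %llu off %llu root %llu\n",
	       (unsigned long long)inum, (unsigned long long)offset,
	       (unsigned long long)root);
	return 0;	/* returning non-zero stops the iteration */
}

	/* with an fs_info and an allocated btrfs_path in hand: */
	ret = iterate_inodes_from_logical(logical, fs_info, path,
					  note_extent_user, NULL);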
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 00000000000..92618837cb8
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_BACKREF__
20#define __BTRFS_BACKREF__
21
22#include "ioctl.h"
23
24struct inode_fs_paths {
25 struct btrfs_path *btrfs_path;
26 struct btrfs_root *fs_root;
27 struct btrfs_data_container *fspath;
28};
29
30typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
31 void *ctx);
32typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
33 struct extent_buffer *eb, void *ctx);
34
35int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
36 struct btrfs_path *path);
37
38int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
39 struct btrfs_path *path, struct btrfs_key *found_key);
40
41int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
42 struct btrfs_extent_item *ei, u32 item_size,
43 u64 *out_root, u8 *out_level);
44
45int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
46 struct btrfs_path *path,
47 u64 extent_item_objectid,
48 u64 extent_offset,
49 iterate_extent_inodes_t *iterate, void *ctx);
50
51int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
52 struct btrfs_path *path,
53 iterate_extent_inodes_t *iterate, void *ctx);
54
55int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
56
57struct btrfs_data_container *init_data_container(u32 total_bytes);
58struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
59 struct btrfs_path *path);
60void free_ipath(struct inode_fs_paths *ipath);
61
62#endif
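A sketch of how the ipath half of this interface fits together, based only on the declarations above and the comments in backref.c (the 4096-byte container size and the error code used on truncation are illustrative assumptions, not part of the interface):

static int print_inode_paths(struct btrfs_root *fs_root, u64 inum)
{
	struct btrfs_path *path;
	struct inode_fs_paths *ipath;
	int i, ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ipath = init_ipath(4096, fs_root, path);	/* room for pointers + names */
	if (IS_ERR(ipath)) {
		btrfs_free_path(path);
		return PTR_ERR(ipath);
	}

	ret = paths_from_inode(inum, ipath);
	for (i = 0; !ret && i < ipath->fspath->elem_cnt; ++i)
		printk(KERN_INFO "btrfs: path: %s\n",
		       (char *)(unsigned long)ipath->fspath->val[i]);

	if (!ret && ipath->fspath->elem_missed)
		/* container was too small; bytes_missing says by how much */
		ret = -ENAMETOOLONG;

	free_ipath(ipath);
	btrfs_free_path(path);
	return ret;
}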
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd..634608d2a6d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
 	 */
 	u64 delalloc_bytes;
 
-	/* total number of bytes that may be used for this inode for
-	 * delalloc
-	 */
-	u64 reserved_bytes;
-
 	/*
 	 * the size of the file stored in the metadata on disk. data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
 	 */
 	u64 disk_i_size;
 
-	/* flags field from the on disk inode */
-	u32 flags;
-
 	/*
 	 * if this is a directory then index_cnt is the counter for the index
 	 * number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
 	u64 last_unlink_trans;
 
 	/*
+	 * Number of bytes outstanding that are going to need csums. This is
+	 * used in ENOSPC accounting.
+	 */
+	u64 csum_bytes;
+
+	/* flags field from the on disk inode */
+	u32 flags;
+
+	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such. outstanding_extents is the number of extent
 	 * items we think we'll end up using, and reserved_extents is the number
@@ -146,14 +147,12 @@ struct btrfs_inode {
 	 * the btrfs file release call will add this inode to the
 	 * ordered operations list so that we make sure to flush out any
 	 * new data the application may have written before commit.
-	 *
-	 * yes, its silly to have a single bitflag, but we might grow more
-	 * of these.
 	 */
 	unsigned ordered_data_close:1;
 	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
 	unsigned in_defrag:1;
+	unsigned delalloc_meta_reserved:1;
 
 	/*
 	 * always compress this one file
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f173..14f1c5a0b2d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
 static inline int compressed_bio_size(struct btrfs_root *root,
 				      unsigned long disk_size)
 {
-	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
 	return sizeof(struct compressed_bio) +
 		((disk_size + root->sectorsize - 1) / root->sectorsize) *
 		csum_size;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8..dede441bdee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root,
 				    struct extent_buffer *buf)
 {
+	/* ensure we can see the force_cow */
+	smp_rmb();
+
+	/*
+	 * We do not need to cow a block if
+	 * 1) this block is not created or changed in this transaction;
+	 * 2) this block does not belong to TREE_RELOC tree;
+	 * 3) the root is not forced COW.
+	 *
+	 * What is forced COW:
+	 *    when we create snapshot during commiting the transaction,
+	 *    after we've finished coping src root, we must COW the shared
+	 *    block to ensure the metadata consistency.
+	 */
 	if (btrfs_header_generation(buf) == trans->transid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
-	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+	    !root->force_cow)
 		return 0;
 	return 1;
 }
@@ -902,9 +917,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
 		pslot = path->slots[level + 1];
+	}
 
 	/*
 	 * deal with the case where there is only one pointer in the root
@@ -1107,9 +1123,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	mid = path->nodes[level];
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
 		pslot = path->slots[level + 1];
+	}
 
 	if (!parent)
 		return 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f4..50634abef9b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
 #include <linux/kobject.h>
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
+#include <linux/pagemap.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
 #define BTRFS_LABEL_SIZE 256
 
 /*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+	__le64 tree_root;
+	__le64 tree_root_gen;
+
+	__le64 chunk_root;
+	__le64 chunk_root_gen;
+
+	__le64 extent_root;
+	__le64 extent_root_gen;
+
+	__le64 fs_root;
+	__le64 fs_root_gen;
+
+	__le64 dev_root;
+	__le64 dev_root_gen;
+
+	__le64 csum_root;
+	__le64 csum_root_gen;
+
+	__le64 total_bytes;
+	__le64 bytes_used;
+	__le64 num_devices;
+	/* future */
+	__le64 unsed_64[4];
+
+	u8 tree_root_level;
+	u8 chunk_root_level;
+	u8 extent_root_level;
+	u8 fs_root_level;
+	u8 dev_root_level;
+	u8 csum_root_level;
+	/* future and to align */
+	u8 unused_8[10];
+} __attribute__ ((__packed__));
+
+/*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
  */
@@ -405,6 +447,7 @@ struct btrfs_super_block {
 	/* future expansion */
 	__le64 reserved[31];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 } __attribute__ ((__packed__));
 
 /*
@@ -772,14 +815,8 @@ struct btrfs_space_info {
 struct btrfs_block_rsv {
 	u64 size;
 	u64 reserved;
-	u64 freed[2];
 	struct btrfs_space_info *space_info;
-	struct list_head list;
 	spinlock_t lock;
-	atomic_t usage;
-	unsigned int priority:8;
-	unsigned int durable:1;
-	unsigned int refill_used:1;
 	unsigned int full:1;
 };
 
@@ -811,7 +848,8 @@ struct btrfs_free_cluster {
 enum btrfs_caching_type {
 	BTRFS_CACHE_NO		= 0,
 	BTRFS_CACHE_STARTED	= 1,
-	BTRFS_CACHE_FINISHED	= 2,
+	BTRFS_CACHE_FAST	= 2,
+	BTRFS_CACHE_FINISHED	= 3,
 };
 
 enum btrfs_disk_cache_state {
@@ -840,10 +878,10 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
-	u64 reserved_pinned;
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
+	u64 cache_generation;
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -899,6 +937,10 @@ struct btrfs_fs_info {
 	spinlock_t block_group_cache_lock;
 	struct rb_root block_group_cache_tree;
 
+	/* keep track of unallocated space */
+	spinlock_t free_chunk_lock;
+	u64 free_chunk_space;
+
 	struct extent_io_tree freed_extents[2];
 	struct extent_io_tree *pinned_extents;
 
@@ -916,14 +958,11 @@ struct btrfs_fs_info {
 	struct btrfs_block_rsv trans_block_rsv;
 	/* block reservation for chunk tree */
 	struct btrfs_block_rsv chunk_block_rsv;
+	/* block reservation for delayed operations */
+	struct btrfs_block_rsv delayed_block_rsv;
 
 	struct btrfs_block_rsv empty_block_rsv;
 
-	/* list of block reservations that cross multiple transactions */
-	struct list_head durable_block_rsv_list;
-
-	struct mutex durable_block_rsv_mutex;
-
 	u64 generation;
 	u64 last_trans_committed;
 
@@ -942,8 +981,8 @@ struct btrfs_fs_info {
 	wait_queue_head_t transaction_blocked_wait;
 	wait_queue_head_t async_submit_wait;
 
-	struct btrfs_super_block super_copy;
-	struct btrfs_super_block super_for_commit;
+	struct btrfs_super_block *super_copy;
+	struct btrfs_super_block *super_for_commit;
 	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
@@ -1036,6 +1075,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_freespace_worker;
 	struct btrfs_workers submit_workers;
 	struct btrfs_workers caching_workers;
+	struct btrfs_workers readahead_workers;
 
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1159,13 @@ struct btrfs_fs_info {
 	u64 fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
+
+	/* readahead tree */
+	spinlock_t reada_lock;
+	struct radix_tree_root reada_tree;
+
+	/* next backup root to be overwritten */
+	int backup_root_index;
 };
 
 /*
@@ -1225,6 +1272,8 @@ struct btrfs_root {
 	 * for stat. It may be used for more later
 	 */
 	dev_t anon_dev;
+
+	int force_cow;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -1363,6 +1412,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_ENOSPC_DEBUG	 (1 << 15)
 #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
+#define BTRFS_MOUNT_RECOVERY		(1 << 18)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1978,6 +2028,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
 	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
 }
 
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+		   tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+		   tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+		   tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+		   chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+		   chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+		   chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+		   extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+		   extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+		   extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+		   fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+		   fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+		   fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+		   dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+		   dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+		   dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+		   csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+		   csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+		   csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+		   total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+		   bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+		   num_devices, 64);
+
 /* struct btrfs_super_block */
 
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2129,6 +2228,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
 }
 
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+	return mapping_gfp_mask(mapping) & ~__GFP_FS;
+}
+
 /* extent-tree.c */
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 						 unsigned num_items)
@@ -2137,6 +2241,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 		3 * num_items;
 }
 
+/*
+ * Doing a truncate won't result in new nodes or leaves, just what we need for
+ * COW.
+ */
+static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
+						 unsigned num_items)
+{
+	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+		num_items;
+}
+
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
@@ -2146,6 +2261,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    u64 bytenr, u64 num_bytes);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset, u64 bytenr);
@@ -2196,8 +2314,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
-				u64 num_bytes, int reserve, int sinfo);
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+				       u64 start, u64 len);
 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2240,25 +2358,26 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2240struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 2358struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2241void btrfs_free_block_rsv(struct btrfs_root *root, 2359void btrfs_free_block_rsv(struct btrfs_root *root,
2242 struct btrfs_block_rsv *rsv); 2360 struct btrfs_block_rsv *rsv);
2243void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, 2361int btrfs_block_rsv_add(struct btrfs_root *root,
2244 struct btrfs_block_rsv *rsv);
2245int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2246 struct btrfs_root *root,
2247 struct btrfs_block_rsv *block_rsv, 2362 struct btrfs_block_rsv *block_rsv,
2248 u64 num_bytes); 2363 u64 num_bytes);
2249int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 2364int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2250 struct btrfs_root *root, 2365 struct btrfs_block_rsv *block_rsv,
2366 u64 num_bytes);
2367int btrfs_block_rsv_check(struct btrfs_root *root,
2368 struct btrfs_block_rsv *block_rsv, int min_factor);
2369int btrfs_block_rsv_refill(struct btrfs_root *root,
2251 struct btrfs_block_rsv *block_rsv, 2370 struct btrfs_block_rsv *block_rsv,
2252 u64 min_reserved, int min_factor); 2371 u64 min_reserved);
2372int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
2373 struct btrfs_block_rsv *block_rsv,
2374 u64 min_reserved);
2253int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 2375int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2254 struct btrfs_block_rsv *dst_rsv, 2376 struct btrfs_block_rsv *dst_rsv,
2255 u64 num_bytes); 2377 u64 num_bytes);
2256void btrfs_block_rsv_release(struct btrfs_root *root, 2378void btrfs_block_rsv_release(struct btrfs_root *root,
2257 struct btrfs_block_rsv *block_rsv, 2379 struct btrfs_block_rsv *block_rsv,
2258 u64 num_bytes); 2380 u64 num_bytes);
2259int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
2260 struct btrfs_root *root,
2261 struct btrfs_block_rsv *rsv);
2262int btrfs_set_block_group_ro(struct btrfs_root *root, 2381int btrfs_set_block_group_ro(struct btrfs_root *root,
2263 struct btrfs_block_group_cache *cache); 2382 struct btrfs_block_group_cache *cache);
2264int btrfs_set_block_group_rw(struct btrfs_root *root, 2383int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2379,6 +2498,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2379 smp_mb(); 2498 smp_mb();
2380 return fs_info->closing; 2499 return fs_info->closing;
2381} 2500}
2501static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2502{
2503 kfree(fs_info->delayed_root);
2504 kfree(fs_info->extent_root);
2505 kfree(fs_info->tree_root);
2506 kfree(fs_info->chunk_root);
2507 kfree(fs_info->dev_root);
2508 kfree(fs_info->csum_root);
2509 kfree(fs_info->super_copy);
2510 kfree(fs_info->super_for_commit);
2511 kfree(fs_info);
2512}
2382 2513
2383/* root-item.c */ 2514/* root-item.c */
2384int btrfs_find_root_ref(struct btrfs_root *tree_root, 2515int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2579,11 +2710,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2579int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2710int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2580int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2711int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2581int btrfs_orphan_cleanup(struct btrfs_root *root); 2712int btrfs_orphan_cleanup(struct btrfs_root *root);
2582void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2583 struct btrfs_pending_snapshot *pending,
2584 u64 *bytes_to_reserve);
2585void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2586 struct btrfs_pending_snapshot *pending);
2587void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2713void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2588 struct btrfs_root *root); 2714 struct btrfs_root *root);
2589int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); 2715int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@ -2697,4 +2823,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
2697int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 2823int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2698 struct btrfs_scrub_progress *progress); 2824 struct btrfs_scrub_progress *progress);
2699 2825
2826/* reada.c */
2827struct reada_control {
2828 struct btrfs_root *root; /* tree to prefetch */
2829 struct btrfs_key key_start;
2830 struct btrfs_key key_end; /* exclusive */
2831 atomic_t elems;
2832 struct kref refcnt;
2833 wait_queue_head_t wait;
2834};
2835struct reada_control *btrfs_reada_add(struct btrfs_root *root,
2836 struct btrfs_key *start, struct btrfs_key *end);
2837int btrfs_reada_wait(void *handle);
2838void btrfs_reada_detach(void *handle);
2839int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
2840 u64 start, int err);
2841
2700#endif 2842#endif
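The readahead declarations just above are consumed roughly as in the sketch below; the key range, error handling and surrounding code are illustrative assumptions rather than lines from this patch (scrub is the in-tree caller).

	/* minimal sketch of the reada API declared in ctree.h above */
	struct btrfs_key start = { .objectid = 0, .type = 0, .offset = 0 };
	struct btrfs_key end = { .objectid = (u64)-1, .type = (u8)-1, .offset = (u64)-1 };
	struct reada_control *rc;

	rc = btrfs_reada_add(root, &start, &end);   /* queue background prefetch of the range */
	if (!IS_ERR(rc))
		btrfs_reada_wait(rc);               /* block until the elems count drops to zero */
	/* btrfs_reada_detach(rc) would instead drop our ref and let it finish on its own */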
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index ae4d9cd1096..5b163572e0c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
591 return 0; 591 return 0;
592 592
593 src_rsv = trans->block_rsv; 593 src_rsv = trans->block_rsv;
594 dst_rsv = &root->fs_info->global_block_rsv; 594 dst_rsv = &root->fs_info->delayed_block_rsv;
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
609 if (!item->bytes_reserved) 609 if (!item->bytes_reserved)
610 return; 610 return;
611 611
612 rsv = &root->fs_info->global_block_rsv; 612 rsv = &root->fs_info->delayed_block_rsv;
613 btrfs_block_rsv_release(root, rsv, 613 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 614 item->bytes_reserved);
615} 615}
@@ -617,24 +617,102 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
617static int btrfs_delayed_inode_reserve_metadata( 617static int btrfs_delayed_inode_reserve_metadata(
618 struct btrfs_trans_handle *trans, 618 struct btrfs_trans_handle *trans,
619 struct btrfs_root *root, 619 struct btrfs_root *root,
620 struct inode *inode,
620 struct btrfs_delayed_node *node) 621 struct btrfs_delayed_node *node)
621{ 622{
622 struct btrfs_block_rsv *src_rsv; 623 struct btrfs_block_rsv *src_rsv;
623 struct btrfs_block_rsv *dst_rsv; 624 struct btrfs_block_rsv *dst_rsv;
624 u64 num_bytes; 625 u64 num_bytes;
625 int ret; 626 int ret;
626 627 int release = false;
627 if (!trans->bytes_reserved)
628 return 0;
629 628
630 src_rsv = trans->block_rsv; 629 src_rsv = trans->block_rsv;
631 dst_rsv = &root->fs_info->global_block_rsv; 630 dst_rsv = &root->fs_info->delayed_block_rsv;
632 631
633 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 632 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
633
634 /*
635 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
636 * which doesn't reserve space for speed. This is a problem since we
637 * still need to reserve space for this update, so try to reserve the
638 * space.
639 *
640 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
641 * we're accounted for.
642 */
643 if (!trans->bytes_reserved &&
644 src_rsv != &root->fs_info->delalloc_block_rsv) {
645 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
646 /*
647 * Since we're under a transaction reserve_metadata_bytes could
648 * try to commit the transaction which will make it return
649 * EAGAIN to make us stop the transaction we have, so return
650 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
651 */
652 if (ret == -EAGAIN)
653 ret = -ENOSPC;
654 if (!ret)
655 node->bytes_reserved = num_bytes;
656 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock);
659 if (BTRFS_I(inode)->delalloc_meta_reserved) {
660 BTRFS_I(inode)->delalloc_meta_reserved = 0;
661 spin_unlock(&BTRFS_I(inode)->lock);
662 release = true;
663 goto migrate;
664 }
665 spin_unlock(&BTRFS_I(inode)->lock);
666
667 /* Ok we didn't have space pre-reserved. This shouldn't happen
668 * too often but it can happen if we do delalloc to an existing
669 * inode which gets dirtied because of the time update, and then
670 * isn't touched again until after the transaction commits and
671 * then we try to write out the data. First try to be nice and
672 * reserve something strictly for us. If not be a pain and try
673 * to steal from the delalloc block rsv.
674 */
675 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
676 if (!ret)
677 goto out;
678
679 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
680 if (!ret)
681 goto out;
682
683 /*
684 * Ok this is a problem, let's just steal from the global rsv
685 * since this really shouldn't happen that often.
686 */
687 WARN_ON(1);
688 ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
689 dst_rsv, num_bytes);
690 goto out;
691 }
692
693migrate:
634 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 694 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
695
696out:
697 /*
698 * Migrate only takes a reservation, it doesn't touch the size of the
 699	 * block_rsv. This keeps things simple for callers that don't normally have
 700	 * anything migrated from their block rsv. If they go to release their

701 * reservation, that will decrease the size as well, so if migrate
702 * reduced size we'd end up with a negative size. But for the
703 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
704 * but we could in fact do this reserve/migrate dance several times
705 * between the time we did the original reservation and we'd clean it
706 * up. So to take care of this, release the space for the meta
707 * reservation here. I think it may be time for a documentation page on
 708	 * how block rsvs work.
709 */
635 if (!ret) 710 if (!ret)
636 node->bytes_reserved = num_bytes; 711 node->bytes_reserved = num_bytes;
637 712
713 if (release)
714 btrfs_block_rsv_release(root, src_rsv, num_bytes);
715
638 return ret; 716 return ret;
639} 717}
640 718
@@ -646,7 +724,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
646 if (!node->bytes_reserved) 724 if (!node->bytes_reserved)
647 return; 725 return;
648 726
649 rsv = &root->fs_info->global_block_rsv; 727 rsv = &root->fs_info->delayed_block_rsv;
650 btrfs_block_rsv_release(root, rsv, 728 btrfs_block_rsv_release(root, rsv,
651 node->bytes_reserved); 729 node->bytes_reserved);
652 node->bytes_reserved = 0; 730 node->bytes_reserved = 0;
@@ -1026,7 +1104,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1026 path->leave_spinning = 1; 1104 path->leave_spinning = 1;
1027 1105
1028 block_rsv = trans->block_rsv; 1106 block_rsv = trans->block_rsv;
1029 trans->block_rsv = &root->fs_info->global_block_rsv; 1107 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1030 1108
1031 delayed_root = btrfs_get_delayed_root(root); 1109 delayed_root = btrfs_get_delayed_root(root);
1032 1110
@@ -1069,7 +1147,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1069 path->leave_spinning = 1; 1147 path->leave_spinning = 1;
1070 1148
1071 block_rsv = trans->block_rsv; 1149 block_rsv = trans->block_rsv;
1072 trans->block_rsv = &node->root->fs_info->global_block_rsv; 1150 trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
1073 1151
1074 ret = btrfs_insert_delayed_items(trans, path, node->root, node); 1152 ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1075 if (!ret) 1153 if (!ret)
@@ -1149,7 +1227,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1149 goto free_path; 1227 goto free_path;
1150 1228
1151 block_rsv = trans->block_rsv; 1229 block_rsv = trans->block_rsv;
1152 trans->block_rsv = &root->fs_info->global_block_rsv; 1230 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1153 1231
1154 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); 1232 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
1155 if (!ret) 1233 if (!ret)
@@ -1685,12 +1763,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1685 goto release_node; 1763 goto release_node;
1686 } 1764 }
1687 1765
1688 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); 1766 ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
1689 /* 1767 delayed_node);
1690 * we must reserve enough space when we start a new transaction, 1768 if (ret)
1691 * so reserving metadata failure is impossible 1769 goto release_node;
1692 */
1693 BUG_ON(ret);
1694 1770
1695 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1771 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1696 delayed_node->inode_dirty = 1; 1772 delayed_node->inode_dirty = 1;
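The -ENOSPC propagated above is only useful if the caller reacts to it. A sketch of the fallback the comment alludes to in btrfs_dirty_inode(); the real code lives in inode.c and is not part of this hunk, so treat the flow and names here as assumptions:

	/* sketch: retry the inode update with a real reservation on -ENOSPC */
	trans = btrfs_join_transaction(root);          /* fast path, nothing reserved */
	ret = btrfs_update_inode(trans, root, inode);
	if (ret == -ENOSPC) {
		btrfs_end_transaction(trans, root);
		trans = btrfs_start_transaction(root, 1); /* reserve space for one item */
		if (!IS_ERR(trans))
			ret = btrfs_update_inode(trans, root, inode);
	}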
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07ea91879a9..632f8f3cc9d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, 256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
257 int verify) 257 int verify)
258{ 258{
259 u16 csum_size = 259 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
260 btrfs_super_csum_size(&root->fs_info->super_copy);
261 char *result = NULL; 260 char *result = NULL;
262 unsigned long len; 261 unsigned long len;
263 unsigned long cur_len; 262 unsigned long cur_len;
@@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
367 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); 366 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
368 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 367 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
369 while (1) { 368 while (1) {
370 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 369 ret = read_extent_buffer_pages(io_tree, eb, start,
370 WAIT_COMPLETE,
371 btree_get_extent, mirror_num); 371 btree_get_extent, mirror_num);
372 if (!ret && 372 if (!ret &&
373 !verify_parent_transid(io_tree, eb, parent_transid)) 373 !verify_parent_transid(io_tree, eb, parent_transid))
@@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
609 end = eb->start + end - 1; 609 end = eb->start + end - 1;
610err: 610err:
611 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
612 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
613 btree_readahead_hook(root, eb, eb->start, ret);
614 }
615
611 free_extent_buffer(eb); 616 free_extent_buffer(eb);
612out: 617out:
613 return ret; 618 return ret;
614} 619}
615 620
621static int btree_io_failed_hook(struct bio *failed_bio,
622 struct page *page, u64 start, u64 end,
623 int mirror_num, struct extent_state *state)
624{
625 struct extent_io_tree *tree;
626 unsigned long len;
627 struct extent_buffer *eb;
628 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
629
630 tree = &BTRFS_I(page->mapping->host)->io_tree;
631 if (page->private == EXTENT_PAGE_PRIVATE)
632 goto out;
633 if (!page->private)
634 goto out;
635
636 len = page->private >> 2;
637 WARN_ON(len == 0);
638
639 eb = alloc_extent_buffer(tree, start, len, page);
640 if (eb == NULL)
641 goto out;
642
643 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
644 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
645 btree_readahead_hook(root, eb, eb->start, -EIO);
646 }
647 free_extent_buffer(eb);
648
649out:
650 return -EIO; /* we fixed nothing */
651}
652
616static void end_workqueue_bio(struct bio *bio, int err) 653static void end_workqueue_bio(struct bio *bio, int err)
617{ 654{
618 struct end_io_wq *end_io_wq = bio->bi_private; 655 struct end_io_wq *end_io_wq = bio->bi_private;
@@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page)
908{ 945{
909 struct extent_io_tree *tree; 946 struct extent_io_tree *tree;
910 tree = &BTRFS_I(page->mapping->host)->io_tree; 947 tree = &BTRFS_I(page->mapping->host)->io_tree;
911 return extent_read_full_page(tree, page, btree_get_extent); 948 return extent_read_full_page(tree, page, btree_get_extent, 0);
912} 949}
913 950
914static int btree_releasepage(struct page *page, gfp_t gfp_flags) 951static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
974 if (!buf) 1011 if (!buf)
975 return 0; 1012 return 0;
976 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1013 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
977 buf, 0, 0, btree_get_extent, 0); 1014 buf, 0, WAIT_NONE, btree_get_extent, 0);
978 free_extent_buffer(buf); 1015 free_extent_buffer(buf);
979 return ret; 1016 return ret;
980} 1017}
981 1018
1019int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1020 int mirror_num, struct extent_buffer **eb)
1021{
1022 struct extent_buffer *buf = NULL;
1023 struct inode *btree_inode = root->fs_info->btree_inode;
1024 struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1025 int ret;
1026
1027 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1028 if (!buf)
1029 return 0;
1030
1031 set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
1032
1033 ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
1034 btree_get_extent, mirror_num);
1035 if (ret) {
1036 free_extent_buffer(buf);
1037 return ret;
1038 }
1039
1040 if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
1041 free_extent_buffer(buf);
1042 return -EIO;
1043 } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
1044 *eb = buf;
1045 } else {
1046 free_extent_buffer(buf);
1047 }
1048 return 0;
1049}
1050
982struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1051struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
983 u64 bytenr, u32 blocksize) 1052 u64 bytenr, u32 blocksize)
984{ 1053{
@@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1135 1204
1136 generation = btrfs_root_generation(&root->root_item); 1205 generation = btrfs_root_generation(&root->root_item);
1137 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1206 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1207 root->commit_root = NULL;
1138 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1208 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1139 blocksize, generation); 1209 blocksize, generation);
1140 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { 1210 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1141 free_extent_buffer(root->node); 1211 free_extent_buffer(root->node);
1212 root->node = NULL;
1142 return -EIO; 1213 return -EIO;
1143 } 1214 }
1144 root->commit_root = btrfs_root_node(root); 1215 root->commit_root = btrfs_root_node(root);
@@ -1577,6 +1648,235 @@ sleep:
1577 return 0; 1648 return 0;
1578} 1649}
1579 1650
1651/*
1652 * this will find the highest generation in the array of
 1653 * root backups. The index of the newest backup is returned,
1654 * or -1 if we can't find anything.
1655 *
1656 * We check to make sure the array is valid by comparing the
1657 * generation of the latest root in the array with the generation
1658 * in the super block. If they don't match we pitch it.
1659 */
1660static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
1661{
1662 u64 cur;
1663 int newest_index = -1;
1664 struct btrfs_root_backup *root_backup;
1665 int i;
1666
1667 for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1668 root_backup = info->super_copy->super_roots + i;
1669 cur = btrfs_backup_tree_root_gen(root_backup);
1670 if (cur == newest_gen)
1671 newest_index = i;
1672 }
1673
1674 /* check to see if we actually wrapped around */
1675 if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
1676 root_backup = info->super_copy->super_roots;
1677 cur = btrfs_backup_tree_root_gen(root_backup);
1678 if (cur == newest_gen)
1679 newest_index = 0;
1680 }
1681 return newest_index;
1682}
1683
1684
1685/*
1686 * find the oldest backup so we know where to store new entries
1687 * in the backup array. This will set the backup_root_index
1688 * field in the fs_info struct
1689 */
1690static void find_oldest_super_backup(struct btrfs_fs_info *info,
1691 u64 newest_gen)
1692{
1693 int newest_index = -1;
1694
1695 newest_index = find_newest_super_backup(info, newest_gen);
1696 /* if there was garbage in there, just move along */
1697 if (newest_index == -1) {
1698 info->backup_root_index = 0;
1699 } else {
1700 info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
1701 }
1702}
1703
1704/*
1705 * copy all the root pointers into the super backup array.
1706 * this will bump the backup pointer by one when it is
1707 * done
1708 */
1709static void backup_super_roots(struct btrfs_fs_info *info)
1710{
1711 int next_backup;
1712 struct btrfs_root_backup *root_backup;
1713 int last_backup;
1714
1715 next_backup = info->backup_root_index;
1716 last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
1717 BTRFS_NUM_BACKUP_ROOTS;
1718
1719 /*
 1720	 * just overwrite the last backup if we're at the same generation;
 1721	 * this happens only at umount
1722 */
1723 root_backup = info->super_for_commit->super_roots + last_backup;
1724 if (btrfs_backup_tree_root_gen(root_backup) ==
1725 btrfs_header_generation(info->tree_root->node))
1726 next_backup = last_backup;
1727
1728 root_backup = info->super_for_commit->super_roots + next_backup;
1729
1730 /*
1731 * make sure all of our padding and empty slots get zero filled
1732 * regardless of which ones we use today
1733 */
1734 memset(root_backup, 0, sizeof(*root_backup));
1735
1736 info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1737
1738 btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1739 btrfs_set_backup_tree_root_gen(root_backup,
1740 btrfs_header_generation(info->tree_root->node));
1741
1742 btrfs_set_backup_tree_root_level(root_backup,
1743 btrfs_header_level(info->tree_root->node));
1744
1745 btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1746 btrfs_set_backup_chunk_root_gen(root_backup,
1747 btrfs_header_generation(info->chunk_root->node));
1748 btrfs_set_backup_chunk_root_level(root_backup,
1749 btrfs_header_level(info->chunk_root->node));
1750
1751 btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
1752 btrfs_set_backup_extent_root_gen(root_backup,
1753 btrfs_header_generation(info->extent_root->node));
1754 btrfs_set_backup_extent_root_level(root_backup,
1755 btrfs_header_level(info->extent_root->node));
1756
1757 /*
1758 * we might commit during log recovery, which happens before we set
1759 * the fs_root. Make sure it is valid before we fill it in.
1760 */
1761 if (info->fs_root && info->fs_root->node) {
1762 btrfs_set_backup_fs_root(root_backup,
1763 info->fs_root->node->start);
1764 btrfs_set_backup_fs_root_gen(root_backup,
1765 btrfs_header_generation(info->fs_root->node));
1766 btrfs_set_backup_fs_root_level(root_backup,
1767 btrfs_header_level(info->fs_root->node));
1768 }
1769
1770 btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1771 btrfs_set_backup_dev_root_gen(root_backup,
1772 btrfs_header_generation(info->dev_root->node));
1773 btrfs_set_backup_dev_root_level(root_backup,
1774 btrfs_header_level(info->dev_root->node));
1775
1776 btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
1777 btrfs_set_backup_csum_root_gen(root_backup,
1778 btrfs_header_generation(info->csum_root->node));
1779 btrfs_set_backup_csum_root_level(root_backup,
1780 btrfs_header_level(info->csum_root->node));
1781
1782 btrfs_set_backup_total_bytes(root_backup,
1783 btrfs_super_total_bytes(info->super_copy));
1784 btrfs_set_backup_bytes_used(root_backup,
1785 btrfs_super_bytes_used(info->super_copy));
1786 btrfs_set_backup_num_devices(root_backup,
1787 btrfs_super_num_devices(info->super_copy));
1788
1789 /*
1790 * if we don't copy this out to the super_copy, it won't get remembered
1791 * for the next commit
1792 */
1793 memcpy(&info->super_copy->super_roots,
1794 &info->super_for_commit->super_roots,
1795 sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1796}
1797
1798/*
1799 * this copies info out of the root backup array and back into
1800 * the in-memory super block. It is meant to help iterate through
1801 * the array, so you send it the number of backups you've already
1802 * tried and the last backup index you used.
1803 *
1804 * this returns -1 when it has tried all the backups
1805 */
1806static noinline int next_root_backup(struct btrfs_fs_info *info,
1807 struct btrfs_super_block *super,
1808 int *num_backups_tried, int *backup_index)
1809{
1810 struct btrfs_root_backup *root_backup;
1811 int newest = *backup_index;
1812
1813 if (*num_backups_tried == 0) {
1814 u64 gen = btrfs_super_generation(super);
1815
1816 newest = find_newest_super_backup(info, gen);
1817 if (newest == -1)
1818 return -1;
1819
1820 *backup_index = newest;
1821 *num_backups_tried = 1;
1822 } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
1823 /* we've tried all the backups, all done */
1824 return -1;
1825 } else {
1826 /* jump to the next oldest backup */
1827 newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
1828 BTRFS_NUM_BACKUP_ROOTS;
1829 *backup_index = newest;
1830 *num_backups_tried += 1;
1831 }
1832 root_backup = super->super_roots + newest;
1833
1834 btrfs_set_super_generation(super,
1835 btrfs_backup_tree_root_gen(root_backup));
1836 btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1837 btrfs_set_super_root_level(super,
1838 btrfs_backup_tree_root_level(root_backup));
1839 btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1840
1841 /*
 1842	 * fixme: the total bytes and num_devices need to match or we should
 1843	 * require a fsck
1844 */
1845 btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
1846 btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
1847 return 0;
1848}
1849
1850/* helper to cleanup tree roots */
1851static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1852{
1853 free_extent_buffer(info->tree_root->node);
1854 free_extent_buffer(info->tree_root->commit_root);
1855 free_extent_buffer(info->dev_root->node);
1856 free_extent_buffer(info->dev_root->commit_root);
1857 free_extent_buffer(info->extent_root->node);
1858 free_extent_buffer(info->extent_root->commit_root);
1859 free_extent_buffer(info->csum_root->node);
1860 free_extent_buffer(info->csum_root->commit_root);
1861
1862 info->tree_root->node = NULL;
1863 info->tree_root->commit_root = NULL;
1864 info->dev_root->node = NULL;
1865 info->dev_root->commit_root = NULL;
1866 info->extent_root->node = NULL;
1867 info->extent_root->commit_root = NULL;
1868 info->csum_root->node = NULL;
1869 info->csum_root->commit_root = NULL;
1870
1871 if (chunk_root) {
1872 free_extent_buffer(info->chunk_root->node);
1873 free_extent_buffer(info->chunk_root->commit_root);
1874 info->chunk_root->node = NULL;
1875 info->chunk_root->commit_root = NULL;
1876 }
1877}
1878
1879
1580struct btrfs_root *open_ctree(struct super_block *sb, 1880struct btrfs_root *open_ctree(struct super_block *sb,
1581 struct btrfs_fs_devices *fs_devices, 1881 struct btrfs_fs_devices *fs_devices,
1582 char *options) 1882 char *options)
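The backup slots managed by the helpers above behave as a small ring buffer; the index arithmetic reduces to the sketch below. BTRFS_NUM_BACKUP_ROOTS is assumed to be 4 here, the slot numbers are only an example, and info/gen stand in for the usual open_ctree context.

	/* illustrative ring walk, assuming BTRFS_NUM_BACKUP_ROOTS == 4 */
	int newest = find_newest_super_backup(info, gen);       /* e.g. slot 2          */
	int next   = (newest + 1) % BTRFS_NUM_BACKUP_ROOTS;      /* 3: next commit slot  */
	int older  = (newest + BTRFS_NUM_BACKUP_ROOTS - 1)
		     % BTRFS_NUM_BACKUP_ROOTS;                    /* 1: recovery fallback */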
@@ -1590,29 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1590 u64 features; 1890 u64 features;
1591 struct btrfs_key location; 1891 struct btrfs_key location;
1592 struct buffer_head *bh; 1892 struct buffer_head *bh;
1593 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), 1893 struct btrfs_super_block *disk_super;
1594 GFP_NOFS);
1595 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1596 GFP_NOFS);
1597 struct btrfs_root *tree_root = btrfs_sb(sb); 1894 struct btrfs_root *tree_root = btrfs_sb(sb);
1598 struct btrfs_fs_info *fs_info = NULL; 1895 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1599 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1896 struct btrfs_root *extent_root;
1600 GFP_NOFS); 1897 struct btrfs_root *csum_root;
1601 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1898 struct btrfs_root *chunk_root;
1602 GFP_NOFS); 1899 struct btrfs_root *dev_root;
1603 struct btrfs_root *log_tree_root; 1900 struct btrfs_root *log_tree_root;
1604
1605 int ret; 1901 int ret;
1606 int err = -EINVAL; 1902 int err = -EINVAL;
1607 1903 int num_backups_tried = 0;
1608 struct btrfs_super_block *disk_super; 1904 int backup_index = 0;
1609 1905
1610 if (!extent_root || !tree_root || !tree_root->fs_info || 1906 extent_root = fs_info->extent_root =
1611 !chunk_root || !dev_root || !csum_root) { 1907 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1908 csum_root = fs_info->csum_root =
1909 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1910 chunk_root = fs_info->chunk_root =
1911 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1912 dev_root = fs_info->dev_root =
1913 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1914
1915 if (!extent_root || !csum_root || !chunk_root || !dev_root) {
1612 err = -ENOMEM; 1916 err = -ENOMEM;
1613 goto fail; 1917 goto fail;
1614 } 1918 }
1615 fs_info = tree_root->fs_info;
1616 1919
1617 ret = init_srcu_struct(&fs_info->subvol_srcu); 1920 ret = init_srcu_struct(&fs_info->subvol_srcu);
1618 if (ret) { 1921 if (ret) {
@@ -1648,15 +1951,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1648 spin_lock_init(&fs_info->fs_roots_radix_lock); 1951 spin_lock_init(&fs_info->fs_roots_radix_lock);
1649 spin_lock_init(&fs_info->delayed_iput_lock); 1952 spin_lock_init(&fs_info->delayed_iput_lock);
1650 spin_lock_init(&fs_info->defrag_inodes_lock); 1953 spin_lock_init(&fs_info->defrag_inodes_lock);
1954 spin_lock_init(&fs_info->free_chunk_lock);
1651 mutex_init(&fs_info->reloc_mutex); 1955 mutex_init(&fs_info->reloc_mutex);
1652 1956
1653 init_completion(&fs_info->kobj_unregister); 1957 init_completion(&fs_info->kobj_unregister);
1654 fs_info->tree_root = tree_root;
1655 fs_info->extent_root = extent_root;
1656 fs_info->csum_root = csum_root;
1657 fs_info->chunk_root = chunk_root;
1658 fs_info->dev_root = dev_root;
1659 fs_info->fs_devices = fs_devices;
1660 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1958 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1661 INIT_LIST_HEAD(&fs_info->space_info); 1959 INIT_LIST_HEAD(&fs_info->space_info);
1662 btrfs_mapping_init(&fs_info->mapping_tree); 1960 btrfs_mapping_init(&fs_info->mapping_tree);
@@ -1665,8 +1963,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1665 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 1963 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1666 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 1964 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1667 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 1965 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1668 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); 1966 btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
1669 mutex_init(&fs_info->durable_block_rsv_mutex);
1670 atomic_set(&fs_info->nr_async_submits, 0); 1967 atomic_set(&fs_info->nr_async_submits, 0);
1671 atomic_set(&fs_info->async_delalloc_pages, 0); 1968 atomic_set(&fs_info->async_delalloc_pages, 0);
1672 atomic_set(&fs_info->async_submit_draining, 0); 1969 atomic_set(&fs_info->async_submit_draining, 0);
@@ -1677,6 +1974,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1677 fs_info->metadata_ratio = 0; 1974 fs_info->metadata_ratio = 0;
1678 fs_info->defrag_inodes = RB_ROOT; 1975 fs_info->defrag_inodes = RB_ROOT;
1679 fs_info->trans_no_join = 0; 1976 fs_info->trans_no_join = 0;
1977 fs_info->free_chunk_space = 0;
1978
1979 /* readahead state */
1980 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
1981 spin_lock_init(&fs_info->reada_lock);
1680 1982
1681 fs_info->thread_pool_size = min_t(unsigned long, 1983 fs_info->thread_pool_size = min_t(unsigned long,
1682 num_online_cpus() + 2, 8); 1984 num_online_cpus() + 2, 8);
@@ -1766,14 +2068,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1766 goto fail_alloc; 2068 goto fail_alloc;
1767 } 2069 }
1768 2070
1769 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 2071 memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
1770 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 2072 memcpy(fs_info->super_for_commit, fs_info->super_copy,
1771 sizeof(fs_info->super_for_commit)); 2073 sizeof(*fs_info->super_for_commit));
1772 brelse(bh); 2074 brelse(bh);
1773 2075
1774 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); 2076 memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
1775 2077
1776 disk_super = &fs_info->super_copy; 2078 disk_super = fs_info->super_copy;
1777 if (!btrfs_super_root(disk_super)) 2079 if (!btrfs_super_root(disk_super))
1778 goto fail_alloc; 2080 goto fail_alloc;
1779 2081
@@ -1783,6 +2085,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1783 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 2085 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1784 2086
1785 /* 2087 /*
 2088	 * run through our array of backup supers and set up
2089 * our ring pointer to the oldest one
2090 */
2091 generation = btrfs_super_generation(disk_super);
2092 find_oldest_super_backup(fs_info, generation);
2093
2094 /*
1786 * In the long term, we'll store the compression type in the super 2095 * In the long term, we'll store the compression type in the super
1787 * block, and it'll be used for per file compression control. 2096 * block, and it'll be used for per file compression control.
1788 */ 2097 */
@@ -1870,6 +2179,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1870 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", 2179 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
1871 fs_info->thread_pool_size, 2180 fs_info->thread_pool_size,
1872 &fs_info->generic_worker); 2181 &fs_info->generic_worker);
2182 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2183 fs_info->thread_pool_size,
2184 &fs_info->generic_worker);
1873 2185
1874 /* 2186 /*
1875 * endios are largely parallel and should have a very 2187 * endios are largely parallel and should have a very
@@ -1880,6 +2192,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1880 2192
1881 fs_info->endio_write_workers.idle_thresh = 2; 2193 fs_info->endio_write_workers.idle_thresh = 2;
1882 fs_info->endio_meta_write_workers.idle_thresh = 2; 2194 fs_info->endio_meta_write_workers.idle_thresh = 2;
2195 fs_info->readahead_workers.idle_thresh = 2;
1883 2196
1884 btrfs_start_workers(&fs_info->workers, 1); 2197 btrfs_start_workers(&fs_info->workers, 1);
1885 btrfs_start_workers(&fs_info->generic_worker, 1); 2198 btrfs_start_workers(&fs_info->generic_worker, 1);
@@ -1893,6 +2206,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1893 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 2206 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1894 btrfs_start_workers(&fs_info->delayed_workers, 1); 2207 btrfs_start_workers(&fs_info->delayed_workers, 1);
1895 btrfs_start_workers(&fs_info->caching_workers, 1); 2208 btrfs_start_workers(&fs_info->caching_workers, 1);
2209 btrfs_start_workers(&fs_info->readahead_workers, 1);
1896 2210
1897 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 2211 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1898 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 2212 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1939,7 +2253,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1939 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2253 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
1940 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", 2254 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
1941 sb->s_id); 2255 sb->s_id);
1942 goto fail_chunk_root; 2256 goto fail_tree_roots;
1943 } 2257 }
1944 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); 2258 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1945 chunk_root->commit_root = btrfs_root_node(chunk_root); 2259 chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1954,11 +2268,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1954 if (ret) { 2268 if (ret) {
1955 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2269 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1956 sb->s_id); 2270 sb->s_id);
1957 goto fail_chunk_root; 2271 goto fail_tree_roots;
1958 } 2272 }
1959 2273
1960 btrfs_close_extra_devices(fs_devices); 2274 btrfs_close_extra_devices(fs_devices);
1961 2275
2276retry_root_backup:
1962 blocksize = btrfs_level_size(tree_root, 2277 blocksize = btrfs_level_size(tree_root,
1963 btrfs_super_root_level(disk_super)); 2278 btrfs_super_root_level(disk_super));
1964 generation = btrfs_super_generation(disk_super); 2279 generation = btrfs_super_generation(disk_super);
@@ -1966,32 +2281,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1966 tree_root->node = read_tree_block(tree_root, 2281 tree_root->node = read_tree_block(tree_root,
1967 btrfs_super_root(disk_super), 2282 btrfs_super_root(disk_super),
1968 blocksize, generation); 2283 blocksize, generation);
1969 if (!tree_root->node) 2284 if (!tree_root->node ||
1970 goto fail_chunk_root; 2285 !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1971 if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1972 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", 2286 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
1973 sb->s_id); 2287 sb->s_id);
1974 goto fail_tree_root; 2288
2289 goto recovery_tree_root;
1975 } 2290 }
2291
1976 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2292 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1977 tree_root->commit_root = btrfs_root_node(tree_root); 2293 tree_root->commit_root = btrfs_root_node(tree_root);
1978 2294
1979 ret = find_and_setup_root(tree_root, fs_info, 2295 ret = find_and_setup_root(tree_root, fs_info,
1980 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2296 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1981 if (ret) 2297 if (ret)
1982 goto fail_tree_root; 2298 goto recovery_tree_root;
1983 extent_root->track_dirty = 1; 2299 extent_root->track_dirty = 1;
1984 2300
1985 ret = find_and_setup_root(tree_root, fs_info, 2301 ret = find_and_setup_root(tree_root, fs_info,
1986 BTRFS_DEV_TREE_OBJECTID, dev_root); 2302 BTRFS_DEV_TREE_OBJECTID, dev_root);
1987 if (ret) 2303 if (ret)
1988 goto fail_extent_root; 2304 goto recovery_tree_root;
1989 dev_root->track_dirty = 1; 2305 dev_root->track_dirty = 1;
1990 2306
1991 ret = find_and_setup_root(tree_root, fs_info, 2307 ret = find_and_setup_root(tree_root, fs_info,
1992 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2308 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1993 if (ret) 2309 if (ret)
1994 goto fail_dev_root; 2310 goto recovery_tree_root;
1995 2311
1996 csum_root->track_dirty = 1; 2312 csum_root->track_dirty = 1;
1997 2313
@@ -2124,22 +2440,13 @@ fail_cleaner:
2124 2440
2125fail_block_groups: 2441fail_block_groups:
2126 btrfs_free_block_groups(fs_info); 2442 btrfs_free_block_groups(fs_info);
2127 free_extent_buffer(csum_root->node); 2443
2128 free_extent_buffer(csum_root->commit_root); 2444fail_tree_roots:
2129fail_dev_root: 2445 free_root_pointers(fs_info, 1);
2130 free_extent_buffer(dev_root->node); 2446
2131 free_extent_buffer(dev_root->commit_root);
2132fail_extent_root:
2133 free_extent_buffer(extent_root->node);
2134 free_extent_buffer(extent_root->commit_root);
2135fail_tree_root:
2136 free_extent_buffer(tree_root->node);
2137 free_extent_buffer(tree_root->commit_root);
2138fail_chunk_root:
2139 free_extent_buffer(chunk_root->node);
2140 free_extent_buffer(chunk_root->commit_root);
2141fail_sb_buffer: 2447fail_sb_buffer:
2142 btrfs_stop_workers(&fs_info->generic_worker); 2448 btrfs_stop_workers(&fs_info->generic_worker);
2449 btrfs_stop_workers(&fs_info->readahead_workers);
2143 btrfs_stop_workers(&fs_info->fixup_workers); 2450 btrfs_stop_workers(&fs_info->fixup_workers);
2144 btrfs_stop_workers(&fs_info->delalloc_workers); 2451 btrfs_stop_workers(&fs_info->delalloc_workers);
2145 btrfs_stop_workers(&fs_info->workers); 2452 btrfs_stop_workers(&fs_info->workers);
@@ -2152,25 +2459,37 @@ fail_sb_buffer:
2152 btrfs_stop_workers(&fs_info->delayed_workers); 2459 btrfs_stop_workers(&fs_info->delayed_workers);
2153 btrfs_stop_workers(&fs_info->caching_workers); 2460 btrfs_stop_workers(&fs_info->caching_workers);
2154fail_alloc: 2461fail_alloc:
2155 kfree(fs_info->delayed_root);
2156fail_iput: 2462fail_iput:
2463 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2464
2157 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2465 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2158 iput(fs_info->btree_inode); 2466 iput(fs_info->btree_inode);
2159
2160 btrfs_close_devices(fs_info->fs_devices);
2161 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2162fail_bdi: 2467fail_bdi:
2163 bdi_destroy(&fs_info->bdi); 2468 bdi_destroy(&fs_info->bdi);
2164fail_srcu: 2469fail_srcu:
2165 cleanup_srcu_struct(&fs_info->subvol_srcu); 2470 cleanup_srcu_struct(&fs_info->subvol_srcu);
2166fail: 2471fail:
2167 kfree(extent_root); 2472 btrfs_close_devices(fs_info->fs_devices);
2168 kfree(tree_root); 2473 free_fs_info(fs_info);
2169 kfree(fs_info);
2170 kfree(chunk_root);
2171 kfree(dev_root);
2172 kfree(csum_root);
2173 return ERR_PTR(err); 2474 return ERR_PTR(err);
2475
2476recovery_tree_root:
2477 if (!btrfs_test_opt(tree_root, RECOVERY))
2478 goto fail_tree_roots;
2479
2480 free_root_pointers(fs_info, 0);
2481
2482 /* don't use the log in recovery mode, it won't be valid */
2483 btrfs_set_super_log_root(disk_super, 0);
2484
2485 /* we can't trust the free space cache either */
2486 btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2487
2488 ret = next_root_backup(fs_info, fs_info->super_copy,
2489 &num_backups_tried, &backup_index);
2490 if (ret == -1)
2491 goto fail_block_groups;
2492 goto retry_root_backup;
2174} 2493}
2175 2494
2176static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2495static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2254,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device,
2254 int errors = 0; 2573 int errors = 0;
2255 u32 crc; 2574 u32 crc;
2256 u64 bytenr; 2575 u64 bytenr;
2257 int last_barrier = 0;
2258 2576
2259 if (max_mirrors == 0) 2577 if (max_mirrors == 0)
2260 max_mirrors = BTRFS_SUPER_MIRROR_MAX; 2578 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
2261 2579
2262 /* make sure only the last submit_bh does a barrier */
2263 if (do_barriers) {
2264 for (i = 0; i < max_mirrors; i++) {
2265 bytenr = btrfs_sb_offset(i);
2266 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
2267 device->total_bytes)
2268 break;
2269 last_barrier = i;
2270 }
2271 }
2272
2273 for (i = 0; i < max_mirrors; i++) { 2580 for (i = 0; i < max_mirrors; i++) {
2274 bytenr = btrfs_sb_offset(i); 2581 bytenr = btrfs_sb_offset(i);
2275 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) 2582 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2315,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device,
2315 bh->b_end_io = btrfs_end_buffer_write_sync; 2622 bh->b_end_io = btrfs_end_buffer_write_sync;
2316 } 2623 }
2317 2624
2318 if (i == last_barrier && do_barriers) 2625 /*
2319 ret = submit_bh(WRITE_FLUSH_FUA, bh); 2626 * we fua the first super. The others we allow
2320 else 2627 * to go down lazy.
2321 ret = submit_bh(WRITE_SYNC, bh); 2628 */
2322 2629 ret = submit_bh(WRITE_FUA, bh);
2323 if (ret) 2630 if (ret)
2324 errors++; 2631 errors++;
2325 } 2632 }
2326 return errors < i ? 0 : -1; 2633 return errors < i ? 0 : -1;
2327} 2634}
2328 2635
2636/*
 2637 * endio for write_dev_flush; this will wake anyone waiting
2638 * for the barrier when it is done
2639 */
2640static void btrfs_end_empty_barrier(struct bio *bio, int err)
2641{
2642 if (err) {
2643 if (err == -EOPNOTSUPP)
2644 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2645 clear_bit(BIO_UPTODATE, &bio->bi_flags);
2646 }
2647 if (bio->bi_private)
2648 complete(bio->bi_private);
2649 bio_put(bio);
2650}
2651
2652/*
 2653 * trigger flushes for one of the devices. If you pass wait == 0, the flushes are
2654 * sent down. With wait == 1, it waits for the previous flush.
2655 *
 2656 * any device where the flush fails with eopnotsupp is flagged as not-barrier
2657 * capable
2658 */
2659static int write_dev_flush(struct btrfs_device *device, int wait)
2660{
2661 struct bio *bio;
2662 int ret = 0;
2663
2664 if (device->nobarriers)
2665 return 0;
2666
2667 if (wait) {
2668 bio = device->flush_bio;
2669 if (!bio)
2670 return 0;
2671
2672 wait_for_completion(&device->flush_wait);
2673
2674 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
2675 printk("btrfs: disabling barriers on dev %s\n",
2676 device->name);
2677 device->nobarriers = 1;
2678 }
2679 if (!bio_flagged(bio, BIO_UPTODATE)) {
2680 ret = -EIO;
2681 }
2682
2683 /* drop the reference from the wait == 0 run */
2684 bio_put(bio);
2685 device->flush_bio = NULL;
2686
2687 return ret;
2688 }
2689
2690 /*
2691 * one reference for us, and we leave it for the
2692 * caller
2693 */
 2694	device->flush_bio = NULL;
2695 bio = bio_alloc(GFP_NOFS, 0);
2696 if (!bio)
2697 return -ENOMEM;
2698
2699 bio->bi_end_io = btrfs_end_empty_barrier;
2700 bio->bi_bdev = device->bdev;
2701 init_completion(&device->flush_wait);
2702 bio->bi_private = &device->flush_wait;
2703 device->flush_bio = bio;
2704
2705 bio_get(bio);
2706 submit_bio(WRITE_FLUSH, bio);
2707
2708 return 0;
2709}
2710
2711/*
2712 * send an empty flush down to each device in parallel,
2713 * then wait for them
2714 */
2715static int barrier_all_devices(struct btrfs_fs_info *info)
2716{
2717 struct list_head *head;
2718 struct btrfs_device *dev;
2719 int errors = 0;
2720 int ret;
2721
2722 /* send down all the barriers */
2723 head = &info->fs_devices->devices;
2724 list_for_each_entry_rcu(dev, head, dev_list) {
2725 if (!dev->bdev) {
2726 errors++;
2727 continue;
2728 }
2729 if (!dev->in_fs_metadata || !dev->writeable)
2730 continue;
2731
2732 ret = write_dev_flush(dev, 0);
2733 if (ret)
2734 errors++;
2735 }
2736
2737 /* wait for all the barriers */
2738 list_for_each_entry_rcu(dev, head, dev_list) {
2739 if (!dev->bdev) {
2740 errors++;
2741 continue;
2742 }
2743 if (!dev->in_fs_metadata || !dev->writeable)
2744 continue;
2745
2746 ret = write_dev_flush(dev, 1);
2747 if (ret)
2748 errors++;
2749 }
2750 if (errors)
2751 return -EIO;
2752 return 0;
2753}
2754
2329int write_all_supers(struct btrfs_root *root, int max_mirrors) 2755int write_all_supers(struct btrfs_root *root, int max_mirrors)
2330{ 2756{
2331 struct list_head *head; 2757 struct list_head *head;
@@ -2338,14 +2764,19 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2338 int total_errors = 0; 2764 int total_errors = 0;
2339 u64 flags; 2765 u64 flags;
2340 2766
2341 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 2767 max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
2342 do_barriers = !btrfs_test_opt(root, NOBARRIER); 2768 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2769 backup_super_roots(root->fs_info);
2343 2770
2344 sb = &root->fs_info->super_for_commit; 2771 sb = root->fs_info->super_for_commit;
2345 dev_item = &sb->dev_item; 2772 dev_item = &sb->dev_item;
2346 2773
2347 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2774 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2348 head = &root->fs_info->fs_devices->devices; 2775 head = &root->fs_info->fs_devices->devices;
2776
2777 if (do_barriers)
2778 barrier_all_devices(root->fs_info);
2779
2349 list_for_each_entry_rcu(dev, head, dev_list) { 2780 list_for_each_entry_rcu(dev, head, dev_list) {
2350 if (!dev->bdev) { 2781 if (!dev->bdev) {
2351 total_errors++; 2782 total_errors++;
@@ -2545,8 +2976,6 @@ int close_ctree(struct btrfs_root *root)
2545 /* clear out the rbtree of defraggable inodes */ 2976 /* clear out the rbtree of defraggable inodes */
2546 btrfs_run_defrag_inodes(root->fs_info); 2977 btrfs_run_defrag_inodes(root->fs_info);
2547 2978
2548 btrfs_put_block_group_cache(fs_info);
2549
2550 /* 2979 /*
2551 * Here come 2 situations when btrfs is broken to flip readonly: 2980 * Here come 2 situations when btrfs is broken to flip readonly:
2552 * 2981 *
@@ -2572,6 +3001,8 @@ int close_ctree(struct btrfs_root *root)
2572 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 3001 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2573 } 3002 }
2574 3003
3004 btrfs_put_block_group_cache(fs_info);
3005
2575 kthread_stop(root->fs_info->transaction_kthread); 3006 kthread_stop(root->fs_info->transaction_kthread);
2576 kthread_stop(root->fs_info->cleaner_kthread); 3007 kthread_stop(root->fs_info->cleaner_kthread);
2577 3008
@@ -2603,7 +3034,6 @@ int close_ctree(struct btrfs_root *root)
2603 del_fs_roots(fs_info); 3034 del_fs_roots(fs_info);
2604 3035
2605 iput(fs_info->btree_inode); 3036 iput(fs_info->btree_inode);
2606 kfree(fs_info->delayed_root);
2607 3037
2608 btrfs_stop_workers(&fs_info->generic_worker); 3038 btrfs_stop_workers(&fs_info->generic_worker);
2609 btrfs_stop_workers(&fs_info->fixup_workers); 3039 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2617,6 +3047,7 @@ int close_ctree(struct btrfs_root *root)
2617 btrfs_stop_workers(&fs_info->submit_workers); 3047 btrfs_stop_workers(&fs_info->submit_workers);
2618 btrfs_stop_workers(&fs_info->delayed_workers); 3048 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers); 3049 btrfs_stop_workers(&fs_info->caching_workers);
3050 btrfs_stop_workers(&fs_info->readahead_workers);
2620 3051
2621 btrfs_close_devices(fs_info->fs_devices); 3052 btrfs_close_devices(fs_info->fs_devices);
2622 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3053 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2624,12 +3055,7 @@ int close_ctree(struct btrfs_root *root)
2624 bdi_destroy(&fs_info->bdi); 3055 bdi_destroy(&fs_info->bdi);
2625 cleanup_srcu_struct(&fs_info->subvol_srcu); 3056 cleanup_srcu_struct(&fs_info->subvol_srcu);
2626 3057
2627 kfree(fs_info->extent_root); 3058 free_fs_info(fs_info);
2628 kfree(fs_info->tree_root);
2629 kfree(fs_info->chunk_root);
2630 kfree(fs_info->dev_root);
2631 kfree(fs_info->csum_root);
2632 kfree(fs_info);
2633 3059
2634 return 0; 3060 return 0;
2635} 3061}
@@ -2735,7 +3161,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2735 return ret; 3161 return ret;
2736} 3162}
2737 3163
2738int btree_lock_page_hook(struct page *page) 3164static int btree_lock_page_hook(struct page *page, void *data,
3165 void (*flush_fn)(void *))
2739{ 3166{
2740 struct inode *inode = page->mapping->host; 3167 struct inode *inode = page->mapping->host;
2741 struct btrfs_root *root = BTRFS_I(inode)->root; 3168 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +3179,10 @@ int btree_lock_page_hook(struct page *page)
2752 if (!eb) 3179 if (!eb)
2753 goto out; 3180 goto out;
2754 3181
2755 btrfs_tree_lock(eb); 3182 if (!btrfs_try_tree_write_lock(eb)) {
3183 flush_fn(data);
3184 btrfs_tree_lock(eb);
3185 }
2756 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3186 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2757 3187
2758 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3188 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +3197,10 @@ int btree_lock_page_hook(struct page *page)
2767 btrfs_tree_unlock(eb); 3197 btrfs_tree_unlock(eb);
2768 free_extent_buffer(eb); 3198 free_extent_buffer(eb);
2769out: 3199out:
2770 lock_page(page); 3200 if (!trylock_page(page)) {
3201 flush_fn(data);
3202 lock_page(page);
3203 }
2771 return 0; 3204 return 0;
2772} 3205}
2773 3206
@@ -3123,6 +3556,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3123static struct extent_io_ops btree_extent_io_ops = { 3556static struct extent_io_ops btree_extent_io_ops = {
3124 .write_cache_pages_lock_hook = btree_lock_page_hook, 3557 .write_cache_pages_lock_hook = btree_lock_page_hook,
3125 .readpage_end_io_hook = btree_readpage_end_io_hook, 3558 .readpage_end_io_hook = btree_readpage_end_io_hook,
3559 .readpage_io_failed_hook = btree_io_failed_hook,
3126 .submit_bio_hook = btree_submit_bio_hook, 3560 .submit_bio_hook = btree_submit_bio_hook,
3127 /* note we're sharing with inode.c for the merge bio hook */ 3561 /* note we're sharing with inode.c for the merge bio hook */
3128 .merge_bio_hook = btrfs_merge_bio_hook, 3562 .merge_bio_hook = btrfs_merge_bio_hook,
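The WAIT_COMPLETE, WAIT_NONE and WAIT_PAGE_LOCK values passed to read_extent_buffer_pages() throughout this file come from extent_io.h, which is not shown in this section; the definitions below are a sketch of the assumed meaning rather than a quote of that header.

	/* assumed wait modes for read_extent_buffer_pages() (see extent_io.h) */
	#define WAIT_NONE	0	/* fire and forget, plain readahead              */
	#define WAIT_COMPLETE	1	/* wait for the read to finish (old behaviour)   */
	#define WAIT_PAGE_LOCK	2	/* only wait for the page locks, used by reada.c */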
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67..c99d0a8f13f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid); 40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid); 42 u64 parent_transid);
43int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
44 int mirror_num, struct extent_buffer **eb);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 45struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
83 struct btrfs_fs_info *fs_info); 85 struct btrfs_fs_info *fs_info);
84int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
85 struct btrfs_root *root); 87 struct btrfs_root *root);
86int btree_lock_page_hook(struct page *page);
87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_init_lockdep(void); 90void btrfs_init_lockdep(void);
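Putting reada_tree_block_flagged() together with the readahead hooks added to disk-io.c above, a minimal sketch of how the reada worker is expected to drive a single block; the caller shape, and the logical/blocksize/mirror_num inputs, are assumptions, and the real driver sits in reada.c.

	/* sketch: one readahead request as issued by the reada worker */
	struct extent_buffer *eb = NULL;
	int ret;

	ret = reada_tree_block_flagged(root, logical, blocksize, mirror_num, &eb);
	if (ret == 0 && eb) {
		/* buffer was already uptodate, so no end_io will fire; report it */
		btree_readahead_hook(root, eb, eb->start, 0);
		free_extent_buffer(eb);
	}
	/* otherwise the end_io/failed hooks see EXTENT_BUFFER_READAHEAD and
	 * forward completion (or -EIO) to btree_readahead_hook() */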
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a2462..2ad813674d7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h>
26#include "compat.h" 27#include "compat.h"
27#include "hash.h" 28#include "hash.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -52,6 +53,21 @@ enum {
52 CHUNK_ALLOC_LIMITED = 2, 53 CHUNK_ALLOC_LIMITED = 2,
53}; 54};
54 55
56/*
57 * Control how reservations are dealt with.
58 *
59 * RESERVE_FREE - freeing a reservation.
60 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
61 * ENOSPC accounting
62 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
63 * bytes_may_use as the ENOSPC accounting is done elsewhere
64 */
65enum {
66 RESERVE_FREE = 0,
67 RESERVE_ALLOC = 1,
68 RESERVE_ALLOC_NO_ACCOUNT = 2,
69};
70
55static int update_block_group(struct btrfs_trans_handle *trans, 71static int update_block_group(struct btrfs_trans_handle *trans,
56 struct btrfs_root *root, 72 struct btrfs_root *root,
57 u64 bytenr, u64 num_bytes, int alloc); 73 u64 bytenr, u64 num_bytes, int alloc);
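btrfs_update_reserved_bytes() (declared static a few lines below) is the consumer of these RESERVE_* values. The following is a minimal sketch of the intended accounting, with the read-only block-group handling left out, so treat it as an outline rather than the function from the patch:

	/* sketch: how the RESERVE_* values steer the space accounting */
	static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
					       u64 num_bytes, int reserve)
	{
		struct btrfs_space_info *sinfo = cache->space_info;

		spin_lock(&sinfo->lock);
		spin_lock(&cache->lock);
		if (reserve != RESERVE_FREE) {
			cache->reserved += num_bytes;
			sinfo->bytes_reserved += num_bytes;
			if (reserve == RESERVE_ALLOC)	/* ENOSPC accounting done here */
				sinfo->bytes_may_use -= num_bytes;
		} else {
			cache->reserved -= num_bytes;
			sinfo->bytes_reserved -= num_bytes;
		}
		spin_unlock(&cache->lock);
		spin_unlock(&sinfo->lock);
		return 0;
	}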
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
81 struct btrfs_key *key); 97 struct btrfs_key *key);
82static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 98static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
83 int dump_block_groups); 99 int dump_block_groups);
100static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
101 u64 num_bytes, int reserve);
84 102
85static noinline int 103static noinline int
86block_group_cache_done(struct btrfs_block_group_cache *cache) 104block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104 if (atomic_dec_and_test(&cache->count)) { 122 if (atomic_dec_and_test(&cache->count)) {
105 WARN_ON(cache->pinned > 0); 123 WARN_ON(cache->pinned > 0);
106 WARN_ON(cache->reserved > 0); 124 WARN_ON(cache->reserved > 0);
107 WARN_ON(cache->reserved_pinned > 0);
108 kfree(cache->free_space_ctl); 125 kfree(cache->free_space_ctl);
109 kfree(cache); 126 kfree(cache);
110 } 127 }
@@ -450,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
450 struct btrfs_root *root, 467 struct btrfs_root *root,
451 int load_cache_only) 468 int load_cache_only)
452{ 469{
470 DEFINE_WAIT(wait);
453 struct btrfs_fs_info *fs_info = cache->fs_info; 471 struct btrfs_fs_info *fs_info = cache->fs_info;
454 struct btrfs_caching_control *caching_ctl; 472 struct btrfs_caching_control *caching_ctl;
455 int ret = 0; 473 int ret = 0;
456 474
457 smp_mb(); 475 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
458 if (cache->cached != BTRFS_CACHE_NO) 476 BUG_ON(!caching_ctl);
477
478 INIT_LIST_HEAD(&caching_ctl->list);
479 mutex_init(&caching_ctl->mutex);
480 init_waitqueue_head(&caching_ctl->wait);
481 caching_ctl->block_group = cache;
482 caching_ctl->progress = cache->key.objectid;
483 atomic_set(&caching_ctl->count, 1);
484 caching_ctl->work.func = caching_thread;
485
486 spin_lock(&cache->lock);
487 /*
488 * This should be a rare occasion, but this could happen I think in the
489 * case where one thread starts to load the space cache info, and then
490 * some other thread starts a transaction commit which tries to do an
491 * allocation while the other thread is still loading the space cache
492 * info. The previous loop should have kept us from choosing this block
493 * group, but if we've moved to the state where we will wait on caching
494 * block groups we need to first check if we're doing a fast load here,
495 * so we can wait for it to finish, otherwise we could end up allocating
496 * from a block group whose cache gets evicted for one reason or
497 * another.
498 */
499 while (cache->cached == BTRFS_CACHE_FAST) {
500 struct btrfs_caching_control *ctl;
501
502 ctl = cache->caching_ctl;
503 atomic_inc(&ctl->count);
504 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
505 spin_unlock(&cache->lock);
506
507 schedule();
508
509 finish_wait(&ctl->wait, &wait);
510 put_caching_control(ctl);
511 spin_lock(&cache->lock);
512 }
513
514 if (cache->cached != BTRFS_CACHE_NO) {
515 spin_unlock(&cache->lock);
516 kfree(caching_ctl);
459 return 0; 517 return 0;
518 }
519 WARN_ON(cache->caching_ctl);
520 cache->caching_ctl = caching_ctl;
521 cache->cached = BTRFS_CACHE_FAST;
522 spin_unlock(&cache->lock);
460 523
461 /* 524 /*
462 * We can't do the read from on-disk cache during a commit since we need 525 * We can't do the read from on-disk cache during a commit since we need
@@ -465,57 +528,53 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
465 * we likely hold important locks. 528 * we likely hold important locks.
466 */ 529 */
467 if (trans && (!trans->transaction->in_commit) && 530 if (trans && (!trans->transaction->in_commit) &&
468 (root && root != root->fs_info->tree_root)) { 531 (root && root != root->fs_info->tree_root) &&
469 spin_lock(&cache->lock); 532 btrfs_test_opt(root, SPACE_CACHE)) {
470 if (cache->cached != BTRFS_CACHE_NO) {
471 spin_unlock(&cache->lock);
472 return 0;
473 }
474 cache->cached = BTRFS_CACHE_STARTED;
475 spin_unlock(&cache->lock);
476
477 ret = load_free_space_cache(fs_info, cache); 533 ret = load_free_space_cache(fs_info, cache);
478 534
479 spin_lock(&cache->lock); 535 spin_lock(&cache->lock);
480 if (ret == 1) { 536 if (ret == 1) {
537 cache->caching_ctl = NULL;
481 cache->cached = BTRFS_CACHE_FINISHED; 538 cache->cached = BTRFS_CACHE_FINISHED;
482 cache->last_byte_to_unpin = (u64)-1; 539 cache->last_byte_to_unpin = (u64)-1;
483 } else { 540 } else {
484 cache->cached = BTRFS_CACHE_NO; 541 if (load_cache_only) {
542 cache->caching_ctl = NULL;
543 cache->cached = BTRFS_CACHE_NO;
544 } else {
545 cache->cached = BTRFS_CACHE_STARTED;
546 }
485 } 547 }
486 spin_unlock(&cache->lock); 548 spin_unlock(&cache->lock);
549 wake_up(&caching_ctl->wait);
487 if (ret == 1) { 550 if (ret == 1) {
551 put_caching_control(caching_ctl);
488 free_excluded_extents(fs_info->extent_root, cache); 552 free_excluded_extents(fs_info->extent_root, cache);
489 return 0; 553 return 0;
490 } 554 }
555 } else {
556 /*
557 * We are not going to do the fast caching, set cached to the
558 * appropriate value and wake up any waiters.
559 */
560 spin_lock(&cache->lock);
561 if (load_cache_only) {
562 cache->caching_ctl = NULL;
563 cache->cached = BTRFS_CACHE_NO;
564 } else {
565 cache->cached = BTRFS_CACHE_STARTED;
566 }
567 spin_unlock(&cache->lock);
568 wake_up(&caching_ctl->wait);
491 } 569 }
492 570
493 if (load_cache_only) 571 if (load_cache_only) {
494 return 0; 572 put_caching_control(caching_ctl);
495
496 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
497 BUG_ON(!caching_ctl);
498
499 INIT_LIST_HEAD(&caching_ctl->list);
500 mutex_init(&caching_ctl->mutex);
501 init_waitqueue_head(&caching_ctl->wait);
502 caching_ctl->block_group = cache;
503 caching_ctl->progress = cache->key.objectid;
504 /* one for caching kthread, one for caching block group list */
505 atomic_set(&caching_ctl->count, 2);
506 caching_ctl->work.func = caching_thread;
507
508 spin_lock(&cache->lock);
509 if (cache->cached != BTRFS_CACHE_NO) {
510 spin_unlock(&cache->lock);
511 kfree(caching_ctl);
512 return 0; 573 return 0;
513 } 574 }
514 cache->caching_ctl = caching_ctl;
515 cache->cached = BTRFS_CACHE_STARTED;
516 spin_unlock(&cache->lock);
517 575
518 down_write(&fs_info->extent_commit_sem); 576 down_write(&fs_info->extent_commit_sem);
577 atomic_inc(&caching_ctl->count);
519 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 578 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
520 up_write(&fs_info->extent_commit_sem); 579 up_write(&fs_info->extent_commit_sem);
521 580
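
The wait loop added above for the new BTRFS_CACHE_FAST state has a classic shape: drop the block group lock, sleep on caching_ctl->wait until the fast loader finishes, then retake the lock and re-check. A minimal user-space sketch of that shape, assuming a condition variable in place of the kernel waitqueue (all names below are illustrative, not the kernel's):

#include <pthread.h>

enum cache_state { CACHE_NO, CACHE_STARTED, CACHE_FAST, CACHE_FINISHED };

struct group_model {
	pthread_mutex_t lock;		/* stands in for cache->lock */
	pthread_cond_t fast_done;	/* stands in for caching_ctl->wait */
	enum cache_state cached;
};

/* Returns with lock held and cached guaranteed not to be CACHE_FAST. */
static void wait_for_fast_load(struct group_model *g)
{
	pthread_mutex_lock(&g->lock);
	while (g->cached == CACHE_FAST)
		pthread_cond_wait(&g->fast_done, &g->lock);
	/* caller now decides: already cached, or take over caching itself */
}
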
@@ -1770,18 +1829,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1770{ 1829{
1771 int ret; 1830 int ret;
1772 u64 discarded_bytes = 0; 1831 u64 discarded_bytes = 0;
1773 struct btrfs_multi_bio *multi = NULL; 1832 struct btrfs_bio *bbio = NULL;
1774 1833
1775 1834
1776 /* Tell the block device(s) that the sectors can be discarded */ 1835 /* Tell the block device(s) that the sectors can be discarded */
1777 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1836 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1778 bytenr, &num_bytes, &multi, 0); 1837 bytenr, &num_bytes, &bbio, 0);
1779 if (!ret) { 1838 if (!ret) {
1780 struct btrfs_bio_stripe *stripe = multi->stripes; 1839 struct btrfs_bio_stripe *stripe = bbio->stripes;
1781 int i; 1840 int i;
1782 1841
1783 1842
1784 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1843 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1785 if (!stripe->dev->can_discard) 1844 if (!stripe->dev->can_discard)
1786 continue; 1845 continue;
1787 1846
@@ -1800,7 +1859,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1800 */ 1859 */
1801 ret = 0; 1860 ret = 0;
1802 } 1861 }
1803 kfree(multi); 1862 kfree(bbio);
1804 } 1863 }
1805 1864
1806 if (actual_bytes) 1865 if (actual_bytes)
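
btrfs_discard_extent() now walks bbio->stripes (btrfs_multi_bio renamed to btrfs_bio) and skips devices that cannot discard; the per-device discard call itself sits between the lines shown. A simplified stand-alone model of that walk, with a hypothetical per-stripe length field standing in for the real stripe geometry:

#include <stdint.h>

struct stripe_model {
	int can_discard;	/* mirrors stripe->dev->can_discard */
	uint64_t length;	/* hypothetical per-stripe byte count */
};

static uint64_t discard_stripes(const struct stripe_model *stripes, int n)
{
	uint64_t discarded_bytes = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (!stripes[i].can_discard)
			continue;
		/* the kernel issues the discard to this stripe's device here */
		discarded_bytes += stripes[i].length;
	}
	return discarded_bytes;
}
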
@@ -2700,6 +2759,13 @@ again:
2700 goto again; 2759 goto again;
2701 } 2760 }
2702 2761
2762 /* We've already setup this transaction, go ahead and exit */
2763 if (block_group->cache_generation == trans->transid &&
2764 i_size_read(inode)) {
2765 dcs = BTRFS_DC_SETUP;
2766 goto out_put;
2767 }
2768
2703 /* 2769 /*
2704 * We want to set the generation to 0, that way if anything goes wrong 2770 * We want to set the generation to 0, that way if anything goes wrong
2705 * from here on out we know not to trust this cache when we load up next 2771 * from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2815,15 @@ again:
2749 if (!ret) 2815 if (!ret)
2750 dcs = BTRFS_DC_SETUP; 2816 dcs = BTRFS_DC_SETUP;
2751 btrfs_free_reserved_data_space(inode, num_pages); 2817 btrfs_free_reserved_data_space(inode, num_pages);
2818
2752out_put: 2819out_put:
2753 iput(inode); 2820 iput(inode);
2754out_free: 2821out_free:
2755 btrfs_release_path(path); 2822 btrfs_release_path(path);
2756out: 2823out:
2757 spin_lock(&block_group->lock); 2824 spin_lock(&block_group->lock);
2825 if (!ret)
2826 block_group->cache_generation = trans->transid;
2758 block_group->disk_cache_state = dcs; 2827 block_group->disk_cache_state = dcs;
2759 spin_unlock(&block_group->lock); 2828 spin_unlock(&block_group->lock);
2760 2829
@@ -3122,16 +3191,13 @@ commit_trans:
3122 return -ENOSPC; 3191 return -ENOSPC;
3123 } 3192 }
3124 data_sinfo->bytes_may_use += bytes; 3193 data_sinfo->bytes_may_use += bytes;
3125 BTRFS_I(inode)->reserved_bytes += bytes;
3126 spin_unlock(&data_sinfo->lock); 3194 spin_unlock(&data_sinfo->lock);
3127 3195
3128 return 0; 3196 return 0;
3129} 3197}
3130 3198
3131/* 3199/*
3132 * called when we are clearing an delalloc extent from the 3200 * Called if we need to clear a data reservation for this inode.
3133 * inode's io_tree or there was an error for whatever reason
3134 * after calling btrfs_check_data_free_space
3135 */ 3201 */
3136void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3202void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3137{ 3203{
@@ -3144,7 +3210,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3144 data_sinfo = BTRFS_I(inode)->space_info; 3210 data_sinfo = BTRFS_I(inode)->space_info;
3145 spin_lock(&data_sinfo->lock); 3211 spin_lock(&data_sinfo->lock);
3146 data_sinfo->bytes_may_use -= bytes; 3212 data_sinfo->bytes_may_use -= bytes;
3147 BTRFS_I(inode)->reserved_bytes -= bytes;
3148 spin_unlock(&data_sinfo->lock); 3213 spin_unlock(&data_sinfo->lock);
3149} 3214}
3150 3215
@@ -3165,6 +3230,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3165 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3230 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3166 int force) 3231 int force)
3167{ 3232{
3233 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3168 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3234 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3169 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3235 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3170 u64 thresh; 3236 u64 thresh;
@@ -3173,11 +3239,18 @@ static int should_alloc_chunk(struct btrfs_root *root,
3173 return 1; 3239 return 1;
3174 3240
3175 /* 3241 /*
3242 * We need to take into account the global rsv because for all intents
3243 * and purposes it's used space. Don't worry about locking the
3244 * global_rsv, it doesn't change except when the transaction commits.
3245 */
3246 num_allocated += global_rsv->size;
3247
3248 /*
3176 * in limited mode, we want to have some free space up to 3249 * in limited mode, we want to have some free space up to
3177 * about 1% of the FS size. 3250 * about 1% of the FS size.
3178 */ 3251 */
3179 if (force == CHUNK_ALLOC_LIMITED) { 3252 if (force == CHUNK_ALLOC_LIMITED) {
3180 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3253 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3181 thresh = max_t(u64, 64 * 1024 * 1024, 3254 thresh = max_t(u64, 64 * 1024 * 1024,
3182 div_factor_fine(thresh, 1)); 3255 div_factor_fine(thresh, 1));
3183 3256
@@ -3199,7 +3272,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3199 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) 3272 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3200 return 0; 3273 return 0;
3201 3274
3202 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3275 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3203 3276
3204 /* 256MB or 5% of the FS */ 3277 /* 256MB or 5% of the FS */
3205 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3278 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
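
should_alloc_chunk() now counts the global block reservation as allocated space before applying its thresholds. A small stand-alone approximation of the roughly-80% occupancy test visible above (div_factor(x, 8) is about x * 8 / 10); the limited-mode 64MB/1% test and the later 256MB/5% threshold are handled separately in the kernel:

#include <stdint.h>

/* num_bytes is total_bytes - bytes_readonly for the space_info */
static int should_alloc_chunk_model(uint64_t num_bytes, uint64_t bytes_used,
				    uint64_t bytes_reserved,
				    uint64_t global_rsv_size,
				    uint64_t alloc_bytes)
{
	uint64_t num_allocated = bytes_used + bytes_reserved;

	/* new in this change: the global rsv is effectively used space */
	num_allocated += global_rsv_size;

	/* below roughly 80% occupancy there is no need for a new chunk yet */
	if (num_allocated + alloc_bytes < num_bytes / 10 * 8)
		return 0;

	return 1;	/* further size-based thresholds apply in the kernel */
}
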
@@ -3302,24 +3375,26 @@ out:
3302/* 3375/*
3303 * shrink metadata reservation for delalloc 3376 * shrink metadata reservation for delalloc
3304 */ 3377 */
3305static int shrink_delalloc(struct btrfs_trans_handle *trans, 3378static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3306 struct btrfs_root *root, u64 to_reclaim, int sync) 3379 bool wait_ordered)
3307{ 3380{
3308 struct btrfs_block_rsv *block_rsv; 3381 struct btrfs_block_rsv *block_rsv;
3309 struct btrfs_space_info *space_info; 3382 struct btrfs_space_info *space_info;
3383 struct btrfs_trans_handle *trans;
3310 u64 reserved; 3384 u64 reserved;
3311 u64 max_reclaim; 3385 u64 max_reclaim;
3312 u64 reclaimed = 0; 3386 u64 reclaimed = 0;
3313 long time_left; 3387 long time_left;
3314 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3388 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3315 int loops = 0; 3389 int loops = 0;
3316 unsigned long progress; 3390 unsigned long progress;
3317 3391
3392 trans = (struct btrfs_trans_handle *)current->journal_info;
3318 block_rsv = &root->fs_info->delalloc_block_rsv; 3393 block_rsv = &root->fs_info->delalloc_block_rsv;
3319 space_info = block_rsv->space_info; 3394 space_info = block_rsv->space_info;
3320 3395
3321 smp_mb(); 3396 smp_mb();
3322 reserved = space_info->bytes_reserved; 3397 reserved = space_info->bytes_may_use;
3323 progress = space_info->reservation_progress; 3398 progress = space_info->reservation_progress;
3324 3399
3325 if (reserved == 0) 3400 if (reserved == 0)
@@ -3334,18 +3409,20 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3334 } 3409 }
3335 3410
3336 max_reclaim = min(reserved, to_reclaim); 3411 max_reclaim = min(reserved, to_reclaim);
3337 3412 nr_pages = max_t(unsigned long, nr_pages,
3413 max_reclaim >> PAGE_CACHE_SHIFT);
3338 while (loops < 1024) { 3414 while (loops < 1024) {
3339 /* have the flusher threads jump in and do some IO */ 3415 /* have the flusher threads jump in and do some IO */
3340 smp_mb(); 3416 smp_mb();
3341 nr_pages = min_t(unsigned long, nr_pages, 3417 nr_pages = min_t(unsigned long, nr_pages,
3342 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); 3418 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3343 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3419 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3420 WB_REASON_FS_FREE_SPACE);
3344 3421
3345 spin_lock(&space_info->lock); 3422 spin_lock(&space_info->lock);
3346 if (reserved > space_info->bytes_reserved) 3423 if (reserved > space_info->bytes_may_use)
3347 reclaimed += reserved - space_info->bytes_reserved; 3424 reclaimed += reserved - space_info->bytes_may_use;
3348 reserved = space_info->bytes_reserved; 3425 reserved = space_info->bytes_may_use;
3349 spin_unlock(&space_info->lock); 3426 spin_unlock(&space_info->lock);
3350 3427
3351 loops++; 3428 loops++;
@@ -3356,11 +3433,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 if (trans && trans->transaction->blocked) 3433 if (trans && trans->transaction->blocked)
3357 return -EAGAIN; 3434 return -EAGAIN;
3358 3435
3359 time_left = schedule_timeout_interruptible(1); 3436 if (wait_ordered && !trans) {
3437 btrfs_wait_ordered_extents(root, 0, 0);
3438 } else {
3439 time_left = schedule_timeout_interruptible(1);
3360 3440
3361 /* We were interrupted, exit */ 3441 /* We were interrupted, exit */
3362 if (time_left) 3442 if (time_left)
3363 break; 3443 break;
3444 }
3364 3445
3365 /* we've kicked the IO a few times, if anything has been freed, 3446 /* we've kicked the IO a few times, if anything has been freed,
3366 * exit. There is no sense in looping here for a long time 3447 * exit. There is no sense in looping here for a long time
@@ -3375,34 +3456,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3375 } 3456 }
3376 3457
3377 } 3458 }
3378 if (reclaimed >= to_reclaim && !trans) 3459
3379 btrfs_wait_ordered_extents(root, 0, 0);
3380 return reclaimed >= to_reclaim; 3460 return reclaimed >= to_reclaim;
3381} 3461}
3382 3462
3383/* 3463/**
3384 * Retries tells us how many times we've called reserve_metadata_bytes. The 3464 * maybe_commit_transaction - possibly commit the transaction if it's ok to
3385 * idea is if this is the first call (retries == 0) then we will add to our 3465 * @root - the root we're allocating for
3386 * reserved count if we can't make the allocation in order to hold our place 3466 * @bytes - the number of bytes we want to reserve
3387 * while we go and try and free up space. That way for retries > 1 we don't try 3467 * @force - force the commit
3388 * and add space, we just check to see if the amount of unused space is >= the
3389 * total space, meaning that our reservation is valid.
3390 * 3468 *
3391 * However if we don't intend to retry this reservation, pass -1 as retries so 3469 * This will check to make sure that committing the transaction will actually
3392 * that it short circuits this logic. 3470 * get us somewhere and then commit the transaction if it does. Otherwise it
3471 * will return -ENOSPC.
3393 */ 3472 */
3394static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, 3473static int may_commit_transaction(struct btrfs_root *root,
3395 struct btrfs_root *root, 3474 struct btrfs_space_info *space_info,
3475 u64 bytes, int force)
3476{
3477 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3478 struct btrfs_trans_handle *trans;
3479
3480 trans = (struct btrfs_trans_handle *)current->journal_info;
3481 if (trans)
3482 return -EAGAIN;
3483
3484 if (force)
3485 goto commit;
3486
3487 /* See if there is enough pinned space to make this reservation */
3488 spin_lock(&space_info->lock);
3489 if (space_info->bytes_pinned >= bytes) {
3490 spin_unlock(&space_info->lock);
3491 goto commit;
3492 }
3493 spin_unlock(&space_info->lock);
3494
3495 /*
3496 * See if there is some space in the delayed insertion reservation for
3497 * this reservation.
3498 */
3499 if (space_info != delayed_rsv->space_info)
3500 return -ENOSPC;
3501
3502 spin_lock(&delayed_rsv->lock);
3503 if (delayed_rsv->size < bytes) {
3504 spin_unlock(&delayed_rsv->lock);
3505 return -ENOSPC;
3506 }
3507 spin_unlock(&delayed_rsv->lock);
3508
3509commit:
3510 trans = btrfs_join_transaction(root);
3511 if (IS_ERR(trans))
3512 return -ENOSPC;
3513
3514 return btrfs_commit_transaction(trans, root);
3515}
3516
3517/**
3518 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3519 * @root - the root we're allocating for
3520 * @block_rsv - the block_rsv we're allocating for
3521 * @orig_bytes - the number of bytes we want
3522 * @flush - whether or not we can flush to make our reservation
3523 *
3524 * This will reserve orig_bytes number of bytes from the space info associated
3525 * with the block_rsv. If there is not enough space it will make an attempt to
3526 * flush out space to make room. It will do this by flushing delalloc if
3527 * possible or committing the transaction. If flush is 0 then no attempts to
3528 * regain reservations will be made and this will fail if there is not enough
3529 * space already.
3530 */
3531static int reserve_metadata_bytes(struct btrfs_root *root,
3396 struct btrfs_block_rsv *block_rsv, 3532 struct btrfs_block_rsv *block_rsv,
3397 u64 orig_bytes, int flush) 3533 u64 orig_bytes, int flush)
3398{ 3534{
3399 struct btrfs_space_info *space_info = block_rsv->space_info; 3535 struct btrfs_space_info *space_info = block_rsv->space_info;
3400 u64 unused; 3536 u64 used;
3401 u64 num_bytes = orig_bytes; 3537 u64 num_bytes = orig_bytes;
3402 int retries = 0; 3538 int retries = 0;
3403 int ret = 0; 3539 int ret = 0;
3404 bool committed = false; 3540 bool committed = false;
3405 bool flushing = false; 3541 bool flushing = false;
3542 bool wait_ordered = false;
3406 3543
3407again: 3544again:
3408 ret = 0; 3545 ret = 0;
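
may_commit_transaction() added above commits only when it can argue that the commit will actually free enough space. A stand-alone restatement of its decision order, assuming simplified fields in place of the space_info, delayed_rsv and current->journal_info checks:

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

struct commit_ctx {
	bool already_in_transaction;	/* current->journal_info != NULL */
	uint64_t bytes_pinned;		/* space_info->bytes_pinned */
	bool same_space_as_delayed_rsv;	/* space_info == delayed_rsv->space_info */
	uint64_t delayed_rsv_size;	/* delayed_rsv->size */
};

static int may_commit_model(const struct commit_ctx *c, uint64_t bytes, int force)
{
	if (c->already_in_transaction)
		return -EAGAIN;		/* can't commit from inside ourselves */
	if (force)
		goto commit;
	if (c->bytes_pinned >= bytes)
		goto commit;		/* the commit will unpin enough space */
	if (!c->same_space_as_delayed_rsv || c->delayed_rsv_size < bytes)
		return -ENOSPC;		/* committing would not help */
commit:
	return 0;			/* kernel: join and commit the transaction */
}
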
@@ -3419,7 +3556,7 @@ again:
3419 * deadlock since we are waiting for the flusher to finish, but 3556 * deadlock since we are waiting for the flusher to finish, but
3420 * hold the current transaction open. 3557 * hold the current transaction open.
3421 */ 3558 */
3422 if (trans) 3559 if (current->journal_info)
3423 return -EAGAIN; 3560 return -EAGAIN;
3424 ret = wait_event_interruptible(space_info->wait, 3561 ret = wait_event_interruptible(space_info->wait,
3425 !space_info->flush); 3562 !space_info->flush);
@@ -3431,9 +3568,9 @@ again:
3431 } 3568 }
3432 3569
3433 ret = -ENOSPC; 3570 ret = -ENOSPC;
3434 unused = space_info->bytes_used + space_info->bytes_reserved + 3571 used = space_info->bytes_used + space_info->bytes_reserved +
3435 space_info->bytes_pinned + space_info->bytes_readonly + 3572 space_info->bytes_pinned + space_info->bytes_readonly +
3436 space_info->bytes_may_use; 3573 space_info->bytes_may_use;
3437 3574
3438 /* 3575 /*
3439 * The idea here is that we've not already over-reserved the block group 3576 * The idea here is that we've not already over-reserved the block group
@@ -3442,10 +3579,9 @@ again:
3442 * lets start flushing stuff first and then come back and try to make 3579 * lets start flushing stuff first and then come back and try to make
3443 * our reservation. 3580 * our reservation.
3444 */ 3581 */
3445 if (unused <= space_info->total_bytes) { 3582 if (used <= space_info->total_bytes) {
3446 unused = space_info->total_bytes - unused; 3583 if (used + orig_bytes <= space_info->total_bytes) {
3447 if (unused >= num_bytes) { 3584 space_info->bytes_may_use += orig_bytes;
3448 space_info->bytes_reserved += orig_bytes;
3449 ret = 0; 3585 ret = 0;
3450 } else { 3586 } else {
3451 /* 3587 /*
@@ -3461,10 +3597,64 @@ again:
3461 * amount plus the amount of bytes that we need for this 3597 * amount plus the amount of bytes that we need for this
3462 * reservation. 3598 * reservation.
3463 */ 3599 */
3464 num_bytes = unused - space_info->total_bytes + 3600 wait_ordered = true;
3601 num_bytes = used - space_info->total_bytes +
3465 (orig_bytes * (retries + 1)); 3602 (orig_bytes * (retries + 1));
3466 } 3603 }
3467 3604
3605 if (ret) {
3606 u64 profile = btrfs_get_alloc_profile(root, 0);
3607 u64 avail;
3608
3609 /*
3610 * If we have a lot of space that's pinned, don't bother doing
3611 * the overcommit dance yet and just commit the transaction.
3612 */
3613 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3614 do_div(avail, 10);
3615 if (space_info->bytes_pinned >= avail && flush && !committed) {
3616 space_info->flush = 1;
3617 flushing = true;
3618 spin_unlock(&space_info->lock);
3619 ret = may_commit_transaction(root, space_info,
3620 orig_bytes, 1);
3621 if (ret)
3622 goto out;
3623 committed = true;
3624 goto again;
3625 }
3626
3627 spin_lock(&root->fs_info->free_chunk_lock);
3628 avail = root->fs_info->free_chunk_space;
3629
3630 /*
3631 * If we have dup, raid1 or raid10 then only half of the free
3632 * space is actually useable.
3633 */
3634 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3635 BTRFS_BLOCK_GROUP_RAID1 |
3636 BTRFS_BLOCK_GROUP_RAID10))
3637 avail >>= 1;
3638
3639 /*
3640 * If we aren't flushing don't let us overcommit too much, say
3641 * 1/8th of the space. If we can flush, let it overcommit up to
3642 * 1/2 of the space.
3643 */
3644 if (flush)
3645 avail >>= 3;
3646 else
3647 avail >>= 1;
3648 spin_unlock(&root->fs_info->free_chunk_lock);
3649
3650 if (used + num_bytes < space_info->total_bytes + avail) {
3651 space_info->bytes_may_use += orig_bytes;
3652 ret = 0;
3653 } else {
3654 wait_ordered = true;
3655 }
3656 }
3657
3468 /* 3658 /*
3469 * Couldn't make our reservation, save our place so while we're trying 3659 * Couldn't make our reservation, save our place so while we're trying
3470 * to reclaim space we can actually use it instead of somebody else 3660 * to reclaim space we can actually use it instead of somebody else
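
reserve_metadata_bytes() may now overcommit against still-unallocated chunk space rather than failing immediately. A stand-alone restatement of the window computed above, where mirrored_profile stands in for the DUP/RAID1/RAID10 test and the shifts mirror the flush/no-flush split in the hunk:

#include <stdbool.h>
#include <stdint.h>

static bool can_overcommit_model(uint64_t used, uint64_t num_bytes,
				 uint64_t total_bytes,
				 uint64_t free_chunk_space,
				 bool mirrored_profile, bool flush)
{
	uint64_t avail = free_chunk_space;

	if (mirrored_profile)	/* DUP/RAID1/RAID10: only half is usable */
		avail >>= 1;
	if (flush)		/* shift amounts as in the hunk above */
		avail >>= 3;
	else
		avail >>= 1;

	return used + num_bytes < total_bytes + avail;
}
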
@@ -3484,7 +3674,7 @@ again:
3484 * We do synchronous shrinking since we don't actually unreserve 3674 * We do synchronous shrinking since we don't actually unreserve
3485 * metadata until after the IO is completed. 3675 * metadata until after the IO is completed.
3486 */ 3676 */
3487 ret = shrink_delalloc(trans, root, num_bytes, 1); 3677 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3488 if (ret < 0) 3678 if (ret < 0)
3489 goto out; 3679 goto out;
3490 3680
@@ -3496,35 +3686,17 @@ again:
3496 * so go back around and try again. 3686 * so go back around and try again.
3497 */ 3687 */
3498 if (retries < 2) { 3688 if (retries < 2) {
3689 wait_ordered = true;
3499 retries++; 3690 retries++;
3500 goto again; 3691 goto again;
3501 } 3692 }
3502 3693
3503 /*
3504 * Not enough space to be reclaimed, don't bother committing the
3505 * transaction.
3506 */
3507 spin_lock(&space_info->lock);
3508 if (space_info->bytes_pinned < orig_bytes)
3509 ret = -ENOSPC;
3510 spin_unlock(&space_info->lock);
3511 if (ret)
3512 goto out;
3513
3514 ret = -EAGAIN;
3515 if (trans)
3516 goto out;
3517
3518 ret = -ENOSPC; 3694 ret = -ENOSPC;
3519 if (committed) 3695 if (committed)
3520 goto out; 3696 goto out;
3521 3697
3522 trans = btrfs_join_transaction(root); 3698 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3523 if (IS_ERR(trans))
3524 goto out;
3525 ret = btrfs_commit_transaction(trans, root);
3526 if (!ret) { 3699 if (!ret) {
3527 trans = NULL;
3528 committed = true; 3700 committed = true;
3529 goto again; 3701 goto again;
3530 } 3702 }
@@ -3542,10 +3714,12 @@ out:
3542static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, 3714static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3543 struct btrfs_root *root) 3715 struct btrfs_root *root)
3544{ 3716{
3545 struct btrfs_block_rsv *block_rsv; 3717 struct btrfs_block_rsv *block_rsv = NULL;
3546 if (root->ref_cows) 3718
3719 if (root->ref_cows || root == root->fs_info->csum_root)
3547 block_rsv = trans->block_rsv; 3720 block_rsv = trans->block_rsv;
3548 else 3721
3722 if (!block_rsv)
3549 block_rsv = root->block_rsv; 3723 block_rsv = root->block_rsv;
3550 3724
3551 if (!block_rsv) 3725 if (!block_rsv)
@@ -3616,7 +3790,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3616 } 3790 }
3617 if (num_bytes) { 3791 if (num_bytes) {
3618 spin_lock(&space_info->lock); 3792 spin_lock(&space_info->lock);
3619 space_info->bytes_reserved -= num_bytes; 3793 space_info->bytes_may_use -= num_bytes;
3620 space_info->reservation_progress++; 3794 space_info->reservation_progress++;
3621 spin_unlock(&space_info->lock); 3795 spin_unlock(&space_info->lock);
3622 } 3796 }
@@ -3640,9 +3814,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3640{ 3814{
3641 memset(rsv, 0, sizeof(*rsv)); 3815 memset(rsv, 0, sizeof(*rsv));
3642 spin_lock_init(&rsv->lock); 3816 spin_lock_init(&rsv->lock);
3643 atomic_set(&rsv->usage, 1);
3644 rsv->priority = 6;
3645 INIT_LIST_HEAD(&rsv->list);
3646} 3817}
3647 3818
3648struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 3819struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3663,38 +3834,20 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3663void btrfs_free_block_rsv(struct btrfs_root *root, 3834void btrfs_free_block_rsv(struct btrfs_root *root,
3664 struct btrfs_block_rsv *rsv) 3835 struct btrfs_block_rsv *rsv)
3665{ 3836{
3666 if (rsv && atomic_dec_and_test(&rsv->usage)) { 3837 btrfs_block_rsv_release(root, rsv, (u64)-1);
3667 btrfs_block_rsv_release(root, rsv, (u64)-1); 3838 kfree(rsv);
3668 if (!rsv->durable)
3669 kfree(rsv);
3670 }
3671}
3672
3673/*
3674 * make the block_rsv struct be able to capture freed space.
3675 * the captured space will re-add to the the block_rsv struct
3676 * after transaction commit
3677 */
3678void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3679 struct btrfs_block_rsv *block_rsv)
3680{
3681 block_rsv->durable = 1;
3682 mutex_lock(&fs_info->durable_block_rsv_mutex);
3683 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3684 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3685} 3839}
3686 3840
3687int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3841static inline int __block_rsv_add(struct btrfs_root *root,
3688 struct btrfs_root *root, 3842 struct btrfs_block_rsv *block_rsv,
3689 struct btrfs_block_rsv *block_rsv, 3843 u64 num_bytes, int flush)
3690 u64 num_bytes)
3691{ 3844{
3692 int ret; 3845 int ret;
3693 3846
3694 if (num_bytes == 0) 3847 if (num_bytes == 0)
3695 return 0; 3848 return 0;
3696 3849
3697 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); 3850 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3698 if (!ret) { 3851 if (!ret) {
3699 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3852 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3700 return 0; 3853 return 0;
@@ -3703,55 +3856,80 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3703 return ret; 3856 return ret;
3704} 3857}
3705 3858
3706int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 3859int btrfs_block_rsv_add(struct btrfs_root *root,
3707 struct btrfs_root *root, 3860 struct btrfs_block_rsv *block_rsv,
3708 struct btrfs_block_rsv *block_rsv, 3861 u64 num_bytes)
3709 u64 min_reserved, int min_factor) 3862{
3863 return __block_rsv_add(root, block_rsv, num_bytes, 1);
3864}
3865
3866int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3867 struct btrfs_block_rsv *block_rsv,
3868 u64 num_bytes)
3869{
3870 return __block_rsv_add(root, block_rsv, num_bytes, 0);
3871}
3872
3873int btrfs_block_rsv_check(struct btrfs_root *root,
3874 struct btrfs_block_rsv *block_rsv, int min_factor)
3710{ 3875{
3711 u64 num_bytes = 0; 3876 u64 num_bytes = 0;
3712 int commit_trans = 0;
3713 int ret = -ENOSPC; 3877 int ret = -ENOSPC;
3714 3878
3715 if (!block_rsv) 3879 if (!block_rsv)
3716 return 0; 3880 return 0;
3717 3881
3718 spin_lock(&block_rsv->lock); 3882 spin_lock(&block_rsv->lock);
3719 if (min_factor > 0) 3883 num_bytes = div_factor(block_rsv->size, min_factor);
3720 num_bytes = div_factor(block_rsv->size, min_factor); 3884 if (block_rsv->reserved >= num_bytes)
3721 if (min_reserved > num_bytes) 3885 ret = 0;
3722 num_bytes = min_reserved; 3886 spin_unlock(&block_rsv->lock);
3723 3887
3724 if (block_rsv->reserved >= num_bytes) { 3888 return ret;
3889}
3890
3891static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
3892 struct btrfs_block_rsv *block_rsv,
3893 u64 min_reserved, int flush)
3894{
3895 u64 num_bytes = 0;
3896 int ret = -ENOSPC;
3897
3898 if (!block_rsv)
3899 return 0;
3900
3901 spin_lock(&block_rsv->lock);
3902 num_bytes = min_reserved;
3903 if (block_rsv->reserved >= num_bytes)
3725 ret = 0; 3904 ret = 0;
3726 } else { 3905 else
3727 num_bytes -= block_rsv->reserved; 3906 num_bytes -= block_rsv->reserved;
3728 if (block_rsv->durable &&
3729 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3730 commit_trans = 1;
3731 }
3732 spin_unlock(&block_rsv->lock); 3907 spin_unlock(&block_rsv->lock);
3908
3733 if (!ret) 3909 if (!ret)
3734 return 0; 3910 return 0;
3735 3911
3736 if (block_rsv->refill_used) { 3912 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3737 ret = reserve_metadata_bytes(trans, root, block_rsv, 3913 if (!ret) {
3738 num_bytes, 0); 3914 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3739 if (!ret) {
3740 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3741 return 0;
3742 }
3743 }
3744
3745 if (commit_trans) {
3746 if (trans)
3747 return -EAGAIN;
3748 trans = btrfs_join_transaction(root);
3749 BUG_ON(IS_ERR(trans));
3750 ret = btrfs_commit_transaction(trans, root);
3751 return 0; 3915 return 0;
3752 } 3916 }
3753 3917
3754 return -ENOSPC; 3918 return ret;
3919}
3920
3921int btrfs_block_rsv_refill(struct btrfs_root *root,
3922 struct btrfs_block_rsv *block_rsv,
3923 u64 min_reserved)
3924{
3925 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
3926}
3927
3928int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
3929 struct btrfs_block_rsv *block_rsv,
3930 u64 min_reserved)
3931{
3932 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
3755} 3933}
3756 3934
3757int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3935int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
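
btrfs_block_rsv_check() is reduced to a pure size-factor check while the top-up work moves into __btrfs_block_rsv_refill(), again with flush and noflush flavours. A tiny stand-alone model of the refill step, with a function pointer standing in for reserve_metadata_bytes(), which is not reproduced here:

#include <stdint.h>

/* reserved: what the rsv currently holds; min_reserved: what the caller needs */
static int block_rsv_refill_model(uint64_t reserved, uint64_t min_reserved,
				  int flush,
				  int (*reserve_more)(uint64_t bytes, int flush))
{
	if (reserved >= min_reserved)
		return 0;			/* already topped up */
	/* only ask for the shortfall, optionally allowing flushing */
	return reserve_more(min_reserved - reserved, flush);
}
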
@@ -3783,7 +3961,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3783 u64 num_bytes; 3961 u64 num_bytes;
3784 u64 meta_used; 3962 u64 meta_used;
3785 u64 data_used; 3963 u64 data_used;
3786 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3964 int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3787 3965
3788 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3966 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3789 spin_lock(&sinfo->lock); 3967 spin_lock(&sinfo->lock);
@@ -3827,12 +4005,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3827 if (sinfo->total_bytes > num_bytes) { 4005 if (sinfo->total_bytes > num_bytes) {
3828 num_bytes = sinfo->total_bytes - num_bytes; 4006 num_bytes = sinfo->total_bytes - num_bytes;
3829 block_rsv->reserved += num_bytes; 4007 block_rsv->reserved += num_bytes;
3830 sinfo->bytes_reserved += num_bytes; 4008 sinfo->bytes_may_use += num_bytes;
3831 } 4009 }
3832 4010
3833 if (block_rsv->reserved >= block_rsv->size) { 4011 if (block_rsv->reserved >= block_rsv->size) {
3834 num_bytes = block_rsv->reserved - block_rsv->size; 4012 num_bytes = block_rsv->reserved - block_rsv->size;
3835 sinfo->bytes_reserved -= num_bytes; 4013 sinfo->bytes_may_use -= num_bytes;
3836 sinfo->reservation_progress++; 4014 sinfo->reservation_progress++;
3837 block_rsv->reserved = block_rsv->size; 4015 block_rsv->reserved = block_rsv->size;
3838 block_rsv->full = 1; 4016 block_rsv->full = 1;
@@ -3848,16 +4026,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3848 4026
3849 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4027 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3850 fs_info->chunk_block_rsv.space_info = space_info; 4028 fs_info->chunk_block_rsv.space_info = space_info;
3851 fs_info->chunk_block_rsv.priority = 10;
3852 4029
3853 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4030 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3854 fs_info->global_block_rsv.space_info = space_info; 4031 fs_info->global_block_rsv.space_info = space_info;
3855 fs_info->global_block_rsv.priority = 10;
3856 fs_info->global_block_rsv.refill_used = 1;
3857 fs_info->delalloc_block_rsv.space_info = space_info; 4032 fs_info->delalloc_block_rsv.space_info = space_info;
3858 fs_info->trans_block_rsv.space_info = space_info; 4033 fs_info->trans_block_rsv.space_info = space_info;
3859 fs_info->empty_block_rsv.space_info = space_info; 4034 fs_info->empty_block_rsv.space_info = space_info;
3860 fs_info->empty_block_rsv.priority = 10; 4035 fs_info->delayed_block_rsv.space_info = space_info;
3861 4036
3862 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4037 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3863 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4038 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3865,10 +4040,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3865 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 4040 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3866 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4041 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3867 4042
3868 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3869
3870 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3871
3872 update_global_block_rsv(fs_info); 4043 update_global_block_rsv(fs_info);
3873} 4044}
3874 4045
@@ -3881,37 +4052,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3881 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4052 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3882 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4053 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3883 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4054 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3884} 4055 WARN_ON(fs_info->delayed_block_rsv.size > 0);
3885 4056 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
3886int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3887 struct btrfs_root *root,
3888 struct btrfs_block_rsv *rsv)
3889{
3890 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3891 u64 num_bytes;
3892 int ret;
3893
3894 /*
3895 * Truncate should be freeing data, but give us 2 items just in case it
3896 * needs to use some space. We may want to be smarter about this in the
3897 * future.
3898 */
3899 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3900
3901 /* We already have enough bytes, just return */
3902 if (rsv->reserved >= num_bytes)
3903 return 0;
3904
3905 num_bytes -= rsv->reserved;
3906
3907 /*
3908 * You should have reserved enough space before hand to do this, so this
3909 * should not fail.
3910 */
3911 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3912 BUG_ON(ret);
3913
3914 return 0;
3915} 4057}
3916 4058
3917void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4059void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3920,9 +4062,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3920 if (!trans->bytes_reserved) 4062 if (!trans->bytes_reserved)
3921 return; 4063 return;
3922 4064
3923 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); 4065 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
3924 btrfs_block_rsv_release(root, trans->block_rsv,
3925 trans->bytes_reserved);
3926 trans->bytes_reserved = 0; 4066 trans->bytes_reserved = 0;
3927} 4067}
3928 4068
@@ -3964,33 +4104,99 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3964 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4104 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3965} 4105}
3966 4106
4107/**
4108 * drop_outstanding_extent - drop an outstanding extent
4109 * @inode: the inode we're dropping the extent for
4110 *
4111 * This is called when we are freeing up an outstanding extent, either called
4112 * after an error or after an extent is written. This will return the number of
4113 * reserved extents that need to be freed. This must be called with
4114 * BTRFS_I(inode)->lock held.
4115 */
3967static unsigned drop_outstanding_extent(struct inode *inode) 4116static unsigned drop_outstanding_extent(struct inode *inode)
3968{ 4117{
4118 unsigned drop_inode_space = 0;
3969 unsigned dropped_extents = 0; 4119 unsigned dropped_extents = 0;
3970 4120
3971 spin_lock(&BTRFS_I(inode)->lock);
3972 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4121 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3973 BTRFS_I(inode)->outstanding_extents--; 4122 BTRFS_I(inode)->outstanding_extents--;
3974 4123
4124 if (BTRFS_I(inode)->outstanding_extents == 0 &&
4125 BTRFS_I(inode)->delalloc_meta_reserved) {
4126 drop_inode_space = 1;
4127 BTRFS_I(inode)->delalloc_meta_reserved = 0;
4128 }
4129
3975 /* 4130 /*
3976 * If we have more or the same amount of outstanding extents than we have 4131 * If we have more or the same amount of outstanding extents than we have
3977 * reserved then we need to leave the reserved extents count alone. 4132 * reserved then we need to leave the reserved extents count alone.
3978 */ 4133 */
3979 if (BTRFS_I(inode)->outstanding_extents >= 4134 if (BTRFS_I(inode)->outstanding_extents >=
3980 BTRFS_I(inode)->reserved_extents) 4135 BTRFS_I(inode)->reserved_extents)
3981 goto out; 4136 return drop_inode_space;
3982 4137
3983 dropped_extents = BTRFS_I(inode)->reserved_extents - 4138 dropped_extents = BTRFS_I(inode)->reserved_extents -
3984 BTRFS_I(inode)->outstanding_extents; 4139 BTRFS_I(inode)->outstanding_extents;
3985 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4140 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3986out: 4141 return dropped_extents + drop_inode_space;
3987 spin_unlock(&BTRFS_I(inode)->lock);
3988 return dropped_extents;
3989} 4142}
3990 4143
3991static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 4144/**
4145 * calc_csum_metadata_size - return the amount of metadata space that must be
4146 * reserved/free'd for the given bytes.
4147 * @inode: the inode we're manipulating
4148 * @num_bytes: the number of bytes in question
4149 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4150 *
4151 * This adjusts the number of csum_bytes in the inode and then returns the
4152 * correct amount of metadata that must either be reserved or freed. We
4153 * calculate how many checksums we can fit into one leaf and then divide the
4154 * number of bytes that will need to be checksummed by this value to figure out
4155 * how many checksums will be required. If we are adding bytes then the number
4156 * may go up and we will return the number of additional bytes that must be
4157 * reserved. If it is going down we will return the number of bytes that must
4158 * be freed.
4159 *
4160 * This must be called with BTRFS_I(inode)->lock held.
4161 */
4162static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4163 int reserve)
3992{ 4164{
3993 return num_bytes >>= 3; 4165 struct btrfs_root *root = BTRFS_I(inode)->root;
4166 u64 csum_size;
4167 int num_csums_per_leaf;
4168 int num_csums;
4169 int old_csums;
4170
4171 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4172 BTRFS_I(inode)->csum_bytes == 0)
4173 return 0;
4174
4175 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4176 if (reserve)
4177 BTRFS_I(inode)->csum_bytes += num_bytes;
4178 else
4179 BTRFS_I(inode)->csum_bytes -= num_bytes;
4180 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4181 num_csums_per_leaf = (int)div64_u64(csum_size,
4182 sizeof(struct btrfs_csum_item) +
4183 sizeof(struct btrfs_disk_key));
4184 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4185 num_csums = num_csums + num_csums_per_leaf - 1;
4186 num_csums = num_csums / num_csums_per_leaf;
4187
4188 old_csums = old_csums + num_csums_per_leaf - 1;
4189 old_csums = old_csums / num_csums_per_leaf;
4190
4191 /* No change, no need to reserve more */
4192 if (old_csums == num_csums)
4193 return 0;
4194
4195 if (reserve)
4196 return btrfs_calc_trans_metadata_size(root,
4197 num_csums - old_csums);
4198
4199 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
3994} 4200}
3995 4201
3996int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4202int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
@@ -3999,9 +4205,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3999 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4205 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4000 u64 to_reserve = 0; 4206 u64 to_reserve = 0;
4001 unsigned nr_extents = 0; 4207 unsigned nr_extents = 0;
4208 int flush = 1;
4002 int ret; 4209 int ret;
4003 4210
4004 if (btrfs_transaction_in_commit(root->fs_info)) 4211 if (btrfs_is_free_space_inode(root, inode))
4212 flush = 0;
4213
4214 if (flush && btrfs_transaction_in_commit(root->fs_info))
4005 schedule_timeout(1); 4215 schedule_timeout(1);
4006 4216
4007 num_bytes = ALIGN(num_bytes, root->sectorsize); 4217 num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4014,21 +4224,41 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4014 nr_extents = BTRFS_I(inode)->outstanding_extents - 4224 nr_extents = BTRFS_I(inode)->outstanding_extents -
4015 BTRFS_I(inode)->reserved_extents; 4225 BTRFS_I(inode)->reserved_extents;
4016 BTRFS_I(inode)->reserved_extents += nr_extents; 4226 BTRFS_I(inode)->reserved_extents += nr_extents;
4227 }
4017 4228
4018 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4229 /*
4230 * Add an item to reserve for updating the inode when we complete the
4231 * delalloc io.
4232 */
4233 if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4234 nr_extents++;
4235 BTRFS_I(inode)->delalloc_meta_reserved = 1;
4019 } 4236 }
4237
4238 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4239 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4020 spin_unlock(&BTRFS_I(inode)->lock); 4240 spin_unlock(&BTRFS_I(inode)->lock);
4021 4241
4022 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4242 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4023 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4024 if (ret) { 4243 if (ret) {
4244 u64 to_free = 0;
4025 unsigned dropped; 4245 unsigned dropped;
4246
4247 spin_lock(&BTRFS_I(inode)->lock);
4248 dropped = drop_outstanding_extent(inode);
4249 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4250 spin_unlock(&BTRFS_I(inode)->lock);
4251 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4252
4026 /* 4253 /*
4027 * We don't need the return value since our reservation failed, 4254 * Somebody could have come in and twiddled with the
4028 * we just need to clean up our counter. 4255 * reservation, so if we have to free more than we would have
4256 * reserved from this reservation go ahead and release those
4257 * bytes.
4029 */ 4258 */
4030 dropped = drop_outstanding_extent(inode); 4259 to_free -= to_reserve;
4031 WARN_ON(dropped > 1); 4260 if (to_free)
4261 btrfs_block_rsv_release(root, block_rsv, to_free);
4032 return ret; 4262 return ret;
4033 } 4263 }
4034 4264
@@ -4037,6 +4267,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4037 return 0; 4267 return 0;
4038} 4268}
4039 4269
4270/**
4271 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4272 * @inode: the inode to release the reservation for
4273 * @num_bytes: the number of bytes we're releasing
4274 *
4275 * This will release the metadata reservation for an inode. This can be called
4276 * once we complete IO for a given set of bytes to release their metadata
4277 * reservations.
4278 */
4040void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4279void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4041{ 4280{
4042 struct btrfs_root *root = BTRFS_I(inode)->root; 4281 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4044,9 +4283,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4044 unsigned dropped; 4283 unsigned dropped;
4045 4284
4046 num_bytes = ALIGN(num_bytes, root->sectorsize); 4285 num_bytes = ALIGN(num_bytes, root->sectorsize);
4286 spin_lock(&BTRFS_I(inode)->lock);
4047 dropped = drop_outstanding_extent(inode); 4287 dropped = drop_outstanding_extent(inode);
4048 4288
4049 to_free = calc_csum_metadata_size(inode, num_bytes); 4289 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4290 spin_unlock(&BTRFS_I(inode)->lock);
4050 if (dropped > 0) 4291 if (dropped > 0)
4051 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4292 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4052 4293
@@ -4054,6 +4295,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4054 to_free); 4295 to_free);
4055} 4296}
4056 4297
4298/**
4299 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4300 * @inode: inode we're writing to
4301 * @num_bytes: the number of bytes we want to allocate
4302 *
4303 * This will do the following things
4304 *
4305 * o reserve space in the data space info for num_bytes
4306 * o reserve space in the metadata space info based on number of outstanding
4307 * extents and how much csums will be needed
4308 * o add to the inodes ->delalloc_bytes
4309 * o add it to the fs_info's delalloc inodes list.
4310 *
4311 * This will return 0 for success and -ENOSPC if there is no space left.
4312 */
4057int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4313int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4058{ 4314{
4059 int ret; 4315 int ret;
@@ -4071,6 +4327,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4071 return 0; 4327 return 0;
4072} 4328}
4073 4329
4330/**
4331 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4332 * @inode: inode we're releasing space for
4333 * @num_bytes: the number of bytes we want to free up
4334 *
4335 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
4336 * called in the case that we don't need the metadata AND data reservations
4337 * anymore, for example if there is an error or we insert an inline extent.
4338 *
4339 * This function will release the metadata space that was not used and will
4340 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4341 * list if there are no delalloc bytes left.
4342 */
4074void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 4343void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4075{ 4344{
4076 btrfs_delalloc_release_metadata(inode, num_bytes); 4345 btrfs_delalloc_release_metadata(inode, num_bytes);
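
The reserve/release pair documented above is meant to bracket a delalloc write. A minimal stand-alone model of that caller shape, with function pointers standing in for btrfs_delalloc_reserve_space(), btrfs_delalloc_release_space() and the page-dirtying work:

#include <stdint.h>

static int write_range_model(uint64_t len,
			     int (*reserve_space)(uint64_t),
			     void (*release_space)(uint64_t),
			     int (*copy_and_dirty)(uint64_t))
{
	int ret = reserve_space(len);		/* data + metadata reservation */
	if (ret)
		return ret;			/* nothing reserved, nothing to undo */

	ret = copy_and_dirty(len);
	if (ret)
		release_space(len);		/* undo both halves on error */
	return ret;
}
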
@@ -4090,12 +4359,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4090 4359
4091 /* block accounting for super block */ 4360 /* block accounting for super block */
4092 spin_lock(&info->delalloc_lock); 4361 spin_lock(&info->delalloc_lock);
4093 old_val = btrfs_super_bytes_used(&info->super_copy); 4362 old_val = btrfs_super_bytes_used(info->super_copy);
4094 if (alloc) 4363 if (alloc)
4095 old_val += num_bytes; 4364 old_val += num_bytes;
4096 else 4365 else
4097 old_val -= num_bytes; 4366 old_val -= num_bytes;
4098 btrfs_set_super_bytes_used(&info->super_copy, old_val); 4367 btrfs_set_super_bytes_used(info->super_copy, old_val);
4099 spin_unlock(&info->delalloc_lock); 4368 spin_unlock(&info->delalloc_lock);
4100 4369
4101 while (total) { 4370 while (total) {
@@ -4123,7 +4392,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4123 spin_lock(&cache->space_info->lock); 4392 spin_lock(&cache->space_info->lock);
4124 spin_lock(&cache->lock); 4393 spin_lock(&cache->lock);
4125 4394
4126 if (btrfs_super_cache_generation(&info->super_copy) != 0 && 4395 if (btrfs_test_opt(root, SPACE_CACHE) &&
4127 cache->disk_cache_state < BTRFS_DC_CLEAR) 4396 cache->disk_cache_state < BTRFS_DC_CLEAR)
4128 cache->disk_cache_state = BTRFS_DC_CLEAR; 4397 cache->disk_cache_state = BTRFS_DC_CLEAR;
4129 4398
@@ -4135,7 +4404,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4135 btrfs_set_block_group_used(&cache->item, old_val); 4404 btrfs_set_block_group_used(&cache->item, old_val);
4136 cache->reserved -= num_bytes; 4405 cache->reserved -= num_bytes;
4137 cache->space_info->bytes_reserved -= num_bytes; 4406 cache->space_info->bytes_reserved -= num_bytes;
4138 cache->space_info->reservation_progress++;
4139 cache->space_info->bytes_used += num_bytes; 4407 cache->space_info->bytes_used += num_bytes;
4140 cache->space_info->disk_used += num_bytes * factor; 4408 cache->space_info->disk_used += num_bytes * factor;
4141 spin_unlock(&cache->lock); 4409 spin_unlock(&cache->lock);
@@ -4187,7 +4455,6 @@ static int pin_down_extent(struct btrfs_root *root,
4187 if (reserved) { 4455 if (reserved) {
4188 cache->reserved -= num_bytes; 4456 cache->reserved -= num_bytes;
4189 cache->space_info->bytes_reserved -= num_bytes; 4457 cache->space_info->bytes_reserved -= num_bytes;
4190 cache->space_info->reservation_progress++;
4191 } 4458 }
4192 spin_unlock(&cache->lock); 4459 spin_unlock(&cache->lock);
4193 spin_unlock(&cache->space_info->lock); 4460 spin_unlock(&cache->space_info->lock);
@@ -4215,45 +4482,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
4215} 4482}
4216 4483
4217/* 4484/*
4218 * update size of reserved extents. this function may return -EAGAIN 4485 * this function must be called within transaction
4219 * if 'reserve' is true or 'sinfo' is false. 4486 */
4487int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4488 struct btrfs_root *root,
4489 u64 bytenr, u64 num_bytes)
4490{
4491 struct btrfs_block_group_cache *cache;
4492
4493 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4494 BUG_ON(!cache);
4495
4496 /*
4497 * pull in the free space cache (if any) so that our pin
4498 * removes the free space from the cache. We have load_only set
4499 * to one because the slow code to read in the free extents does check
4500 * the pinned extents.
4501 */
4502 cache_block_group(cache, trans, root, 1);
4503
4504 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4505
4506 /* remove us from the free space cache (if we're there at all) */
4507 btrfs_remove_free_space(cache, bytenr, num_bytes);
4508 btrfs_put_block_group(cache);
4509 return 0;
4510}
4511
4512/**
4513 * btrfs_update_reserved_bytes - update the block_group and space info counters
4514 * @cache: The cache we are manipulating
4515 * @num_bytes: The number of bytes in question
4516 * @reserve: One of the reservation enums
4517 *
4518 * This is called by the allocator when it reserves space, or by somebody who is
4519 * freeing space that was never actually used on disk. For example if you
4520 * reserve some space for a new leaf in transaction A and before transaction A
4521 * commits you free that leaf, you call this with reserve set to 0 in order to
4522 * clear the reservation.
4523 *
4524 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4525 * ENOSPC accounting. For data we handle the reservation through clearing the
4526 * delalloc bits in the io_tree. We have to do this since we could end up
4527 * allocating less disk space for the amount of data we have reserved in the
4528 * case of compression.
4529 *
4530 * If this is a reservation and the block group has become read only we cannot
4531 * make the reservation and return -EAGAIN, otherwise this function always
4532 * succeeds.
4220 */ 4533 */
4221int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 4534static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4222 u64 num_bytes, int reserve, int sinfo) 4535 u64 num_bytes, int reserve)
4223{ 4536{
4537 struct btrfs_space_info *space_info = cache->space_info;
4224 int ret = 0; 4538 int ret = 0;
4225 if (sinfo) { 4539 spin_lock(&space_info->lock);
4226 struct btrfs_space_info *space_info = cache->space_info; 4540 spin_lock(&cache->lock);
4227 spin_lock(&space_info->lock); 4541 if (reserve != RESERVE_FREE) {
4228 spin_lock(&cache->lock);
4229 if (reserve) {
4230 if (cache->ro) {
4231 ret = -EAGAIN;
4232 } else {
4233 cache->reserved += num_bytes;
4234 space_info->bytes_reserved += num_bytes;
4235 }
4236 } else {
4237 if (cache->ro)
4238 space_info->bytes_readonly += num_bytes;
4239 cache->reserved -= num_bytes;
4240 space_info->bytes_reserved -= num_bytes;
4241 space_info->reservation_progress++;
4242 }
4243 spin_unlock(&cache->lock);
4244 spin_unlock(&space_info->lock);
4245 } else {
4246 spin_lock(&cache->lock);
4247 if (cache->ro) { 4542 if (cache->ro) {
4248 ret = -EAGAIN; 4543 ret = -EAGAIN;
4249 } else { 4544 } else {
4250 if (reserve) 4545 cache->reserved += num_bytes;
4251 cache->reserved += num_bytes; 4546 space_info->bytes_reserved += num_bytes;
4252 else 4547 if (reserve == RESERVE_ALLOC) {
4253 cache->reserved -= num_bytes; 4548 BUG_ON(space_info->bytes_may_use < num_bytes);
4549 space_info->bytes_may_use -= num_bytes;
4550 }
4254 } 4551 }
4255 spin_unlock(&cache->lock); 4552 } else {
4553 if (cache->ro)
4554 space_info->bytes_readonly += num_bytes;
4555 cache->reserved -= num_bytes;
4556 space_info->bytes_reserved -= num_bytes;
4557 space_info->reservation_progress++;
4256 } 4558 }
4559 spin_unlock(&cache->lock);
4560 spin_unlock(&space_info->lock);
4257 return ret; 4561 return ret;
4258} 4562}
4259 4563
@@ -4319,13 +4623,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4319 spin_lock(&cache->lock); 4623 spin_lock(&cache->lock);
4320 cache->pinned -= len; 4624 cache->pinned -= len;
4321 cache->space_info->bytes_pinned -= len; 4625 cache->space_info->bytes_pinned -= len;
4322 if (cache->ro) { 4626 if (cache->ro)
4323 cache->space_info->bytes_readonly += len; 4627 cache->space_info->bytes_readonly += len;
4324 } else if (cache->reserved_pinned > 0) {
4325 len = min(len, cache->reserved_pinned);
4326 cache->reserved_pinned -= len;
4327 cache->space_info->bytes_reserved += len;
4328 }
4329 spin_unlock(&cache->lock); 4628 spin_unlock(&cache->lock);
4330 spin_unlock(&cache->space_info->lock); 4629 spin_unlock(&cache->space_info->lock);
4331 } 4630 }
@@ -4340,11 +4639,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4340{ 4639{
4341 struct btrfs_fs_info *fs_info = root->fs_info; 4640 struct btrfs_fs_info *fs_info = root->fs_info;
4342 struct extent_io_tree *unpin; 4641 struct extent_io_tree *unpin;
4343 struct btrfs_block_rsv *block_rsv;
4344 struct btrfs_block_rsv *next_rsv;
4345 u64 start; 4642 u64 start;
4346 u64 end; 4643 u64 end;
4347 int idx;
4348 int ret; 4644 int ret;
4349 4645
4350 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4646 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4367,30 +4663,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4367 cond_resched(); 4663 cond_resched();
4368 } 4664 }
4369 4665
4370 mutex_lock(&fs_info->durable_block_rsv_mutex);
4371 list_for_each_entry_safe(block_rsv, next_rsv,
4372 &fs_info->durable_block_rsv_list, list) {
4373
4374 idx = trans->transid & 0x1;
4375 if (block_rsv->freed[idx] > 0) {
4376 block_rsv_add_bytes(block_rsv,
4377 block_rsv->freed[idx], 0);
4378 block_rsv->freed[idx] = 0;
4379 }
4380 if (atomic_read(&block_rsv->usage) == 0) {
4381 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4382
4383 if (block_rsv->freed[0] == 0 &&
4384 block_rsv->freed[1] == 0) {
4385 list_del_init(&block_rsv->list);
4386 kfree(block_rsv);
4387 }
4388 } else {
4389 btrfs_block_rsv_release(root, block_rsv, 0);
4390 }
4391 }
4392 mutex_unlock(&fs_info->durable_block_rsv_mutex);
4393
4394 return 0; 4666 return 0;
4395} 4667}
4396 4668
@@ -4668,7 +4940,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4668 struct extent_buffer *buf, 4940 struct extent_buffer *buf,
4669 u64 parent, int last_ref) 4941 u64 parent, int last_ref)
4670{ 4942{
4671 struct btrfs_block_rsv *block_rsv;
4672 struct btrfs_block_group_cache *cache = NULL; 4943 struct btrfs_block_group_cache *cache = NULL;
4673 int ret; 4944 int ret;
4674 4945
@@ -4683,64 +4954,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4683 if (!last_ref) 4954 if (!last_ref)
4684 return; 4955 return;
4685 4956
4686 block_rsv = get_block_rsv(trans, root);
4687 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 4957 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4688 if (block_rsv->space_info != cache->space_info)
4689 goto out;
4690 4958
4691 if (btrfs_header_generation(buf) == trans->transid) { 4959 if (btrfs_header_generation(buf) == trans->transid) {
4692 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4960 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4693 ret = check_ref_cleanup(trans, root, buf->start); 4961 ret = check_ref_cleanup(trans, root, buf->start);
4694 if (!ret) 4962 if (!ret)
4695 goto pin; 4963 goto out;
4696 } 4964 }
4697 4965
4698 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 4966 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4699 pin_down_extent(root, cache, buf->start, buf->len, 1); 4967 pin_down_extent(root, cache, buf->start, buf->len, 1);
4700 goto pin; 4968 goto out;
4701 } 4969 }
4702 4970
4703 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4971 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4704 4972
4705 btrfs_add_free_space(cache, buf->start, buf->len); 4973 btrfs_add_free_space(cache, buf->start, buf->len);
4706 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); 4974 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4707 if (ret == -EAGAIN) {
4708 /* block group became read-only */
4709 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4710 goto out;
4711 }
4712
4713 ret = 1;
4714 spin_lock(&block_rsv->lock);
4715 if (block_rsv->reserved < block_rsv->size) {
4716 block_rsv->reserved += buf->len;
4717 ret = 0;
4718 }
4719 spin_unlock(&block_rsv->lock);
4720
4721 if (ret) {
4722 spin_lock(&cache->space_info->lock);
4723 cache->space_info->bytes_reserved -= buf->len;
4724 cache->space_info->reservation_progress++;
4725 spin_unlock(&cache->space_info->lock);
4726 }
4727 goto out;
4728 }
4729pin:
4730 if (block_rsv->durable && !cache->ro) {
4731 ret = 0;
4732 spin_lock(&cache->lock);
4733 if (!cache->ro) {
4734 cache->reserved_pinned += buf->len;
4735 ret = 1;
4736 }
4737 spin_unlock(&cache->lock);
4738
4739 if (ret) {
4740 spin_lock(&block_rsv->lock);
4741 block_rsv->freed[trans->transid & 0x1] += buf->len;
4742 spin_unlock(&block_rsv->lock);
4743 }
4744 } 4975 }
4745out: 4976out:
4746 /* 4977 /*
@@ -4876,17 +5107,20 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4876 struct btrfs_root *root = orig_root->fs_info->extent_root; 5107 struct btrfs_root *root = orig_root->fs_info->extent_root;
4877 struct btrfs_free_cluster *last_ptr = NULL; 5108 struct btrfs_free_cluster *last_ptr = NULL;
4878 struct btrfs_block_group_cache *block_group = NULL; 5109 struct btrfs_block_group_cache *block_group = NULL;
5110 struct btrfs_block_group_cache *used_block_group;
4879 int empty_cluster = 2 * 1024 * 1024; 5111 int empty_cluster = 2 * 1024 * 1024;
4880 int allowed_chunk_alloc = 0; 5112 int allowed_chunk_alloc = 0;
4881 int done_chunk_alloc = 0; 5113 int done_chunk_alloc = 0;
4882 struct btrfs_space_info *space_info; 5114 struct btrfs_space_info *space_info;
4883 int last_ptr_loop = 0;
4884 int loop = 0; 5115 int loop = 0;
4885 int index = 0; 5116 int index = 0;
5117 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5118 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
4886 bool found_uncached_bg = false; 5119 bool found_uncached_bg = false;
4887 bool failed_cluster_refill = false; 5120 bool failed_cluster_refill = false;
4888 bool failed_alloc = false; 5121 bool failed_alloc = false;
4889 bool use_cluster = true; 5122 bool use_cluster = true;
5123 bool have_caching_bg = false;
4890 u64 ideal_cache_percent = 0; 5124 u64 ideal_cache_percent = 0;
4891 u64 ideal_cache_offset = 0; 5125 u64 ideal_cache_offset = 0;
4892 5126
@@ -4939,6 +5173,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4939ideal_cache: 5173ideal_cache:
4940 block_group = btrfs_lookup_block_group(root->fs_info, 5174 block_group = btrfs_lookup_block_group(root->fs_info,
4941 search_start); 5175 search_start);
5176 used_block_group = block_group;
4942 /* 5177 /*
4943 * we don't want to use the block group if it doesn't match our 5178 * we don't want to use the block group if it doesn't match our
4944 * allocation bits, or if its not cached. 5179 * allocation bits, or if its not cached.
@@ -4969,12 +5204,14 @@ ideal_cache:
4969 } 5204 }
4970 } 5205 }
4971search: 5206search:
5207 have_caching_bg = false;
4972 down_read(&space_info->groups_sem); 5208 down_read(&space_info->groups_sem);
4973 list_for_each_entry(block_group, &space_info->block_groups[index], 5209 list_for_each_entry(block_group, &space_info->block_groups[index],
4974 list) { 5210 list) {
4975 u64 offset; 5211 u64 offset;
4976 int cached; 5212 int cached;
4977 5213
5214 used_block_group = block_group;
4978 btrfs_get_block_group(block_group); 5215 btrfs_get_block_group(block_group);
4979 search_start = block_group->key.objectid; 5216 search_start = block_group->key.objectid;
4980 5217
@@ -4998,13 +5235,15 @@ search:
4998 } 5235 }
4999 5236
5000have_block_group: 5237have_block_group:
5001 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 5238 cached = block_group_cache_done(block_group);
5239 if (unlikely(!cached)) {
5002 u64 free_percent; 5240 u64 free_percent;
5003 5241
5242 found_uncached_bg = true;
5004 ret = cache_block_group(block_group, trans, 5243 ret = cache_block_group(block_group, trans,
5005 orig_root, 1); 5244 orig_root, 1);
5006 if (block_group->cached == BTRFS_CACHE_FINISHED) 5245 if (block_group->cached == BTRFS_CACHE_FINISHED)
5007 goto have_block_group; 5246 goto alloc;
5008 5247
5009 free_percent = btrfs_block_group_used(&block_group->item); 5248 free_percent = btrfs_block_group_used(&block_group->item);
5010 free_percent *= 100; 5249 free_percent *= 100;
@@ -5026,7 +5265,6 @@ have_block_group:
5026 orig_root, 0); 5265 orig_root, 0);
5027 BUG_ON(ret); 5266 BUG_ON(ret);
5028 } 5267 }
5029 found_uncached_bg = true;
5030 5268
5031 /* 5269 /*
5032 * If loop is set for cached only, try the next block 5270 * If loop is set for cached only, try the next block
@@ -5036,94 +5274,80 @@ have_block_group:
5036 goto loop; 5274 goto loop;
5037 } 5275 }
5038 5276
5039 cached = block_group_cache_done(block_group); 5277alloc:
5040 if (unlikely(!cached))
5041 found_uncached_bg = true;
5042
5043 if (unlikely(block_group->ro)) 5278 if (unlikely(block_group->ro))
5044 goto loop; 5279 goto loop;
5045 5280
5046 spin_lock(&block_group->free_space_ctl->tree_lock); 5281 spin_lock(&block_group->free_space_ctl->tree_lock);
5047 if (cached && 5282 if (cached &&
5048 block_group->free_space_ctl->free_space < 5283 block_group->free_space_ctl->free_space <
5049 num_bytes + empty_size) { 5284 num_bytes + empty_cluster + empty_size) {
5050 spin_unlock(&block_group->free_space_ctl->tree_lock); 5285 spin_unlock(&block_group->free_space_ctl->tree_lock);
5051 goto loop; 5286 goto loop;
5052 } 5287 }
5053 spin_unlock(&block_group->free_space_ctl->tree_lock); 5288 spin_unlock(&block_group->free_space_ctl->tree_lock);
5054 5289
5055 /* 5290 /*
5056 * Ok we want to try and use the cluster allocator, so lets look 5291 * Ok we want to try and use the cluster allocator, so
5057 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will 5292 * lets look there
5058 * have tried the cluster allocator plenty of times at this
5059 * point and not have found anything, so we are likely way too
5060 * fragmented for the clustering stuff to find anything, so lets
5061 * just skip it and let the allocator find whatever block it can
5062 * find
5063 */ 5293 */
5064 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { 5294 if (last_ptr) {
5065 /* 5295 /*
5066 * the refill lock keeps out other 5296 * the refill lock keeps out other
5067 * people trying to start a new cluster 5297 * people trying to start a new cluster
5068 */ 5298 */
5069 spin_lock(&last_ptr->refill_lock); 5299 spin_lock(&last_ptr->refill_lock);
5070 if (last_ptr->block_group && 5300 used_block_group = last_ptr->block_group;
5071 (last_ptr->block_group->ro || 5301 if (used_block_group != block_group &&
5072 !block_group_bits(last_ptr->block_group, data))) { 5302 (!used_block_group ||
5073 offset = 0; 5303 used_block_group->ro ||
5304 !block_group_bits(used_block_group, data))) {
5305 used_block_group = block_group;
5074 goto refill_cluster; 5306 goto refill_cluster;
5075 } 5307 }
5076 5308
5077 offset = btrfs_alloc_from_cluster(block_group, last_ptr, 5309 if (used_block_group != block_group)
5078 num_bytes, search_start); 5310 btrfs_get_block_group(used_block_group);
5311
5312 offset = btrfs_alloc_from_cluster(used_block_group,
5313 last_ptr, num_bytes, used_block_group->key.objectid);
5079 if (offset) { 5314 if (offset) {
5080 /* we have a block, we're done */ 5315 /* we have a block, we're done */
5081 spin_unlock(&last_ptr->refill_lock); 5316 spin_unlock(&last_ptr->refill_lock);
5082 goto checks; 5317 goto checks;
5083 } 5318 }
5084 5319
5085 spin_lock(&last_ptr->lock); 5320 WARN_ON(last_ptr->block_group != used_block_group);
5086 /* 5321 if (used_block_group != block_group) {
5087 * whoops, this cluster doesn't actually point to 5322 btrfs_put_block_group(used_block_group);
5088 * this block group. Get a ref on the block 5323 used_block_group = block_group;
5089 * group is does point to and try again
5090 */
5091 if (!last_ptr_loop && last_ptr->block_group &&
5092 last_ptr->block_group != block_group &&
5093 index <=
5094 get_block_group_index(last_ptr->block_group)) {
5095
5096 btrfs_put_block_group(block_group);
5097 block_group = last_ptr->block_group;
5098 btrfs_get_block_group(block_group);
5099 spin_unlock(&last_ptr->lock);
5100 spin_unlock(&last_ptr->refill_lock);
5101
5102 last_ptr_loop = 1;
5103 search_start = block_group->key.objectid;
5104 /*
5105 * we know this block group is properly
5106 * in the list because
5107 * btrfs_remove_block_group, drops the
5108 * cluster before it removes the block
5109 * group from the list
5110 */
5111 goto have_block_group;
5112 } 5324 }
5113 spin_unlock(&last_ptr->lock);
5114refill_cluster: 5325refill_cluster:
5326 BUG_ON(used_block_group != block_group);
5327 /* If we are on LOOP_NO_EMPTY_SIZE, we can't
5328 * set up a new clusters, so lets just skip it
5329 * and let the allocator find whatever block
5330 * it can find. If we reach this point, we
5331 * will have tried the cluster allocator
5332 * plenty of times and not have found
5333 * anything, so we are likely way too
5334 * fragmented for the clustering stuff to find
5335 * anything. */
5336 if (loop >= LOOP_NO_EMPTY_SIZE) {
5337 spin_unlock(&last_ptr->refill_lock);
5338 goto unclustered_alloc;
5339 }
5340
5115 /* 5341 /*
5116 * this cluster didn't work out, free it and 5342 * this cluster didn't work out, free it and
5117 * start over 5343 * start over
5118 */ 5344 */
5119 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5345 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5120 5346
5121 last_ptr_loop = 0;
5122
5123 /* allocate a cluster in this block group */ 5347 /* allocate a cluster in this block group */
5124 ret = btrfs_find_space_cluster(trans, root, 5348 ret = btrfs_find_space_cluster(trans, root,
5125 block_group, last_ptr, 5349 block_group, last_ptr,
5126 offset, num_bytes, 5350 search_start, num_bytes,
5127 empty_cluster + empty_size); 5351 empty_cluster + empty_size);
5128 if (ret == 0) { 5352 if (ret == 0) {
5129 /* 5353 /*
@@ -5159,6 +5383,7 @@ refill_cluster:
5159 goto loop; 5383 goto loop;
5160 } 5384 }
5161 5385
5386unclustered_alloc:
5162 offset = btrfs_find_space_for_alloc(block_group, search_start, 5387 offset = btrfs_find_space_for_alloc(block_group, search_start,
5163 num_bytes, empty_size); 5388 num_bytes, empty_size);
5164 /* 5389 /*
@@ -5177,20 +5402,22 @@ refill_cluster:
5177 failed_alloc = true; 5402 failed_alloc = true;
5178 goto have_block_group; 5403 goto have_block_group;
5179 } else if (!offset) { 5404 } else if (!offset) {
5405 if (!cached)
5406 have_caching_bg = true;
5180 goto loop; 5407 goto loop;
5181 } 5408 }
5182checks: 5409checks:
5183 search_start = stripe_align(root, offset); 5410 search_start = stripe_align(root, offset);
5184 /* move on to the next group */ 5411 /* move on to the next group */
5185 if (search_start + num_bytes >= search_end) { 5412 if (search_start + num_bytes >= search_end) {
5186 btrfs_add_free_space(block_group, offset, num_bytes); 5413 btrfs_add_free_space(used_block_group, offset, num_bytes);
5187 goto loop; 5414 goto loop;
5188 } 5415 }
5189 5416
5190 /* move on to the next group */ 5417 /* move on to the next group */
5191 if (search_start + num_bytes > 5418 if (search_start + num_bytes >
5192 block_group->key.objectid + block_group->key.offset) { 5419 used_block_group->key.objectid + used_block_group->key.offset) {
5193 btrfs_add_free_space(block_group, offset, num_bytes); 5420 btrfs_add_free_space(used_block_group, offset, num_bytes);
5194 goto loop; 5421 goto loop;
5195 } 5422 }
5196 5423
@@ -5198,14 +5425,14 @@ checks:
5198 ins->offset = num_bytes; 5425 ins->offset = num_bytes;
5199 5426
5200 if (offset < search_start) 5427 if (offset < search_start)
5201 btrfs_add_free_space(block_group, offset, 5428 btrfs_add_free_space(used_block_group, offset,
5202 search_start - offset); 5429 search_start - offset);
5203 BUG_ON(offset > search_start); 5430 BUG_ON(offset > search_start);
5204 5431
5205 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, 5432 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5206 (data & BTRFS_BLOCK_GROUP_DATA)); 5433 alloc_type);
5207 if (ret == -EAGAIN) { 5434 if (ret == -EAGAIN) {
5208 btrfs_add_free_space(block_group, offset, num_bytes); 5435 btrfs_add_free_space(used_block_group, offset, num_bytes);
5209 goto loop; 5436 goto loop;
5210 } 5437 }
5211 5438
@@ -5214,19 +5441,26 @@ checks:
5214 ins->offset = num_bytes; 5441 ins->offset = num_bytes;
5215 5442
5216 if (offset < search_start) 5443 if (offset < search_start)
5217 btrfs_add_free_space(block_group, offset, 5444 btrfs_add_free_space(used_block_group, offset,
5218 search_start - offset); 5445 search_start - offset);
5219 BUG_ON(offset > search_start); 5446 BUG_ON(offset > search_start);
5447 if (used_block_group != block_group)
5448 btrfs_put_block_group(used_block_group);
5220 btrfs_put_block_group(block_group); 5449 btrfs_put_block_group(block_group);
5221 break; 5450 break;
5222loop: 5451loop:
5223 failed_cluster_refill = false; 5452 failed_cluster_refill = false;
5224 failed_alloc = false; 5453 failed_alloc = false;
5225 BUG_ON(index != get_block_group_index(block_group)); 5454 BUG_ON(index != get_block_group_index(block_group));
5455 if (used_block_group != block_group)
5456 btrfs_put_block_group(used_block_group);
5226 btrfs_put_block_group(block_group); 5457 btrfs_put_block_group(block_group);
5227 } 5458 }
5228 up_read(&space_info->groups_sem); 5459 up_read(&space_info->groups_sem);
5229 5460
5461 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5462 goto search;
5463
5230 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 5464 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5231 goto search; 5465 goto search;
5232 5466
@@ -5325,7 +5559,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5325 int index = 0; 5559 int index = 0;
5326 5560
5327 spin_lock(&info->lock); 5561 spin_lock(&info->lock);
5328 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5562 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5563 (unsigned long long)info->flags,
5329 (unsigned long long)(info->total_bytes - info->bytes_used - 5564 (unsigned long long)(info->total_bytes - info->bytes_used -
5330 info->bytes_pinned - info->bytes_reserved - 5565 info->bytes_pinned - info->bytes_reserved -
5331 info->bytes_readonly), 5566 info->bytes_readonly),
@@ -5411,7 +5646,8 @@ again:
5411 return ret; 5646 return ret;
5412} 5647}
5413 5648
5414int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 5649static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5650 u64 start, u64 len, int pin)
5415{ 5651{
5416 struct btrfs_block_group_cache *cache; 5652 struct btrfs_block_group_cache *cache;
5417 int ret = 0; 5653 int ret = 0;
@@ -5426,8 +5662,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5426 if (btrfs_test_opt(root, DISCARD)) 5662 if (btrfs_test_opt(root, DISCARD))
5427 ret = btrfs_discard_extent(root, start, len, NULL); 5663 ret = btrfs_discard_extent(root, start, len, NULL);
5428 5664
5429 btrfs_add_free_space(cache, start, len); 5665 if (pin)
5430 btrfs_update_reserved_bytes(cache, len, 0, 1); 5666 pin_down_extent(root, cache, start, len, 1);
5667 else {
5668 btrfs_add_free_space(cache, start, len);
5669 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5670 }
5431 btrfs_put_block_group(cache); 5671 btrfs_put_block_group(cache);
5432 5672
5433 trace_btrfs_reserved_extent_free(root, start, len); 5673 trace_btrfs_reserved_extent_free(root, start, len);
@@ -5435,6 +5675,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5435 return ret; 5675 return ret;
5436} 5676}
5437 5677
5678int btrfs_free_reserved_extent(struct btrfs_root *root,
5679 u64 start, u64 len)
5680{
5681 return __btrfs_free_reserved_extent(root, start, len, 0);
5682}
5683
5684int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5685 u64 start, u64 len)
5686{
5687 return __btrfs_free_reserved_extent(root, start, len, 1);
5688}
5689
5438static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5690static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5439 struct btrfs_root *root, 5691 struct btrfs_root *root,
5440 u64 parent, u64 root_objectid, 5692 u64 parent, u64 root_objectid,
@@ -5630,7 +5882,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5630 put_caching_control(caching_ctl); 5882 put_caching_control(caching_ctl);
5631 } 5883 }
5632 5884
5633 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); 5885 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5886 RESERVE_ALLOC_NO_ACCOUNT);
5634 BUG_ON(ret); 5887 BUG_ON(ret);
5635 btrfs_put_block_group(block_group); 5888 btrfs_put_block_group(block_group);
5636 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5889 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5687,8 +5940,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5687 block_rsv = get_block_rsv(trans, root); 5940 block_rsv = get_block_rsv(trans, root);
5688 5941
5689 if (block_rsv->size == 0) { 5942 if (block_rsv->size == 0) {
5690 ret = reserve_metadata_bytes(trans, root, block_rsv, 5943 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5691 blocksize, 0);
5692 /* 5944 /*
5693 * If we couldn't reserve metadata bytes try and use some from 5945 * If we couldn't reserve metadata bytes try and use some from
5694 * the global reserve. 5946 * the global reserve.
@@ -5708,13 +5960,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5708 if (!ret) 5960 if (!ret)
5709 return block_rsv; 5961 return block_rsv;
5710 if (ret) { 5962 if (ret) {
5711 WARN_ON(1); 5963 static DEFINE_RATELIMIT_STATE(_rs,
5712 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 5964 DEFAULT_RATELIMIT_INTERVAL,
5713 0); 5965 /*DEFAULT_RATELIMIT_BURST*/ 2);
5966 if (__ratelimit(&_rs)) {
5967 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5968 WARN_ON(1);
5969 }
5970 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5714 if (!ret) { 5971 if (!ret) {
5715 spin_lock(&block_rsv->lock);
5716 block_rsv->size += blocksize;
5717 spin_unlock(&block_rsv->lock);
5718 return block_rsv; 5972 return block_rsv;
5719 } else if (ret && block_rsv != global_rsv) { 5973 } else if (ret && block_rsv != global_rsv) {
5720 ret = block_rsv_use_bytes(global_rsv, blocksize); 5974 ret = block_rsv_use_bytes(global_rsv, blocksize);
@@ -6592,12 +6846,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6592 cache->bytes_super - btrfs_block_group_used(&cache->item); 6846 cache->bytes_super - btrfs_block_group_used(&cache->item);
6593 6847
6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6848 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6595 sinfo->bytes_may_use + sinfo->bytes_readonly + 6849 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <= 6850 min_allocable_bytes <= sinfo->total_bytes) {
6597 sinfo->total_bytes) {
6598 sinfo->bytes_readonly += num_bytes; 6851 sinfo->bytes_readonly += num_bytes;
6599 sinfo->bytes_reserved += cache->reserved_pinned;
6600 cache->reserved_pinned = 0;
6601 cache->ro = 1; 6852 cache->ro = 1;
6602 ret = 0; 6853 ret = 0;
6603 } 6854 }
@@ -6964,7 +7215,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6964 struct btrfs_space_info, 7215 struct btrfs_space_info,
6965 list); 7216 list);
6966 if (space_info->bytes_pinned > 0 || 7217 if (space_info->bytes_pinned > 0 ||
6967 space_info->bytes_reserved > 0) { 7218 space_info->bytes_reserved > 0 ||
7219 space_info->bytes_may_use > 0) {
6968 WARN_ON(1); 7220 WARN_ON(1);
6969 dump_space_info(space_info, 0, 0); 7221 dump_space_info(space_info, 0, 0);
6970 } 7222 }
@@ -7006,14 +7258,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7006 return -ENOMEM; 7258 return -ENOMEM;
7007 path->reada = 1; 7259 path->reada = 1;
7008 7260
7009 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 7261 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7010 if (cache_gen != 0 && 7262 if (btrfs_test_opt(root, SPACE_CACHE) &&
7011 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) 7263 btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7012 need_clear = 1; 7264 need_clear = 1;
7013 if (btrfs_test_opt(root, CLEAR_CACHE)) 7265 if (btrfs_test_opt(root, CLEAR_CACHE))
7014 need_clear = 1; 7266 need_clear = 1;
7015 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
7016 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7017 7267
7018 while (1) { 7268 while (1) {
7019 ret = find_first_block_group(root, path, &key); 7269 ret = find_first_block_group(root, path, &key);
@@ -7252,7 +7502,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7252 goto out; 7502 goto out;
7253 } 7503 }
7254 7504
7255 inode = lookup_free_space_inode(root, block_group, path); 7505 inode = lookup_free_space_inode(tree_root, block_group, path);
7256 if (!IS_ERR(inode)) { 7506 if (!IS_ERR(inode)) {
7257 ret = btrfs_orphan_add(trans, inode); 7507 ret = btrfs_orphan_add(trans, inode);
7258 BUG_ON(ret); 7508 BUG_ON(ret);
@@ -7268,7 +7518,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7268 spin_unlock(&block_group->lock); 7518 spin_unlock(&block_group->lock);
7269 } 7519 }
7270 /* One for our lookup ref */ 7520 /* One for our lookup ref */
7271 iput(inode); 7521 btrfs_add_delayed_iput(inode);
7272 } 7522 }
7273 7523
7274 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 7524 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7339,7 +7589,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7339 int mixed = 0; 7589 int mixed = 0;
7340 int ret; 7590 int ret;
7341 7591
7342 disk_super = &fs_info->super_copy; 7592 disk_super = fs_info->super_copy;
7343 if (!btrfs_super_root(disk_super)) 7593 if (!btrfs_super_root(disk_super))
7344 return 1; 7594 return 1;
7345 7595
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f..49f3c9dc09f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
17#include "compat.h" 17#include "compat.h"
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h"
20 21
21static struct kmem_cache *extent_state_cache; 22static struct kmem_cache *extent_state_cache;
22static struct kmem_cache *extent_buffer_cache; 23static struct kmem_cache *extent_buffer_cache;
@@ -894,6 +895,202 @@ search_again:
894 goto again; 895 goto again;
895} 896}
896 897
898/**
899 * convert_extent - convert all bits in a given range from one bit to another
900 * @tree: the io tree to search
901 * @start: the start offset in bytes
902 * @end: the end offset in bytes (inclusive)
903 * @bits: the bits to set in this range
904 * @clear_bits: the bits to clear in this range
905 * @mask: the allocation mask
906 *
907 * This will go through and set bits for the given range. If any states exist
908 * already in this range they are set with the given bit and cleared of the
909 * clear_bits. This is only meant to be used by things that are mergeable, ie
910 * converting from say DELALLOC to DIRTY. This is not meant to be used with
911 * boundary bits like LOCK.
912 */
913int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914 int bits, int clear_bits, gfp_t mask)
915{
916 struct extent_state *state;
917 struct extent_state *prealloc = NULL;
918 struct rb_node *node;
919 int err = 0;
920 u64 last_start;
921 u64 last_end;
922
923again:
924 if (!prealloc && (mask & __GFP_WAIT)) {
925 prealloc = alloc_extent_state(mask);
926 if (!prealloc)
927 return -ENOMEM;
928 }
929
930 spin_lock(&tree->lock);
931 /*
932 * this search will find all the extents that end after
933 * our range starts.
934 */
935 node = tree_search(tree, start);
936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc) {
939 err = -ENOMEM;
940 goto out;
941 }
942 err = insert_state(tree, prealloc, start, end, &bits);
943 prealloc = NULL;
944 BUG_ON(err == -EEXIST);
945 goto out;
946 }
947 state = rb_entry(node, struct extent_state, rb_node);
948hit_next:
949 last_start = state->start;
950 last_end = state->end;
951
952 /*
953 * | ---- desired range ---- |
954 * | state |
955 *
956 * Just lock what we found and keep going
957 */
958 if (state->start == start && state->end <= end) {
959 struct rb_node *next_node;
960
961 set_state_bits(tree, state, &bits);
962 clear_state_bit(tree, state, &clear_bits, 0);
963
964 merge_state(tree, state);
965 if (last_end == (u64)-1)
966 goto out;
967
968 start = last_end + 1;
969 next_node = rb_next(&state->rb_node);
970 if (next_node && start < end && prealloc && !need_resched()) {
971 state = rb_entry(next_node, struct extent_state,
972 rb_node);
973 if (state->start == start)
974 goto hit_next;
975 }
976 goto search_again;
977 }
978
979 /*
980 * | ---- desired range ---- |
981 * | state |
982 * or
983 * | ------------- state -------------- |
984 *
985 * We need to split the extent we found, and may flip bits on
986 * second half.
987 *
988 * If the extent we found extends past our
989 * range, we just split and search again. It'll get split
990 * again the next time though.
991 *
992 * If the extent we found is inside our range, we set the
993 * desired bit on it.
994 */
995 if (state->start < start) {
996 prealloc = alloc_extent_state_atomic(prealloc);
997 if (!prealloc) {
998 err = -ENOMEM;
999 goto out;
1000 }
1001 err = split_state(tree, state, prealloc, start);
1002 BUG_ON(err == -EEXIST);
1003 prealloc = NULL;
1004 if (err)
1005 goto out;
1006 if (state->end <= end) {
1007 set_state_bits(tree, state, &bits);
1008 clear_state_bit(tree, state, &clear_bits, 0);
1009 merge_state(tree, state);
1010 if (last_end == (u64)-1)
1011 goto out;
1012 start = last_end + 1;
1013 }
1014 goto search_again;
1015 }
1016 /*
1017 * | ---- desired range ---- |
1018 * | state | or | state |
1019 *
1020 * There's a hole, we need to insert something in it and
1021 * ignore the extent we found.
1022 */
1023 if (state->start > start) {
1024 u64 this_end;
1025 if (end < last_start)
1026 this_end = end;
1027 else
1028 this_end = last_start - 1;
1029
1030 prealloc = alloc_extent_state_atomic(prealloc);
1031 if (!prealloc) {
1032 err = -ENOMEM;
1033 goto out;
1034 }
1035
1036 /*
 1037 * Avoid freeing 'prealloc' if it can be merged with
1038 * the later extent.
1039 */
1040 err = insert_state(tree, prealloc, start, this_end,
1041 &bits);
1042 BUG_ON(err == -EEXIST);
1043 if (err) {
1044 free_extent_state(prealloc);
1045 prealloc = NULL;
1046 goto out;
1047 }
1048 prealloc = NULL;
1049 start = this_end + 1;
1050 goto search_again;
1051 }
1052 /*
1053 * | ---- desired range ---- |
1054 * | state |
1055 * We need to split the extent, and set the bit
1056 * on the first half
1057 */
1058 if (state->start <= end && state->end > end) {
1059 prealloc = alloc_extent_state_atomic(prealloc);
1060 if (!prealloc) {
1061 err = -ENOMEM;
1062 goto out;
1063 }
1064
1065 err = split_state(tree, state, prealloc, end + 1);
1066 BUG_ON(err == -EEXIST);
1067
1068 set_state_bits(tree, prealloc, &bits);
1069 clear_state_bit(tree, prealloc, &clear_bits, 0);
1070
1071 merge_state(tree, prealloc);
1072 prealloc = NULL;
1073 goto out;
1074 }
1075
1076 goto search_again;
1077
1078out:
1079 spin_unlock(&tree->lock);
1080 if (prealloc)
1081 free_extent_state(prealloc);
1082
1083 return err;
1084
1085search_again:
1086 if (start > end)
1087 goto out;
1088 spin_unlock(&tree->lock);
1089 if (mask & __GFP_WAIT)
1090 cond_resched();
1091 goto again;
1092}
1093
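A minimal usage sketch for the new helper, assuming a caller that wants to flip a mergeable range from DELALLOC to DIRTY; the wrapper name and the idea of converting exactly these two bits are illustrative, the signature is the one added above:

	static int example_convert_range(struct extent_io_tree *tree,
					 u64 start, u64 end)
	{
		/* set EXTENT_DIRTY and clear EXTENT_DELALLOC over [start, end] */
		return convert_extent_bit(tree, start, end,
					  EXTENT_DIRTY, EXTENT_DELALLOC, GFP_NOFS);
	}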
897/* wrappers around set/clear extent bit */ 1094/* wrappers around set/clear extent bit */
898int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1095int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
899 gfp_t mask) 1096 gfp_t mask)
@@ -919,7 +1116,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
919 struct extent_state **cached_state, gfp_t mask) 1116 struct extent_state **cached_state, gfp_t mask)
920{ 1117{
921 return set_extent_bit(tree, start, end, 1118 return set_extent_bit(tree, start, end,
922 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 1119 EXTENT_DELALLOC | EXTENT_UPTODATE,
923 0, NULL, cached_state, mask); 1120 0, NULL, cached_state, mask);
924} 1121}
925 1122
@@ -1599,6 +1796,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
1599 return 0; 1796 return 0;
1600} 1797}
1601 1798
1799/*
1800 * When IO fails, either with EIO or csum verification fails, we
1801 * try other mirrors that might have a good copy of the data. This
1802 * io_failure_record is used to record state as we go through all the
1803 * mirrors. If another mirror has good data, the page is set up to date
1804 * and things continue. If a good mirror can't be found, the original
1805 * bio end_io callback is called to indicate things have failed.
1806 */
1807struct io_failure_record {
1808 struct page *page;
1809 u64 start;
1810 u64 len;
1811 u64 logical;
1812 unsigned long bio_flags;
1813 int this_mirror;
1814 int failed_mirror;
1815 int in_validation;
1816};
1817
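The record above is stashed in the per-inode io_failure_tree by storing its pointer as the u64 "private" of the matching extent state, and found again the same way. A small sketch of the lookup side, mirroring the set_state_private()/get_state_private() calls further down (the helper name is illustrative):

	static struct io_failure_record *
	example_lookup_failrec(struct extent_io_tree *failure_tree, u64 start)
	{
		u64 private;

		if (get_state_private(failure_tree, start, &private))
			return NULL;	/* no failure recorded at this offset */
		return (struct io_failure_record *)(unsigned long)private;
	}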
1818static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1819 int did_repair)
1820{
1821 int ret;
1822 int err = 0;
1823 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1824
1825 set_state_private(failure_tree, rec->start, 0);
1826 ret = clear_extent_bits(failure_tree, rec->start,
1827 rec->start + rec->len - 1,
1828 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1829 if (ret)
1830 err = ret;
1831
1832 if (did_repair) {
1833 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1834 rec->start + rec->len - 1,
1835 EXTENT_DAMAGED, GFP_NOFS);
1836 if (ret && !err)
1837 err = ret;
1838 }
1839
1840 kfree(rec);
1841 return err;
1842}
1843
1844static void repair_io_failure_callback(struct bio *bio, int err)
1845{
1846 complete(bio->bi_private);
1847}
1848
1849/*
1850 * this bypasses the standard btrfs submit functions deliberately, as
1851 * the standard behavior is to write all copies in a raid setup. here we only
1852 * want to write the one bad copy. so we do the mapping for ourselves and issue
1853 * submit_bio directly.
 1854 * to avoid any synchronization issues, wait for the data after writing, which
1855 * actually prevents the read that triggered the error from finishing.
1856 * currently, there can be no more than two copies of every data bit. thus,
1857 * exactly one rewrite is required.
1858 */
1859int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1860 u64 length, u64 logical, struct page *page,
1861 int mirror_num)
1862{
1863 struct bio *bio;
1864 struct btrfs_device *dev;
1865 DECLARE_COMPLETION_ONSTACK(compl);
1866 u64 map_length = 0;
1867 u64 sector;
1868 struct btrfs_bio *bbio = NULL;
1869 int ret;
1870
1871 BUG_ON(!mirror_num);
1872
1873 bio = bio_alloc(GFP_NOFS, 1);
1874 if (!bio)
1875 return -EIO;
1876 bio->bi_private = &compl;
1877 bio->bi_end_io = repair_io_failure_callback;
1878 bio->bi_size = 0;
1879 map_length = length;
1880
1881 ret = btrfs_map_block(map_tree, WRITE, logical,
1882 &map_length, &bbio, mirror_num);
1883 if (ret) {
1884 bio_put(bio);
1885 return -EIO;
1886 }
1887 BUG_ON(mirror_num != bbio->mirror_num);
1888 sector = bbio->stripes[mirror_num-1].physical >> 9;
1889 bio->bi_sector = sector;
1890 dev = bbio->stripes[mirror_num-1].dev;
1891 kfree(bbio);
1892 if (!dev || !dev->bdev || !dev->writeable) {
1893 bio_put(bio);
1894 return -EIO;
1895 }
1896 bio->bi_bdev = dev->bdev;
1897 bio_add_page(bio, page, length, start-page_offset(page));
1898 submit_bio(WRITE_SYNC, bio);
1899 wait_for_completion(&compl);
1900
1901 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1902 /* try to remap that extent elsewhere? */
1903 bio_put(bio);
1904 return -EIO;
1905 }
1906
1907 printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1908 "sector %llu)\n", page->mapping->host->i_ino, start,
1909 dev->name, sector);
1910
1911 bio_put(bio);
1912 return 0;
1913}
1914
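A hedged sketch of the intended caller: given a filled-in io_failure_record and a page that now holds good data, rewrite only the mirror that failed. This mirrors the call made from clean_io_failure() below; the wrapper name is illustrative and error handling is omitted:

	static int example_repair_from_record(struct btrfs_mapping_tree *map_tree,
					      struct io_failure_record *rec,
					      struct page *good_page)
	{
		return repair_io_failure(map_tree, rec->start, rec->len,
					 rec->logical, good_page,
					 rec->failed_mirror);
	}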
1915/*
1916 * each time an IO finishes, we do a fast check in the IO failure tree
1917 * to see if we need to process or clean up an io_failure_record
1918 */
1919static int clean_io_failure(u64 start, struct page *page)
1920{
1921 u64 private;
1922 u64 private_failure;
1923 struct io_failure_record *failrec;
1924 struct btrfs_mapping_tree *map_tree;
1925 struct extent_state *state;
1926 int num_copies;
1927 int did_repair = 0;
1928 int ret;
1929 struct inode *inode = page->mapping->host;
1930
1931 private = 0;
1932 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1933 (u64)-1, 1, EXTENT_DIRTY, 0);
1934 if (!ret)
1935 return 0;
1936
1937 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1938 &private_failure);
1939 if (ret)
1940 return 0;
1941
1942 failrec = (struct io_failure_record *)(unsigned long) private_failure;
1943 BUG_ON(!failrec->this_mirror);
1944
1945 if (failrec->in_validation) {
1946 /* there was no real error, just free the record */
1947 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1948 failrec->start);
1949 did_repair = 1;
1950 goto out;
1951 }
1952
1953 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1954 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1955 failrec->start,
1956 EXTENT_LOCKED);
1957 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1958
1959 if (state && state->start == failrec->start) {
1960 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1961 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1962 failrec->len);
1963 if (num_copies > 1) {
1964 ret = repair_io_failure(map_tree, start, failrec->len,
1965 failrec->logical, page,
1966 failrec->failed_mirror);
1967 did_repair = !ret;
1968 }
1969 }
1970
1971out:
1972 if (!ret)
1973 ret = free_io_failure(inode, failrec, did_repair);
1974
1975 return ret;
1976}
1977
1978/*
1979 * this is a generic handler for readpage errors (default
1980 * readpage_io_failed_hook). if other copies exist, read those and write back
 1981 * good data to the failed position. It does not try to remap the
1982 * failed extent elsewhere, hoping the device will be smart enough to do this as
1983 * needed
1984 */
1985
1986static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1987 u64 start, u64 end, int failed_mirror,
1988 struct extent_state *state)
1989{
1990 struct io_failure_record *failrec = NULL;
1991 u64 private;
1992 struct extent_map *em;
1993 struct inode *inode = page->mapping->host;
1994 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1995 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1996 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1997 struct bio *bio;
1998 int num_copies;
1999 int ret;
2000 int read_mode;
2001 u64 logical;
2002
2003 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2004
2005 ret = get_state_private(failure_tree, start, &private);
2006 if (ret) {
2007 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2008 if (!failrec)
2009 return -ENOMEM;
2010 failrec->start = start;
2011 failrec->len = end - start + 1;
2012 failrec->this_mirror = 0;
2013 failrec->bio_flags = 0;
2014 failrec->in_validation = 0;
2015
2016 read_lock(&em_tree->lock);
2017 em = lookup_extent_mapping(em_tree, start, failrec->len);
2018 if (!em) {
2019 read_unlock(&em_tree->lock);
2020 kfree(failrec);
2021 return -EIO;
2022 }
2023
2024 if (em->start > start || em->start + em->len < start) {
2025 free_extent_map(em);
2026 em = NULL;
2027 }
2028 read_unlock(&em_tree->lock);
2029
2030 if (!em || IS_ERR(em)) {
2031 kfree(failrec);
2032 return -EIO;
2033 }
2034 logical = start - em->start;
2035 logical = em->block_start + logical;
2036 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2037 logical = em->block_start;
2038 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2039 extent_set_compress_type(&failrec->bio_flags,
2040 em->compress_type);
2041 }
2042 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2043 "len=%llu\n", logical, start, failrec->len);
2044 failrec->logical = logical;
2045 free_extent_map(em);
2046
2047 /* set the bits in the private failure tree */
2048 ret = set_extent_bits(failure_tree, start, end,
2049 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2050 if (ret >= 0)
2051 ret = set_state_private(failure_tree, start,
2052 (u64)(unsigned long)failrec);
2053 /* set the bits in the inode's tree */
2054 if (ret >= 0)
2055 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2056 GFP_NOFS);
2057 if (ret < 0) {
2058 kfree(failrec);
2059 return ret;
2060 }
2061 } else {
2062 failrec = (struct io_failure_record *)(unsigned long)private;
2063 pr_debug("bio_readpage_error: (found) logical=%llu, "
2064 "start=%llu, len=%llu, validation=%d\n",
2065 failrec->logical, failrec->start, failrec->len,
2066 failrec->in_validation);
2067 /*
2068 * when data can be on disk more than twice, add to failrec here
2069 * (e.g. with a list for failed_mirror) to make
2070 * clean_io_failure() clean all those errors at once.
2071 */
2072 }
2073 num_copies = btrfs_num_copies(
2074 &BTRFS_I(inode)->root->fs_info->mapping_tree,
2075 failrec->logical, failrec->len);
2076 if (num_copies == 1) {
2077 /*
2078 * we only have a single copy of the data, so don't bother with
2079 * all the retry and error correction code that follows. no
2080 * matter what the error is, it is very likely to persist.
2081 */
2082 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2083 "state=%p, num_copies=%d, next_mirror %d, "
2084 "failed_mirror %d\n", state, num_copies,
2085 failrec->this_mirror, failed_mirror);
2086 free_io_failure(inode, failrec, 0);
2087 return -EIO;
2088 }
2089
2090 if (!state) {
2091 spin_lock(&tree->lock);
2092 state = find_first_extent_bit_state(tree, failrec->start,
2093 EXTENT_LOCKED);
2094 if (state && state->start != failrec->start)
2095 state = NULL;
2096 spin_unlock(&tree->lock);
2097 }
2098
2099 /*
2100 * there are two premises:
2101 * a) deliver good data to the caller
2102 * b) correct the bad sectors on disk
2103 */
2104 if (failed_bio->bi_vcnt > 1) {
2105 /*
2106 * to fulfill b), we need to know the exact failing sectors, as
2107 * we don't want to rewrite any more than the failed ones. thus,
2108 * we need separate read requests for the failed bio
2109 *
2110 * if the following BUG_ON triggers, our validation request got
2111 * merged. we need separate requests for our algorithm to work.
2112 */
2113 BUG_ON(failrec->in_validation);
2114 failrec->in_validation = 1;
2115 failrec->this_mirror = failed_mirror;
2116 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2117 } else {
2118 /*
2119 * we're ready to fulfill a) and b) alongside. get a good copy
2120 * of the failed sector and if we succeed, we have setup
2121 * everything for repair_io_failure to do the rest for us.
2122 */
2123 if (failrec->in_validation) {
2124 BUG_ON(failrec->this_mirror != failed_mirror);
2125 failrec->in_validation = 0;
2126 failrec->this_mirror = 0;
2127 }
2128 failrec->failed_mirror = failed_mirror;
2129 failrec->this_mirror++;
2130 if (failrec->this_mirror == failed_mirror)
2131 failrec->this_mirror++;
2132 read_mode = READ_SYNC;
2133 }
2134
2135 if (!state || failrec->this_mirror > num_copies) {
2136 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2137 "next_mirror %d, failed_mirror %d\n", state,
2138 num_copies, failrec->this_mirror, failed_mirror);
2139 free_io_failure(inode, failrec, 0);
2140 return -EIO;
2141 }
2142
2143 bio = bio_alloc(GFP_NOFS, 1);
2144 bio->bi_private = state;
2145 bio->bi_end_io = failed_bio->bi_end_io;
2146 bio->bi_sector = failrec->logical >> 9;
2147 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2148 bio->bi_size = 0;
2149
2150 bio_add_page(bio, page, failrec->len, start - page_offset(page));
2151
2152 pr_debug("bio_readpage_error: submitting new read[%#x] to "
2153 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2154 failrec->this_mirror, num_copies, failrec->in_validation);
2155
2156 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2157 failrec->bio_flags, 0);
2158 return 0;
2159}
2160
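The mirror rotation above reduces to a simple rule: advance to the next mirror, skip the one that already failed, and give up once every copy has been tried. A standalone sketch of that rule (not btrfs code); with failed_mirror = 1 and num_copies = 2 it retries mirror 2 exactly once and then gives up:

	static int next_mirror_to_try(int this_mirror, int failed_mirror,
				      int num_copies)
	{
		this_mirror++;
		if (this_mirror == failed_mirror)
			this_mirror++;
		return this_mirror > num_copies ? -1 : this_mirror; /* -1: give up */
	}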
1602/* lots and lots of room for performance fixes in the end_bio funcs */ 2161/* lots and lots of room for performance fixes in the end_bio funcs */
1603 2162
1604/* 2163/*
@@ -1697,6 +2256,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1697 struct extent_state *cached = NULL; 2256 struct extent_state *cached = NULL;
1698 struct extent_state *state; 2257 struct extent_state *state;
1699 2258
2259 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2260 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2261 (long int)bio->bi_bdev);
1700 tree = &BTRFS_I(page->mapping->host)->io_tree; 2262 tree = &BTRFS_I(page->mapping->host)->io_tree;
1701 2263
1702 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2264 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,12 +2289,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1727 state); 2289 state);
1728 if (ret) 2290 if (ret)
1729 uptodate = 0; 2291 uptodate = 0;
2292 else
2293 clean_io_failure(start, page);
1730 } 2294 }
1731 if (!uptodate && tree->ops && 2295 if (!uptodate) {
1732 tree->ops->readpage_io_failed_hook) { 2296 int failed_mirror;
1733 ret = tree->ops->readpage_io_failed_hook(bio, page, 2297 failed_mirror = (int)(unsigned long)bio->bi_bdev;
1734 start, end, NULL); 2298 /*
2299 * The generic bio_readpage_error handles errors the
2300 * following way: If possible, new read requests are
2301 * created and submitted and will end up in
2302 * end_bio_extent_readpage as well (if we're lucky, not
2303 * in the !uptodate case). In that case it returns 0 and
2304 * we just go on with the next page in our bio. If it
2305 * can't handle the error it will return -EIO and we
2306 * remain responsible for that page.
2307 */
2308 ret = bio_readpage_error(bio, page, start, end,
2309 failed_mirror, NULL);
1735 if (ret == 0) { 2310 if (ret == 0) {
2311error_handled:
1736 uptodate = 2312 uptodate =
1737 test_bit(BIO_UPTODATE, &bio->bi_flags); 2313 test_bit(BIO_UPTODATE, &bio->bi_flags);
1738 if (err) 2314 if (err)
@@ -1740,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1740 uncache_state(&cached); 2316 uncache_state(&cached);
1741 continue; 2317 continue;
1742 } 2318 }
2319 if (tree->ops && tree->ops->readpage_io_failed_hook) {
2320 ret = tree->ops->readpage_io_failed_hook(
2321 bio, page, start, end,
2322 failed_mirror, state);
2323 if (ret == 0)
2324 goto error_handled;
2325 }
1743 } 2326 }
1744 2327
1745 if (uptodate) { 2328 if (uptodate) {
@@ -1811,6 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1811 mirror_num, bio_flags, start); 2394 mirror_num, bio_flags, start);
1812 else 2395 else
1813 submit_bio(rw, bio); 2396 submit_bio(rw, bio);
2397
1814 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2398 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1815 ret = -EOPNOTSUPP; 2399 ret = -EOPNOTSUPP;
1816 bio_put(bio); 2400 bio_put(bio);
@@ -2076,16 +2660,16 @@ out:
2076} 2660}
2077 2661
2078int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2662int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2079 get_extent_t *get_extent) 2663 get_extent_t *get_extent, int mirror_num)
2080{ 2664{
2081 struct bio *bio = NULL; 2665 struct bio *bio = NULL;
2082 unsigned long bio_flags = 0; 2666 unsigned long bio_flags = 0;
2083 int ret; 2667 int ret;
2084 2668
2085 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2669 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2086 &bio_flags); 2670 &bio_flags);
2087 if (bio) 2671 if (bio)
2088 ret = submit_one_bio(READ, bio, 0, bio_flags); 2672 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2089 return ret; 2673 return ret;
2090} 2674}
2091 2675
@@ -2136,6 +2720,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 int compressed; 2720 int compressed;
2137 int write_flags; 2721 int write_flags;
2138 unsigned long nr_written = 0; 2722 unsigned long nr_written = 0;
2723 bool fill_delalloc = true;
2139 2724
2140 if (wbc->sync_mode == WB_SYNC_ALL) 2725 if (wbc->sync_mode == WB_SYNC_ALL)
2141 write_flags = WRITE_SYNC; 2726 write_flags = WRITE_SYNC;
@@ -2145,6 +2730,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2145 trace___extent_writepage(page, inode, wbc); 2730 trace___extent_writepage(page, inode, wbc);
2146 2731
2147 WARN_ON(!PageLocked(page)); 2732 WARN_ON(!PageLocked(page));
2733
2734 ClearPageError(page);
2735
2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2736 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149 if (page->index > end_index || 2737 if (page->index > end_index ||
2150 (page->index == end_index && !pg_offset)) { 2738 (page->index == end_index && !pg_offset)) {
@@ -2166,10 +2754,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2166 2754
2167 set_page_extent_mapped(page); 2755 set_page_extent_mapped(page);
2168 2756
2757 if (!tree->ops || !tree->ops->fill_delalloc)
2758 fill_delalloc = false;
2759
2169 delalloc_start = start; 2760 delalloc_start = start;
2170 delalloc_end = 0; 2761 delalloc_end = 0;
2171 page_started = 0; 2762 page_started = 0;
2172 if (!epd->extent_locked) { 2763 if (!epd->extent_locked && fill_delalloc) {
2173 u64 delalloc_to_write = 0; 2764 u64 delalloc_to_write = 0;
2174 /* 2765 /*
2175 * make sure the wbc mapping index is at least updated 2766 * make sure the wbc mapping index is at least updated
@@ -2421,10 +3012,16 @@ retry:
2421 * swizzled back from swapper_space to tmpfs file 3012 * swizzled back from swapper_space to tmpfs file
2422 * mapping 3013 * mapping
2423 */ 3014 */
2424 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 3015 if (tree->ops &&
2425 tree->ops->write_cache_pages_lock_hook(page); 3016 tree->ops->write_cache_pages_lock_hook) {
2426 else 3017 tree->ops->write_cache_pages_lock_hook(page,
2427 lock_page(page); 3018 data, flush_fn);
3019 } else {
3020 if (!trylock_page(page)) {
3021 flush_fn(data);
3022 lock_page(page);
3023 }
3024 }
2428 3025
2429 if (unlikely(page->mapping != mapping)) { 3026 if (unlikely(page->mapping != mapping)) {
2430 unlock_page(page); 3027 unlock_page(page);
@@ -2790,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2790 return -ENOMEM; 3387 return -ENOMEM;
2791 path->leave_spinning = 1; 3388 path->leave_spinning = 1;
2792 3389
3390 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3391 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3392
2793 /* 3393 /*
2794 * lookup the last file extent. We're not using i_size here 3394 * lookup the last file extent. We're not using i_size here
2795 * because there might be preallocation past i_size 3395 * because there might be preallocation past i_size
@@ -2837,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2837 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3437 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2838 &cached_state, GFP_NOFS); 3438 &cached_state, GFP_NOFS);
2839 3439
2840 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3440 em = get_extent_skip_holes(inode, start, last_for_get_extent,
2841 get_extent); 3441 get_extent);
2842 if (!em) 3442 if (!em)
2843 goto out; 3443 goto out;
@@ -2926,7 +3526,7 @@ out:
2926 return ret; 3526 return ret;
2927} 3527}
2928 3528
2929static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3529inline struct page *extent_buffer_page(struct extent_buffer *eb,
2930 unsigned long i) 3530 unsigned long i)
2931{ 3531{
2932 struct page *p; 3532 struct page *p;
@@ -2951,7 +3551,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2951 return p; 3551 return p;
2952} 3552}
2953 3553
2954static inline unsigned long num_extent_pages(u64 start, u64 len) 3554inline unsigned long num_extent_pages(u64 start, u64 len)
2955{ 3555{
2956 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3556 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2957 (start >> PAGE_CACHE_SHIFT); 3557 (start >> PAGE_CACHE_SHIFT);
@@ -3204,6 +3804,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3204 PAGECACHE_TAG_DIRTY); 3804 PAGECACHE_TAG_DIRTY);
3205 } 3805 }
3206 spin_unlock_irq(&page->mapping->tree_lock); 3806 spin_unlock_irq(&page->mapping->tree_lock);
3807 ClearPageError(page);
3207 unlock_page(page); 3808 unlock_page(page);
3208 } 3809 }
3209 return 0; 3810 return 0;
@@ -3349,8 +3950,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3349} 3950}
3350 3951
3351int read_extent_buffer_pages(struct extent_io_tree *tree, 3952int read_extent_buffer_pages(struct extent_io_tree *tree,
3352 struct extent_buffer *eb, 3953 struct extent_buffer *eb, u64 start, int wait,
3353 u64 start, int wait,
3354 get_extent_t *get_extent, int mirror_num) 3954 get_extent_t *get_extent, int mirror_num)
3355{ 3955{
3356 unsigned long i; 3956 unsigned long i;
@@ -3386,7 +3986,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3386 num_pages = num_extent_pages(eb->start, eb->len); 3986 num_pages = num_extent_pages(eb->start, eb->len);
3387 for (i = start_i; i < num_pages; i++) { 3987 for (i = start_i; i < num_pages; i++) {
3388 page = extent_buffer_page(eb, i); 3988 page = extent_buffer_page(eb, i);
3389 if (!wait) { 3989 if (wait == WAIT_NONE) {
3390 if (!trylock_page(page)) 3990 if (!trylock_page(page))
3391 goto unlock_exit; 3991 goto unlock_exit;
3392 } else { 3992 } else {
@@ -3430,7 +4030,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3430 if (bio) 4030 if (bio)
3431 submit_one_bio(READ, bio, mirror_num, bio_flags); 4031 submit_one_bio(READ, bio, mirror_num, bio_flags);
3432 4032
3433 if (ret || !wait) 4033 if (ret || wait != WAIT_COMPLETE)
3434 return ret; 4034 return ret;
3435 4035
3436 for (i = start_i; i < num_pages; i++) { 4036 for (i = start_i; i < num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e792..7604c300132 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,8 @@
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 24
@@ -32,6 +34,7 @@
32#define EXTENT_BUFFER_BLOCKING 1 34#define EXTENT_BUFFER_BLOCKING 1
33#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
35 38
36/* these are flags for extent_clear_unlock_delalloc */ 39/* these are flags for extent_clear_unlock_delalloc */
37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 40#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -67,7 +70,7 @@ struct extent_io_ops {
67 unsigned long bio_flags); 70 unsigned long bio_flags);
68 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
69 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
70 u64 start, u64 end, 73 u64 start, u64 end, int failed_mirror,
71 struct extent_state *state); 74 struct extent_state *state);
72 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
73 u64 start, u64 end, 76 u64 start, u64 end,
@@ -85,7 +88,8 @@ struct extent_io_ops {
85 struct extent_state *other); 88 struct extent_state *other);
86 void (*split_extent_hook)(struct inode *inode, 89 void (*split_extent_hook)(struct inode *inode,
87 struct extent_state *orig, u64 split); 90 struct extent_state *orig, u64 split);
88 int (*write_cache_pages_lock_hook)(struct page *page); 91 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
92 void (*flush_fn)(void *));
89}; 93};
90 94
91struct extent_io_tree { 95struct extent_io_tree {
@@ -185,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
185int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 189int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
186 gfp_t mask); 190 gfp_t mask);
187int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 191int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
188 get_extent_t *get_extent); 192 get_extent_t *get_extent, int mirror_num);
189int __init extent_io_init(void); 193int __init extent_io_init(void);
190void extent_io_exit(void); 194void extent_io_exit(void);
191 195
@@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
214 gfp_t mask); 218 gfp_t mask);
215int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 219int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
216 gfp_t mask); 220 gfp_t mask);
221int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
222 int bits, int clear_bits, gfp_t mask);
217int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 223int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
218 struct extent_state **cached_state, gfp_t mask); 224 struct extent_state **cached_state, gfp_t mask);
219int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 225int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
248struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 254struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
249 u64 start, unsigned long len); 255 u64 start, unsigned long len);
250void free_extent_buffer(struct extent_buffer *eb); 256void free_extent_buffer(struct extent_buffer *eb);
257#define WAIT_NONE 0
258#define WAIT_COMPLETE 1
259#define WAIT_PAGE_LOCK 2
251int read_extent_buffer_pages(struct extent_io_tree *tree, 260int read_extent_buffer_pages(struct extent_io_tree *tree,
252 struct extent_buffer *eb, u64 start, int wait, 261 struct extent_buffer *eb, u64 start, int wait,
253 get_extent_t *get_extent, int mirror_num); 262 get_extent_t *get_extent, int mirror_num);
263unsigned long num_extent_pages(u64 start, u64 len);
264struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
254 265
255static inline void extent_buffer_get(struct extent_buffer *eb) 266static inline void extent_buffer_get(struct extent_buffer *eb)
256{ 267{
@@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
300struct bio * 311struct bio *
301btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 312btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
302 gfp_t gfp_flags); 313 gfp_t gfp_flags);
314
315struct btrfs_mapping_tree;
316
317int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
318 u64 length, u64 logical, struct page *page,
319 int mirror_num);
303#endif 320#endif
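
The WAIT_* defines above replace the old boolean wait argument of read_extent_buffer_pages(): WAIT_NONE only trylocks the pages and never waits for the read, WAIT_PAGE_LOCK blocks on the page locks but returns before the I/O completes, and only WAIT_COMPLETE waits for the pages to become uptodate. A rough standalone C sketch of that control flow; only the WAIT_* names come from the patch, everything else is illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    #define WAIT_NONE      0        /* trylock pages; never block, never wait for I/O */
    #define WAIT_COMPLETE  1        /* lock pages and wait until the read finishes    */
    #define WAIT_PAGE_LOCK 2        /* lock pages (may block) but return before I/O   */

    /* Illustrative stand-in for read_extent_buffer_pages(); 'busy' models a
     * page that trylock_page() would fail on. */
    static int read_pages_model(int wait, bool busy)
    {
            if (wait == WAIT_NONE && busy)
                    return 0;       /* give up quietly, as the trylock path does */

            /* ...pages are locked and the read bio is submitted here... */

            if (wait != WAIT_COMPLETE)
                    return 0;       /* WAIT_NONE and WAIT_PAGE_LOCK return early */

            /* only WAIT_COMPLETE waits for every page to become uptodate */
            return 0;
    }

    int main(void)
    {
            printf("%d %d\n", read_pages_model(WAIT_NONE, true),
                   read_pages_model(WAIT_COMPLETE, false));
            return 0;
    }
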
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821bec..c7fb3a4247d 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
91 struct btrfs_csum_item *item; 91 struct btrfs_csum_item *item;
92 struct extent_buffer *leaf; 92 struct extent_buffer *leaf;
93 u64 csum_offset = 0; 93 u64 csum_offset = 0;
94 u16 csum_size = 94 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
95 btrfs_super_csum_size(&root->fs_info->super_copy);
96 int csums_in_item; 95 int csums_in_item;
97 96
98 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 97 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
162 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
163 u64 disk_bytenr; 162 u64 disk_bytenr;
164 u32 diff; 163 u32 diff;
165 u16 csum_size = 164 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
166 btrfs_super_csum_size(&root->fs_info->super_copy);
167 int ret; 165 int ret;
168 struct btrfs_path *path; 166 struct btrfs_path *path;
169 struct btrfs_csum_item *item = NULL; 167 struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
290 int ret; 288 int ret;
291 size_t size; 289 size_t size;
292 u64 csum_end; 290 u64 csum_end;
293 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 291 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
294 292
295 path = btrfs_alloc_path(); 293 path = btrfs_alloc_path();
296 if (!path) 294 if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
492 u64 bytenr, u64 len) 490 u64 bytenr, u64 len)
493{ 491{
494 struct extent_buffer *leaf; 492 struct extent_buffer *leaf;
495 u16 csum_size = 493 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
496 btrfs_super_csum_size(&root->fs_info->super_copy);
497 u64 csum_end; 494 u64 csum_end;
498 u64 end_byte = bytenr + len; 495 u64 end_byte = bytenr + len;
499 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; 496 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
549 u64 csum_end; 546 u64 csum_end;
550 struct extent_buffer *leaf; 547 struct extent_buffer *leaf;
551 int ret; 548 int ret;
552 u16 csum_size = 549 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
553 btrfs_super_csum_size(&root->fs_info->super_copy);
554 int blocksize_bits = root->fs_info->sb->s_blocksize_bits; 550 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
555 551
556 root = root->fs_info->csum_root; 552 root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
676 struct btrfs_sector_sum *sector_sum; 672 struct btrfs_sector_sum *sector_sum;
677 u32 nritems; 673 u32 nritems;
678 u32 ins_size; 674 u32 ins_size;
679 u16 csum_size = 675 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
680 btrfs_super_csum_size(&root->fs_info->super_copy);
681 676
682 path = btrfs_alloc_path(); 677 path = btrfs_alloc_path();
683 if (!path) 678 if (!path)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1266f6e9cdb..dafdfa059bf 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1069 int i; 1069 int i;
1070 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1070 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1071 struct inode *inode = fdentry(file)->d_inode; 1071 struct inode *inode = fdentry(file)->d_inode;
1072 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1072 int err = 0; 1073 int err = 0;
1073 int faili = 0; 1074 int faili = 0;
1074 u64 start_pos; 1075 u64 start_pos;
@@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1080again: 1081again:
1081 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1082 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1083 GFP_NOFS); 1084 mask);
1084 if (!pages[i]) { 1085 if (!pages[i]) {
1085 faili = i - 1; 1086 faili = i - 1;
1086 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1615,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1615 goto out; 1616 goto out;
1616 } 1617 }
1617 1618
1618 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1619 if (ret)
1620 goto out;
1621
1622 locked_end = alloc_end - 1; 1619 locked_end = alloc_end - 1;
1623 while (1) { 1620 while (1) {
1624 struct btrfs_ordered_extent *ordered; 1621 struct btrfs_ordered_extent *ordered;
@@ -1664,11 +1661,27 @@ static long btrfs_fallocate(struct file *file, int mode,
1664 if (em->block_start == EXTENT_MAP_HOLE || 1661 if (em->block_start == EXTENT_MAP_HOLE ||
1665 (cur_offset >= inode->i_size && 1662 (cur_offset >= inode->i_size &&
1666 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1663 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1664
1665 /*
1666 * Make sure we have enough space before we do the
1667 * allocation.
1668 */
1669 ret = btrfs_check_data_free_space(inode, last_byte -
1670 cur_offset);
1671 if (ret) {
1672 free_extent_map(em);
1673 break;
1674 }
1675
1667 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 1676 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1668 last_byte - cur_offset, 1677 last_byte - cur_offset,
1669 1 << inode->i_blkbits, 1678 1 << inode->i_blkbits,
1670 offset + len, 1679 offset + len,
1671 &alloc_hint); 1680 &alloc_hint);
1681
1682 /* Let go of our reservation. */
1683 btrfs_free_reserved_data_space(inode, last_byte -
1684 cur_offset);
1672 if (ret < 0) { 1685 if (ret < 0) {
1673 free_extent_map(em); 1686 free_extent_map(em);
1674 break; 1687 break;
@@ -1694,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1694 } 1707 }
1695 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 1708 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1696 &cached_state, GFP_NOFS); 1709 &cached_state, GFP_NOFS);
1697
1698 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1699out: 1710out:
1700 mutex_unlock(&inode->i_mutex); 1711 mutex_unlock(&inode->i_mutex);
1701 return ret; 1712 return ret;
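
The fallocate hunks above move the data-space reservation: instead of one btrfs_check_data_free_space() call for the whole range up front, space is now reserved per hole right before btrfs_prealloc_file_range() and released again immediately afterwards, so already-allocated ranges never hold a reservation. A hedged userspace sketch of that per-chunk pattern; reserve(), release() and prealloc() are placeholders, not btrfs functions:

    #include <stdint.h>
    #include <stdio.h>

    static int reserve(uint64_t bytes)  { printf("reserve %llu\n", (unsigned long long)bytes); return 0; }
    static void release(uint64_t bytes) { printf("release %llu\n", (unsigned long long)bytes); }
    static int prealloc(uint64_t off, uint64_t len) { (void)off; (void)len; return 0; }

    static int fallocate_model(uint64_t start, uint64_t end, uint64_t step)
    {
            uint64_t cur;

            for (cur = start; cur < end; cur += step) {
                    uint64_t len = (end - cur < step) ? end - cur : step;
                    int ret;

                    ret = reserve(len);     /* reserve only this chunk, not the whole range */
                    if (ret)
                            return ret;
                    ret = prealloc(cur, len);
                    release(len);           /* let go of the reservation right away */
                    if (ret < 0)
                            return ret;
            }
            return 0;
    }

    int main(void)
    {
            return fallocate_model(0, 1 << 20, 1 << 18);
    }
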
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d..ec23d43d0c3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/math64.h> 22#include <linux/math64.h>
23#include <linux/ratelimit.h>
23#include "ctree.h" 24#include "ctree.h"
24#include "free-space-cache.h" 25#include "free-space-cache.h"
25#include "transaction.h" 26#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
84 *block_group, struct btrfs_path *path) 85 *block_group, struct btrfs_path *path)
85{ 86{
86 struct inode *inode = NULL; 87 struct inode *inode = NULL;
88 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
87 89
88 spin_lock(&block_group->lock); 90 spin_lock(&block_group->lock);
89 if (block_group->inode) 91 if (block_group->inode)
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 100 return inode;
99 101
100 spin_lock(&block_group->lock); 102 spin_lock(&block_group->lock);
101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { 103 if (!((BTRFS_I(inode)->flags & flags) == flags)) {
102 printk(KERN_INFO "Old style space inode found, converting.\n"); 104 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; 105 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
106 BTRFS_INODE_NODATACOW;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR; 107 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 } 108 }
106 109
107 if (!btrfs_fs_closing(root->fs_info)) { 110 if (!block_group->iref) {
108 block_group->inode = igrab(inode); 111 block_group->inode = igrab(inode);
109 block_group->iref = 1; 112 block_group->iref = 1;
110 } 113 }
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
122 struct btrfs_free_space_header *header; 125 struct btrfs_free_space_header *header;
123 struct btrfs_inode_item *inode_item; 126 struct btrfs_inode_item *inode_item;
124 struct extent_buffer *leaf; 127 struct extent_buffer *leaf;
128 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
125 int ret; 129 int ret;
126 130
127 ret = btrfs_insert_empty_inode(trans, root, path, ino); 131 ret = btrfs_insert_empty_inode(trans, root, path, ino);
128 if (ret) 132 if (ret)
129 return ret; 133 return ret;
130 134
135 /* We inline crc's for the free disk space cache */
136 if (ino != BTRFS_FREE_INO_OBJECTID)
137 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
138
131 leaf = path->nodes[0]; 139 leaf = path->nodes[0];
132 inode_item = btrfs_item_ptr(leaf, path->slots[0], 140 inode_item = btrfs_item_ptr(leaf, path->slots[0],
133 struct btrfs_inode_item); 141 struct btrfs_inode_item);
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
140 btrfs_set_inode_uid(leaf, inode_item, 0); 148 btrfs_set_inode_uid(leaf, inode_item, 0);
141 btrfs_set_inode_gid(leaf, inode_item, 0); 149 btrfs_set_inode_gid(leaf, inode_item, 0);
142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 150 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 151 btrfs_set_inode_flags(leaf, inode_item, flags);
144 BTRFS_INODE_PREALLOC);
145 btrfs_set_inode_nlink(leaf, inode_item, 1); 152 btrfs_set_inode_nlink(leaf, inode_item, 1);
146 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 153 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
147 btrfs_set_inode_block_group(leaf, inode_item, offset); 154 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
191 struct inode *inode) 198 struct inode *inode)
192{ 199{
193 struct btrfs_block_rsv *rsv; 200 struct btrfs_block_rsv *rsv;
201 u64 needed_bytes;
194 loff_t oldsize; 202 loff_t oldsize;
195 int ret = 0; 203 int ret = 0;
196 204
197 rsv = trans->block_rsv; 205 rsv = trans->block_rsv;
198 trans->block_rsv = root->orphan_block_rsv; 206 trans->block_rsv = &root->fs_info->global_block_rsv;
199 ret = btrfs_block_rsv_check(trans, root, 207
200 root->orphan_block_rsv, 208 /* 1 for slack space, 1 for updating the inode */
201 0, 5); 209 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
202 if (ret) 210 btrfs_calc_trans_metadata_size(root, 1);
203 return ret; 211
212 spin_lock(&trans->block_rsv->lock);
213 if (trans->block_rsv->reserved < needed_bytes) {
214 spin_unlock(&trans->block_rsv->lock);
215 trans->block_rsv = rsv;
216 return -ENOSPC;
217 }
218 spin_unlock(&trans->block_rsv->lock);
204 219
205 oldsize = i_size_read(inode); 220 oldsize = i_size_read(inode);
206 btrfs_i_size_write(inode, 0); 221 btrfs_i_size_write(inode, 0);
@@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
213 ret = btrfs_truncate_inode_items(trans, root, inode, 228 ret = btrfs_truncate_inode_items(trans, root, inode,
214 0, BTRFS_EXTENT_DATA_KEY); 229 0, BTRFS_EXTENT_DATA_KEY);
215 230
216 trans->block_rsv = rsv;
217 if (ret) { 231 if (ret) {
232 trans->block_rsv = rsv;
218 WARN_ON(1); 233 WARN_ON(1);
219 return ret; 234 return ret;
220 } 235 }
221 236
222 ret = btrfs_update_inode(trans, root, inode); 237 ret = btrfs_update_inode(trans, root, inode);
238 trans->block_rsv = rsv;
239
223 return ret; 240 return ret;
224} 241}
225 242
@@ -242,26 +259,348 @@ static int readahead_cache(struct inode *inode)
242 return 0; 259 return 0;
243} 260}
244 261
262struct io_ctl {
263 void *cur, *orig;
264 struct page *page;
265 struct page **pages;
266 struct btrfs_root *root;
267 unsigned long size;
268 int index;
269 int num_pages;
270 unsigned check_crcs:1;
271};
272
273static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
274 struct btrfs_root *root)
275{
276 memset(io_ctl, 0, sizeof(struct io_ctl));
277 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
278 PAGE_CACHE_SHIFT;
279 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
280 GFP_NOFS);
281 if (!io_ctl->pages)
282 return -ENOMEM;
283 io_ctl->root = root;
284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
285 io_ctl->check_crcs = 1;
286 return 0;
287}
288
289static void io_ctl_free(struct io_ctl *io_ctl)
290{
291 kfree(io_ctl->pages);
292}
293
294static void io_ctl_unmap_page(struct io_ctl *io_ctl)
295{
296 if (io_ctl->cur) {
297 kunmap(io_ctl->page);
298 io_ctl->cur = NULL;
299 io_ctl->orig = NULL;
300 }
301}
302
303static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
304{
305 WARN_ON(io_ctl->cur);
306 BUG_ON(io_ctl->index >= io_ctl->num_pages);
307 io_ctl->page = io_ctl->pages[io_ctl->index++];
308 io_ctl->cur = kmap(io_ctl->page);
309 io_ctl->orig = io_ctl->cur;
310 io_ctl->size = PAGE_CACHE_SIZE;
311 if (clear)
312 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
313}
314
315static void io_ctl_drop_pages(struct io_ctl *io_ctl)
316{
317 int i;
318
319 io_ctl_unmap_page(io_ctl);
320
321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]);
323 unlock_page(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]);
325 }
326}
327
328static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
329 int uptodate)
330{
331 struct page *page;
332 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
333 int i;
334
335 for (i = 0; i < io_ctl->num_pages; i++) {
336 page = find_or_create_page(inode->i_mapping, i, mask);
337 if (!page) {
338 io_ctl_drop_pages(io_ctl);
339 return -ENOMEM;
340 }
341 io_ctl->pages[i] = page;
342 if (uptodate && !PageUptodate(page)) {
343 btrfs_readpage(NULL, page);
344 lock_page(page);
345 if (!PageUptodate(page)) {
346 printk(KERN_ERR "btrfs: error reading free "
347 "space cache\n");
348 io_ctl_drop_pages(io_ctl);
349 return -EIO;
350 }
351 }
352 }
353
354 for (i = 0; i < io_ctl->num_pages; i++) {
355 clear_page_dirty_for_io(io_ctl->pages[i]);
356 set_page_extent_mapped(io_ctl->pages[i]);
357 }
358
359 return 0;
360}
361
362static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
363{
364 u64 *val;
365
366 io_ctl_map_page(io_ctl, 1);
367
368 /*
369 * Skip the csum areas. If we don't check crcs then we just have a
370 * 64bit chunk at the front of the first page.
371 */
372 if (io_ctl->check_crcs) {
373 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
374 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
375 } else {
376 io_ctl->cur += sizeof(u64);
377 io_ctl->size -= sizeof(u64) * 2;
378 }
379
380 val = io_ctl->cur;
381 *val = cpu_to_le64(generation);
382 io_ctl->cur += sizeof(u64);
383}
384
385static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
386{
387 u64 *gen;
388
389 /*
390 * Skip the crc area. If we don't check crcs then we just have a 64bit
391 * chunk at the front of the first page.
392 */
393 if (io_ctl->check_crcs) {
394 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
395 io_ctl->size -= sizeof(u64) +
396 (sizeof(u32) * io_ctl->num_pages);
397 } else {
398 io_ctl->cur += sizeof(u64);
399 io_ctl->size -= sizeof(u64) * 2;
400 }
401
402 gen = io_ctl->cur;
403 if (le64_to_cpu(*gen) != generation) {
404 printk_ratelimited(KERN_ERR "btrfs: space cache generation "
405 "(%Lu) does not match inode (%Lu)\n", *gen,
406 generation);
407 io_ctl_unmap_page(io_ctl);
408 return -EIO;
409 }
410 io_ctl->cur += sizeof(u64);
411 return 0;
412}
413
414static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
415{
416 u32 *tmp;
417 u32 crc = ~(u32)0;
418 unsigned offset = 0;
419
420 if (!io_ctl->check_crcs) {
421 io_ctl_unmap_page(io_ctl);
422 return;
423 }
424
425 if (index == 0)
 426 offset = sizeof(u32) * io_ctl->num_pages;
427
428 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
429 PAGE_CACHE_SIZE - offset);
430 btrfs_csum_final(crc, (char *)&crc);
431 io_ctl_unmap_page(io_ctl);
432 tmp = kmap(io_ctl->pages[0]);
433 tmp += index;
434 *tmp = crc;
435 kunmap(io_ctl->pages[0]);
436}
437
438static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
439{
440 u32 *tmp, val;
441 u32 crc = ~(u32)0;
442 unsigned offset = 0;
443
444 if (!io_ctl->check_crcs) {
445 io_ctl_map_page(io_ctl, 0);
446 return 0;
447 }
448
449 if (index == 0)
450 offset = sizeof(u32) * io_ctl->num_pages;
451
452 tmp = kmap(io_ctl->pages[0]);
453 tmp += index;
454 val = *tmp;
455 kunmap(io_ctl->pages[0]);
456
457 io_ctl_map_page(io_ctl, 0);
458 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
459 PAGE_CACHE_SIZE - offset);
460 btrfs_csum_final(crc, (char *)&crc);
461 if (val != crc) {
462 printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
463 "space cache\n");
464 io_ctl_unmap_page(io_ctl);
465 return -EIO;
466 }
467
468 return 0;
469}
470
471static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
472 void *bitmap)
473{
474 struct btrfs_free_space_entry *entry;
475
476 if (!io_ctl->cur)
477 return -ENOSPC;
478
479 entry = io_ctl->cur;
480 entry->offset = cpu_to_le64(offset);
481 entry->bytes = cpu_to_le64(bytes);
482 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
483 BTRFS_FREE_SPACE_EXTENT;
484 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
485 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
486
487 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
488 return 0;
489
490 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
491
492 /* No more pages to map */
493 if (io_ctl->index >= io_ctl->num_pages)
494 return 0;
495
496 /* map the next page */
497 io_ctl_map_page(io_ctl, 1);
498 return 0;
499}
500
501static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
502{
503 if (!io_ctl->cur)
504 return -ENOSPC;
505
506 /*
507 * If we aren't at the start of the current page, unmap this one and
508 * map the next one if there is any left.
509 */
510 if (io_ctl->cur != io_ctl->orig) {
511 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
512 if (io_ctl->index >= io_ctl->num_pages)
513 return -ENOSPC;
514 io_ctl_map_page(io_ctl, 0);
515 }
516
517 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
518 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
519 if (io_ctl->index < io_ctl->num_pages)
520 io_ctl_map_page(io_ctl, 0);
521 return 0;
522}
523
524static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
525{
526 /*
527 * If we're not on the boundary we know we've modified the page and we
528 * need to crc the page.
529 */
530 if (io_ctl->cur != io_ctl->orig)
531 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
532 else
533 io_ctl_unmap_page(io_ctl);
534
535 while (io_ctl->index < io_ctl->num_pages) {
536 io_ctl_map_page(io_ctl, 1);
537 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
538 }
539}
540
541static int io_ctl_read_entry(struct io_ctl *io_ctl,
542 struct btrfs_free_space *entry, u8 *type)
543{
544 struct btrfs_free_space_entry *e;
545 int ret;
546
547 if (!io_ctl->cur) {
548 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
549 if (ret)
550 return ret;
551 }
552
553 e = io_ctl->cur;
554 entry->offset = le64_to_cpu(e->offset);
555 entry->bytes = le64_to_cpu(e->bytes);
556 *type = e->type;
557 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
558 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
559
560 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
561 return 0;
562
563 io_ctl_unmap_page(io_ctl);
564
565 return 0;
566}
567
568static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
569 struct btrfs_free_space *entry)
570{
571 int ret;
572
573 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
574 if (ret)
575 return ret;
576
577 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
578 io_ctl_unmap_page(io_ctl);
579
580 return 0;
581}
582
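
The io_ctl helpers above define the on-disk header of the rewritten cache file: with check_crcs set, page 0 begins with one u32 checksum slot per cache page followed by the u64 generation, while the crc-less free-ino cache keeps the old layout of a u64 placeholder plus the generation (a later hunk refuses to write the cache at all if the crc array alone would fill the first page). A small standalone sketch of that header arithmetic; the 4096-byte page size is an assumption for the example:

    #include <stdint.h>
    #include <stdio.h>

    #define CACHE_PAGE_SIZE 4096u   /* assumed page size for the example */

    static unsigned first_entry_offset(unsigned num_pages, int check_crcs)
    {
            if (check_crcs)
                    /* one u32 crc per cache page, then the u64 generation */
                    return (unsigned)(num_pages * sizeof(uint32_t) + sizeof(uint64_t));
            /* legacy layout: u64 placeholder (the old bogus crc) + generation */
            return (unsigned)(2 * sizeof(uint64_t));
    }

    int main(void)
    {
            unsigned num_pages = 8;

            printf("entries start at %u (crc) / %u (no crc); %u bytes left on page 0\n",
                   first_entry_offset(num_pages, 1),
                   first_entry_offset(num_pages, 0),
                   CACHE_PAGE_SIZE - first_entry_offset(num_pages, 1));
            return 0;
    }
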
245int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 583int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
246 struct btrfs_free_space_ctl *ctl, 584 struct btrfs_free_space_ctl *ctl,
247 struct btrfs_path *path, u64 offset) 585 struct btrfs_path *path, u64 offset)
248{ 586{
249 struct btrfs_free_space_header *header; 587 struct btrfs_free_space_header *header;
250 struct extent_buffer *leaf; 588 struct extent_buffer *leaf;
251 struct page *page; 589 struct io_ctl io_ctl;
252 struct btrfs_key key; 590 struct btrfs_key key;
591 struct btrfs_free_space *e, *n;
253 struct list_head bitmaps; 592 struct list_head bitmaps;
254 u64 num_entries; 593 u64 num_entries;
255 u64 num_bitmaps; 594 u64 num_bitmaps;
256 u64 generation; 595 u64 generation;
257 pgoff_t index = 0; 596 u8 type;
258 int ret = 0; 597 int ret = 0;
259 598
260 INIT_LIST_HEAD(&bitmaps); 599 INIT_LIST_HEAD(&bitmaps);
261 600
262 /* Nothing in the space cache, goodbye */ 601 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) 602 if (!i_size_read(inode))
264 goto out; 603 return 0;
265 604
266 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 605 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
267 key.offset = offset; 606 key.offset = offset;
@@ -269,11 +608,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
269 608
270 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 609 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
271 if (ret < 0) 610 if (ret < 0)
272 goto out; 611 return 0;
273 else if (ret > 0) { 612 else if (ret > 0) {
274 btrfs_release_path(path); 613 btrfs_release_path(path);
275 ret = 0; 614 return 0;
276 goto out;
277 } 615 }
278 616
279 ret = -1; 617 ret = -1;
@@ -291,169 +629,102 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
291 " not match free space cache generation (%llu)\n", 629 " not match free space cache generation (%llu)\n",
292 (unsigned long long)BTRFS_I(inode)->generation, 630 (unsigned long long)BTRFS_I(inode)->generation,
293 (unsigned long long)generation); 631 (unsigned long long)generation);
294 goto out; 632 return 0;
295 } 633 }
296 634
297 if (!num_entries) 635 if (!num_entries)
298 goto out; 636 return 0;
299 637
638 io_ctl_init(&io_ctl, inode, root);
300 ret = readahead_cache(inode); 639 ret = readahead_cache(inode);
301 if (ret) 640 if (ret)
302 goto out; 641 goto out;
303 642
304 while (1) { 643 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
305 struct btrfs_free_space_entry *entry; 644 if (ret)
306 struct btrfs_free_space *e; 645 goto out;
307 void *addr;
308 unsigned long offset = 0;
309 int need_loop = 0;
310 646
311 if (!num_entries && !num_bitmaps) 647 ret = io_ctl_check_crc(&io_ctl, 0);
312 break; 648 if (ret)
649 goto free_cache;
650
651 ret = io_ctl_check_generation(&io_ctl, generation);
652 if (ret)
653 goto free_cache;
313 654
314 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 655 while (num_entries) {
315 if (!page) 656 e = kmem_cache_zalloc(btrfs_free_space_cachep,
657 GFP_NOFS);
658 if (!e)
316 goto free_cache; 659 goto free_cache;
317 660
318 if (!PageUptodate(page)) { 661 ret = io_ctl_read_entry(&io_ctl, e, &type);
319 btrfs_readpage(NULL, page); 662 if (ret) {
320 lock_page(page); 663 kmem_cache_free(btrfs_free_space_cachep, e);
321 if (!PageUptodate(page)) { 664 goto free_cache;
322 unlock_page(page);
323 page_cache_release(page);
324 printk(KERN_ERR "btrfs: error reading free "
325 "space cache\n");
326 goto free_cache;
327 }
328 } 665 }
329 addr = kmap(page);
330 666
331 if (index == 0) { 667 if (!e->bytes) {
332 u64 *gen; 668 kmem_cache_free(btrfs_free_space_cachep, e);
669 goto free_cache;
670 }
333 671
334 /* 672 if (type == BTRFS_FREE_SPACE_EXTENT) {
335 * We put a bogus crc in the front of the first page in 673 spin_lock(&ctl->tree_lock);
336 * case old kernels try to mount a fs with the new 674 ret = link_free_space(ctl, e);
337 * format to make sure they discard the cache. 675 spin_unlock(&ctl->tree_lock);
338 */ 676 if (ret) {
339 addr += sizeof(u64); 677 printk(KERN_ERR "Duplicate entries in "
340 offset += sizeof(u64); 678 "free space cache, dumping\n");
341 679 kmem_cache_free(btrfs_free_space_cachep, e);
342 gen = addr;
343 if (*gen != BTRFS_I(inode)->generation) {
344 printk(KERN_ERR "btrfs: space cache generation"
345 " (%llu) does not match inode (%llu)\n",
346 (unsigned long long)*gen,
347 (unsigned long long)
348 BTRFS_I(inode)->generation);
349 kunmap(page);
350 unlock_page(page);
351 page_cache_release(page);
352 goto free_cache; 680 goto free_cache;
353 } 681 }
354 addr += sizeof(u64); 682 } else {
355 offset += sizeof(u64); 683 BUG_ON(!num_bitmaps);
356 } 684 num_bitmaps--;
357 entry = addr; 685 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
358 686 if (!e->bitmap) {
359 while (1) { 687 kmem_cache_free(
360 if (!num_entries) 688 btrfs_free_space_cachep, e);
361 break;
362
363 need_loop = 1;
364 e = kmem_cache_zalloc(btrfs_free_space_cachep,
365 GFP_NOFS);
366 if (!e) {
367 kunmap(page);
368 unlock_page(page);
369 page_cache_release(page);
370 goto free_cache; 689 goto free_cache;
371 } 690 }
372 691 spin_lock(&ctl->tree_lock);
373 e->offset = le64_to_cpu(entry->offset); 692 ret = link_free_space(ctl, e);
374 e->bytes = le64_to_cpu(entry->bytes); 693 ctl->total_bitmaps++;
375 if (!e->bytes) { 694 ctl->op->recalc_thresholds(ctl);
376 kunmap(page); 695 spin_unlock(&ctl->tree_lock);
696 if (ret) {
697 printk(KERN_ERR "Duplicate entries in "
698 "free space cache, dumping\n");
377 kmem_cache_free(btrfs_free_space_cachep, e); 699 kmem_cache_free(btrfs_free_space_cachep, e);
378 unlock_page(page);
379 page_cache_release(page);
380 goto free_cache; 700 goto free_cache;
381 } 701 }
382 702 list_add_tail(&e->list, &bitmaps);
383 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
384 spin_lock(&ctl->tree_lock);
385 ret = link_free_space(ctl, e);
386 spin_unlock(&ctl->tree_lock);
387 if (ret) {
388 printk(KERN_ERR "Duplicate entries in "
389 "free space cache, dumping\n");
390 kunmap(page);
391 unlock_page(page);
392 page_cache_release(page);
393 goto free_cache;
394 }
395 } else {
396 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
397 if (!e->bitmap) {
398 kunmap(page);
399 kmem_cache_free(
400 btrfs_free_space_cachep, e);
401 unlock_page(page);
402 page_cache_release(page);
403 goto free_cache;
404 }
405 spin_lock(&ctl->tree_lock);
406 ret = link_free_space(ctl, e);
407 ctl->total_bitmaps++;
408 ctl->op->recalc_thresholds(ctl);
409 spin_unlock(&ctl->tree_lock);
410 if (ret) {
411 printk(KERN_ERR "Duplicate entries in "
412 "free space cache, dumping\n");
413 kunmap(page);
414 unlock_page(page);
415 page_cache_release(page);
416 goto free_cache;
417 }
418 list_add_tail(&e->list, &bitmaps);
419 }
420
421 num_entries--;
422 offset += sizeof(struct btrfs_free_space_entry);
423 if (offset + sizeof(struct btrfs_free_space_entry) >=
424 PAGE_CACHE_SIZE)
425 break;
426 entry++;
427 } 703 }
428 704
429 /* 705 num_entries--;
430 * We read an entry out of this page, we need to move on to the 706 }
431 * next page.
432 */
433 if (need_loop) {
434 kunmap(page);
435 goto next;
436 }
437 707
438 /* 708 io_ctl_unmap_page(&io_ctl);
439 * We add the bitmaps at the end of the entries in order that 709
440 * the bitmap entries are added to the cache. 710 /*
441 */ 711 * We add the bitmaps at the end of the entries in order that
442 e = list_entry(bitmaps.next, struct btrfs_free_space, list); 712 * the bitmap entries are added to the cache.
713 */
714 list_for_each_entry_safe(e, n, &bitmaps, list) {
443 list_del_init(&e->list); 715 list_del_init(&e->list);
444 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 716 ret = io_ctl_read_bitmap(&io_ctl, e);
445 kunmap(page); 717 if (ret)
446 num_bitmaps--; 718 goto free_cache;
447next:
448 unlock_page(page);
449 page_cache_release(page);
450 index++;
451 } 719 }
452 720
721 io_ctl_drop_pages(&io_ctl);
453 ret = 1; 722 ret = 1;
454out: 723out:
724 io_ctl_free(&io_ctl);
455 return ret; 725 return ret;
456free_cache: 726free_cache:
727 io_ctl_drop_pages(&io_ctl);
457 __btrfs_remove_free_space_cache(ctl); 728 __btrfs_remove_free_space_cache(ctl);
458 goto out; 729 goto out;
459} 730}
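
The rewritten __load_free_space_cache() above now validates the first-page crc and the stored generation, then pulls in num_entries fixed-size entries in a single pass; bitmap entries are linked into the rbtree immediately but their page-sized bitmaps are queued on a list and read afterwards, in the same order, once all entries have been consumed. A compact model of that ordering; standalone C, with printf calls standing in for io_ctl_read_entry()/io_ctl_read_bitmap():

    #include <stdio.h>

    struct entry { int is_bitmap; int id; };

    int main(void)
    {
            struct entry cache[] = { {0, 1}, {1, 2}, {0, 3}, {1, 4} };
            int deferred[4];
            unsigned i, n_deferred = 0;

            /* pass 1: consume the packed entries; extents are complete,
             * bitmap entries are linked but their data is read later */
            for (i = 0; i < sizeof(cache) / sizeof(cache[0]); i++) {
                    if (cache[i].is_bitmap) {
                            printf("link bitmap entry %d (data read later)\n", cache[i].id);
                            deferred[n_deferred++] = cache[i].id;
                    } else {
                            printf("link extent entry %d\n", cache[i].id);
                    }
            }

            /* pass 2: read the page-sized bitmaps in the order they were queued */
            for (i = 0; i < n_deferred; i++)
                    printf("read bitmap for entry %d\n", deferred[i]);

            return 0;
    }
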
@@ -465,7 +736,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
465 struct btrfs_root *root = fs_info->tree_root; 736 struct btrfs_root *root = fs_info->tree_root;
466 struct inode *inode; 737 struct inode *inode;
467 struct btrfs_path *path; 738 struct btrfs_path *path;
468 int ret; 739 int ret = 0;
469 bool matched; 740 bool matched;
470 u64 used = btrfs_block_group_used(&block_group->item); 741 u64 used = btrfs_block_group_used(&block_group->item);
471 742
@@ -497,6 +768,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
497 return 0; 768 return 0;
498 } 769 }
499 770
771 /* We may have converted the inode and made the cache invalid. */
772 spin_lock(&block_group->lock);
773 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
774 spin_unlock(&block_group->lock);
775 goto out;
776 }
777 spin_unlock(&block_group->lock);
778
500 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 779 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
501 path, block_group->key.objectid); 780 path, block_group->key.objectid);
502 btrfs_free_path(path); 781 btrfs_free_path(path);
@@ -530,6 +809,19 @@ out:
530 return ret; 809 return ret;
531} 810}
532 811
812/**
813 * __btrfs_write_out_cache - write out cached info to an inode
814 * @root - the root the inode belongs to
815 * @ctl - the free space cache we are going to write out
816 * @block_group - the block_group for this cache if it belongs to a block_group
817 * @trans - the trans handle
818 * @path - the path to use
819 * @offset - the offset for the key we'll insert
820 *
821 * This function writes out a free space cache struct to disk for quick recovery
 822 * on mount. This will return 0 if it was successful in writing the cache out,
823 * and -1 if it was not.
824 */
533int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 825int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
534 struct btrfs_free_space_ctl *ctl, 826 struct btrfs_free_space_ctl *ctl,
535 struct btrfs_block_group_cache *block_group, 827 struct btrfs_block_group_cache *block_group,
@@ -540,42 +832,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
540 struct extent_buffer *leaf; 832 struct extent_buffer *leaf;
541 struct rb_node *node; 833 struct rb_node *node;
542 struct list_head *pos, *n; 834 struct list_head *pos, *n;
543 struct page **pages;
544 struct page *page;
545 struct extent_state *cached_state = NULL; 835 struct extent_state *cached_state = NULL;
546 struct btrfs_free_cluster *cluster = NULL; 836 struct btrfs_free_cluster *cluster = NULL;
547 struct extent_io_tree *unpin = NULL; 837 struct extent_io_tree *unpin = NULL;
838 struct io_ctl io_ctl;
548 struct list_head bitmap_list; 839 struct list_head bitmap_list;
549 struct btrfs_key key; 840 struct btrfs_key key;
550 u64 start, end, len; 841 u64 start, end, len;
551 u64 bytes = 0;
552 u32 crc = ~(u32)0;
553 int index = 0, num_pages = 0;
554 int entries = 0; 842 int entries = 0;
555 int bitmaps = 0; 843 int bitmaps = 0;
556 int ret = -1; 844 int ret;
557 bool next_page = false; 845 int err = -1;
558 bool out_of_space = false;
559 846
560 INIT_LIST_HEAD(&bitmap_list); 847 INIT_LIST_HEAD(&bitmap_list);
561 848
562 node = rb_first(&ctl->free_space_offset);
563 if (!node)
564 return 0;
565
566 if (!i_size_read(inode)) 849 if (!i_size_read(inode))
567 return -1; 850 return -1;
568 851
569 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 852 io_ctl_init(&io_ctl, inode, root);
570 PAGE_CACHE_SHIFT;
571
572 filemap_write_and_wait(inode->i_mapping);
573 btrfs_wait_ordered_range(inode, inode->i_size &
574 ~(root->sectorsize - 1), (u64)-1);
575
576 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
577 if (!pages)
578 return -1;
579 853
580 /* Get the cluster for this block_group if it exists */ 854 /* Get the cluster for this block_group if it exists */
581 if (block_group && !list_empty(&block_group->cluster_list)) 855 if (block_group && !list_empty(&block_group->cluster_list))
@@ -589,30 +863,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
589 */ 863 */
590 unpin = root->fs_info->pinned_extents; 864 unpin = root->fs_info->pinned_extents;
591 865
592 /* 866 /* Lock all pages first so we can lock the extent safely. */
593 * Lock all pages first so we can lock the extent safely. 867 io_ctl_prepare_pages(&io_ctl, inode, 0);
594 *
595 * NOTE: Because we hold the ref the entire time we're going to write to
596 * the page find_get_page should never fail, so we don't do a check
597 * after find_get_page at this point. Just putting this here so people
598 * know and don't freak out.
599 */
600 while (index < num_pages) {
601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
602 if (!page) {
603 int i;
604 868
605 for (i = 0; i < num_pages; i++) {
606 unlock_page(pages[i]);
607 page_cache_release(pages[i]);
608 }
609 goto out;
610 }
611 pages[index] = page;
612 index++;
613 }
614
615 index = 0;
616 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 869 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
617 0, &cached_state, GFP_NOFS); 870 0, &cached_state, GFP_NOFS);
618 871
@@ -623,189 +876,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
623 if (block_group) 876 if (block_group)
624 start = block_group->key.objectid; 877 start = block_group->key.objectid;
625 878
626 /* Write out the extent entries */ 879 node = rb_first(&ctl->free_space_offset);
627 do { 880 if (!node && cluster) {
628 struct btrfs_free_space_entry *entry; 881 node = rb_first(&cluster->root);
629 void *addr, *orig; 882 cluster = NULL;
630 unsigned long offset = 0; 883 }
631 884
632 next_page = false; 885 /* Make sure we can fit our crcs into the first page */
886 if (io_ctl.check_crcs &&
887 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
888 WARN_ON(1);
889 goto out_nospc;
890 }
633 891
634 if (index >= num_pages) { 892 io_ctl_set_generation(&io_ctl, trans->transid);
635 out_of_space = true;
636 break;
637 }
638 893
639 page = pages[index]; 894 /* Write out the extent entries */
895 while (node) {
896 struct btrfs_free_space *e;
640 897
641 orig = addr = kmap(page); 898 e = rb_entry(node, struct btrfs_free_space, offset_index);
642 if (index == 0) { 899 entries++;
643 u64 *gen;
644 900
645 /* 901 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
646 * We're going to put in a bogus crc for this page to 902 e->bitmap);
647 * make sure that old kernels who aren't aware of this 903 if (ret)
648 * format will be sure to discard the cache. 904 goto out_nospc;
649 */
650 addr += sizeof(u64);
651 offset += sizeof(u64);
652 905
653 gen = addr; 906 if (e->bitmap) {
654 *gen = trans->transid; 907 list_add_tail(&e->list, &bitmap_list);
655 addr += sizeof(u64); 908 bitmaps++;
656 offset += sizeof(u64);
657 } 909 }
658 entry = addr; 910 node = rb_next(node);
659 911 if (!node && cluster) {
660 memset(addr, 0, PAGE_CACHE_SIZE - offset); 912 node = rb_first(&cluster->root);
661 while (node && !next_page) { 913 cluster = NULL;
662 struct btrfs_free_space *e;
663
664 e = rb_entry(node, struct btrfs_free_space, offset_index);
665 entries++;
666
667 entry->offset = cpu_to_le64(e->offset);
668 entry->bytes = cpu_to_le64(e->bytes);
669 if (e->bitmap) {
670 entry->type = BTRFS_FREE_SPACE_BITMAP;
671 list_add_tail(&e->list, &bitmap_list);
672 bitmaps++;
673 } else {
674 entry->type = BTRFS_FREE_SPACE_EXTENT;
675 }
676 node = rb_next(node);
677 if (!node && cluster) {
678 node = rb_first(&cluster->root);
679 cluster = NULL;
680 }
681 offset += sizeof(struct btrfs_free_space_entry);
682 if (offset + sizeof(struct btrfs_free_space_entry) >=
683 PAGE_CACHE_SIZE)
684 next_page = true;
685 entry++;
686 } 914 }
915 }
687 916
688 /* 917 /*
689 * We want to add any pinned extents to our free space cache 918 * We want to add any pinned extents to our free space cache
690 * so we don't leak the space 919 * so we don't leak the space
691 */ 920 */
692 while (block_group && !next_page && 921 while (block_group && (start < block_group->key.objectid +
693 (start < block_group->key.objectid + 922 block_group->key.offset)) {
694 block_group->key.offset)) { 923 ret = find_first_extent_bit(unpin, start, &start, &end,
695 ret = find_first_extent_bit(unpin, start, &start, &end, 924 EXTENT_DIRTY);
696 EXTENT_DIRTY); 925 if (ret) {
697 if (ret) { 926 ret = 0;
698 ret = 0; 927 break;
699 break;
700 }
701
702 /* This pinned extent is out of our range */
703 if (start >= block_group->key.objectid +
704 block_group->key.offset)
705 break;
706
707 len = block_group->key.objectid +
708 block_group->key.offset - start;
709 len = min(len, end + 1 - start);
710
711 entries++;
712 entry->offset = cpu_to_le64(start);
713 entry->bytes = cpu_to_le64(len);
714 entry->type = BTRFS_FREE_SPACE_EXTENT;
715
716 start = end + 1;
717 offset += sizeof(struct btrfs_free_space_entry);
718 if (offset + sizeof(struct btrfs_free_space_entry) >=
719 PAGE_CACHE_SIZE)
720 next_page = true;
721 entry++;
722 } 928 }
723 929
724 /* Generate bogus crc value */ 930 /* This pinned extent is out of our range */
725 if (index == 0) { 931 if (start >= block_group->key.objectid +
726 u32 *tmp; 932 block_group->key.offset)
727 crc = btrfs_csum_data(root, orig + sizeof(u64), crc, 933 break;
728 PAGE_CACHE_SIZE - sizeof(u64));
729 btrfs_csum_final(crc, (char *)&crc);
730 crc++;
731 tmp = orig;
732 *tmp = crc;
733 }
734 934
735 kunmap(page); 935 len = block_group->key.objectid +
936 block_group->key.offset - start;
937 len = min(len, end + 1 - start);
736 938
737 bytes += PAGE_CACHE_SIZE; 939 entries++;
940 ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
941 if (ret)
942 goto out_nospc;
738 943
739 index++; 944 start = end + 1;
740 } while (node || next_page); 945 }
741 946
742 /* Write out the bitmaps */ 947 /* Write out the bitmaps */
743 list_for_each_safe(pos, n, &bitmap_list) { 948 list_for_each_safe(pos, n, &bitmap_list) {
744 void *addr;
745 struct btrfs_free_space *entry = 949 struct btrfs_free_space *entry =
746 list_entry(pos, struct btrfs_free_space, list); 950 list_entry(pos, struct btrfs_free_space, list);
747 951
748 if (index >= num_pages) { 952 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
749 out_of_space = true; 953 if (ret)
750 break; 954 goto out_nospc;
751 }
752 page = pages[index];
753
754 addr = kmap(page);
755 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
756 kunmap(page);
757 bytes += PAGE_CACHE_SIZE;
758
759 list_del_init(&entry->list); 955 list_del_init(&entry->list);
760 index++;
761 }
762
763 if (out_of_space) {
764 btrfs_drop_pages(pages, num_pages);
765 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
766 i_size_read(inode) - 1, &cached_state,
767 GFP_NOFS);
768 ret = 0;
769 goto out;
770 } 956 }
771 957
772 /* Zero out the rest of the pages just to make sure */ 958 /* Zero out the rest of the pages just to make sure */
773 while (index < num_pages) { 959 io_ctl_zero_remaining_pages(&io_ctl);
774 void *addr;
775 960
776 page = pages[index]; 961 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
777 addr = kmap(page); 962 0, i_size_read(inode), &cached_state);
778 memset(addr, 0, PAGE_CACHE_SIZE); 963 io_ctl_drop_pages(&io_ctl);
779 kunmap(page);
780 bytes += PAGE_CACHE_SIZE;
781 index++;
782 }
783
784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
785 bytes, &cached_state);
786 btrfs_drop_pages(pages, num_pages);
787 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 964 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
788 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 965 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
789 966
790 if (ret) { 967 if (ret)
791 ret = 0;
792 goto out; 968 goto out;
793 }
794 969
795 BTRFS_I(inode)->generation = trans->transid;
796 970
797 filemap_write_and_wait(inode->i_mapping); 971 ret = filemap_write_and_wait(inode->i_mapping);
972 if (ret)
973 goto out;
798 974
799 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 975 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
800 key.offset = offset; 976 key.offset = offset;
801 key.type = 0; 977 key.type = 0;
802 978
803 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 979 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
804 if (ret < 0) { 980 if (ret < 0) {
805 ret = -1; 981 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 982 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
807 EXTENT_DIRTY | EXTENT_DELALLOC | 983 GFP_NOFS);
808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
809 goto out; 984 goto out;
810 } 985 }
811 leaf = path->nodes[0]; 986 leaf = path->nodes[0];
@@ -816,15 +991,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
816 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 991 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
817 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 992 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
818 found_key.offset != offset) { 993 found_key.offset != offset) {
819 ret = -1; 994 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
820 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 995 inode->i_size - 1,
821 EXTENT_DIRTY | EXTENT_DELALLOC | 996 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
822 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 997 NULL, GFP_NOFS);
823 GFP_NOFS);
824 btrfs_release_path(path); 998 btrfs_release_path(path);
825 goto out; 999 goto out;
826 } 1000 }
827 } 1001 }
1002
1003 BTRFS_I(inode)->generation = trans->transid;
828 header = btrfs_item_ptr(leaf, path->slots[0], 1004 header = btrfs_item_ptr(leaf, path->slots[0],
829 struct btrfs_free_space_header); 1005 struct btrfs_free_space_header);
830 btrfs_set_free_space_entries(leaf, header, entries); 1006 btrfs_set_free_space_entries(leaf, header, entries);
@@ -833,16 +1009,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 btrfs_mark_buffer_dirty(leaf); 1009 btrfs_mark_buffer_dirty(leaf);
834 btrfs_release_path(path); 1010 btrfs_release_path(path);
835 1011
836 ret = 1; 1012 err = 0;
837
838out: 1013out:
839 kfree(pages); 1014 io_ctl_free(&io_ctl);
840 if (ret != 1) { 1015 if (err) {
841 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 1016 invalidate_inode_pages2(inode->i_mapping);
842 BTRFS_I(inode)->generation = 0; 1017 BTRFS_I(inode)->generation = 0;
843 } 1018 }
844 btrfs_update_inode(trans, root, inode); 1019 btrfs_update_inode(trans, root, inode);
845 return ret; 1020 return err;
1021
1022out_nospc:
1023 list_for_each_safe(pos, n, &bitmap_list) {
1024 struct btrfs_free_space *entry =
1025 list_entry(pos, struct btrfs_free_space, list);
1026 list_del_init(&entry->list);
1027 }
1028 io_ctl_drop_pages(&io_ctl);
1029 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1030 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1031 goto out;
846} 1032}
847 1033
848int btrfs_write_out_cache(struct btrfs_root *root, 1034int btrfs_write_out_cache(struct btrfs_root *root,
@@ -869,14 +1055,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
869 1055
870 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1056 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
871 path, block_group->key.objectid); 1057 path, block_group->key.objectid);
872 if (ret < 0) { 1058 if (ret) {
873 spin_lock(&block_group->lock); 1059 spin_lock(&block_group->lock);
874 block_group->disk_cache_state = BTRFS_DC_ERROR; 1060 block_group->disk_cache_state = BTRFS_DC_ERROR;
875 spin_unlock(&block_group->lock); 1061 spin_unlock(&block_group->lock);
876 ret = 0; 1062 ret = 0;
877 1063#ifdef DEBUG
 878 printk(KERN_ERR "btrfs: failed to write free space cache " 1064 printk(KERN_ERR "btrfs: failed to write free space cache "
879 "for block group %llu\n", block_group->key.objectid); 1065 "for block group %llu\n", block_group->key.objectid);
1066#endif
880 } 1067 }
881 1068
882 iput(inode); 1069 iput(inode);
@@ -1283,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1283{ 1470{
1284 info->offset = offset_to_bitmap(ctl, offset); 1471 info->offset = offset_to_bitmap(ctl, offset);
1285 info->bytes = 0; 1472 info->bytes = 0;
1473 INIT_LIST_HEAD(&info->list);
1286 link_free_space(ctl, info); 1474 link_free_space(ctl, info);
1287 ctl->total_bitmaps++; 1475 ctl->total_bitmaps++;
1288 1476
@@ -1662,7 +1850,13 @@ again:
1662 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1850 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1663 1, 0); 1851 1, 0);
1664 if (!info) { 1852 if (!info) {
1665 WARN_ON(1); 1853 /* the tree logging code might be calling us before we
1854 * have fully loaded the free space rbtree for this
1855 * block group. So it is possible the entry won't
1856 * be in the rbtree yet at all. The caching code
1857 * will make sure not to put it in the rbtree if
1858 * the logging code has pinned it.
1859 */
1666 goto out_lock; 1860 goto out_lock;
1667 } 1861 }
1668 } 1862 }
@@ -1701,6 +1895,7 @@ again:
1701 ctl->total_bitmaps--; 1895 ctl->total_bitmaps--;
1702 } 1896 }
1703 kmem_cache_free(btrfs_free_space_cachep, info); 1897 kmem_cache_free(btrfs_free_space_cachep, info);
1898 ret = 0;
1704 goto out_lock; 1899 goto out_lock;
1705 } 1900 }
1706 1901
@@ -1708,7 +1903,8 @@ again:
1708 unlink_free_space(ctl, info); 1903 unlink_free_space(ctl, info);
1709 info->offset += bytes; 1904 info->offset += bytes;
1710 info->bytes -= bytes; 1905 info->bytes -= bytes;
1711 link_free_space(ctl, info); 1906 ret = link_free_space(ctl, info);
1907 WARN_ON(ret);
1712 goto out_lock; 1908 goto out_lock;
1713 } 1909 }
1714 1910
@@ -2124,6 +2320,7 @@ again:
2124 2320
2125 if (!found) { 2321 if (!found) {
2126 start = i; 2322 start = i;
2323 cluster->max_size = 0;
2127 found = true; 2324 found = true;
2128 } 2325 }
2129 2326
@@ -2267,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2267{ 2464{
2268 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2269 struct btrfs_free_space *entry; 2466 struct btrfs_free_space *entry;
2270 struct rb_node *node;
2271 int ret = -ENOSPC; 2467 int ret = -ENOSPC;
2468 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2272 2469
2273 if (ctl->total_bitmaps == 0) 2470 if (ctl->total_bitmaps == 0)
2274 return -ENOSPC; 2471 return -ENOSPC;
2275 2472
2276 /* 2473 /*
2277 * First check our cached list of bitmaps and see if there is an entry 2474 * The bitmap that covers offset won't be in the list unless offset
2278 * here that will work. 2475 * is just its start offset.
2279 */ 2476 */
2477 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2478 if (entry->offset != bitmap_offset) {
2479 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2480 if (entry && list_empty(&entry->list))
2481 list_add(&entry->list, bitmaps);
2482 }
2483
2280 list_for_each_entry(entry, bitmaps, list) { 2484 list_for_each_entry(entry, bitmaps, list) {
2281 if (entry->bytes < min_bytes) 2485 if (entry->bytes < min_bytes)
2282 continue; 2486 continue;
@@ -2287,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2287 } 2491 }
2288 2492
2289 /* 2493 /*
2290 * If we do have entries on our list and we are here then we didn't find 2494 * The bitmaps list has all the bitmaps that record free space
2291 * anything, so go ahead and get the next entry after the last entry in 2495 * starting after offset, so no more search is required.
2292 * this list and start the search from there.
2293 */ 2496 */
2294 if (!list_empty(bitmaps)) { 2497 return -ENOSPC;
2295 entry = list_entry(bitmaps->prev, struct btrfs_free_space,
2296 list);
2297 node = rb_next(&entry->offset_index);
2298 if (!node)
2299 return -ENOSPC;
2300 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2301 goto search;
2302 }
2303
2304 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
2305 if (!entry)
2306 return -ENOSPC;
2307
2308search:
2309 node = &entry->offset_index;
2310 do {
2311 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2312 node = rb_next(&entry->offset_index);
2313 if (!entry->bitmap)
2314 continue;
2315 if (entry->bytes < min_bytes)
2316 continue;
2317 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2318 bytes, min_bytes);
2319 } while (ret && node);
2320
2321 return ret;
2322} 2498}
2323 2499
2324/* 2500/*
@@ -2336,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2336 u64 offset, u64 bytes, u64 empty_size) 2512 u64 offset, u64 bytes, u64 empty_size)
2337{ 2513{
2338 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2514 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2339 struct list_head bitmaps;
2340 struct btrfs_free_space *entry, *tmp; 2515 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps);
2341 u64 min_bytes; 2517 u64 min_bytes;
2342 int ret; 2518 int ret;
2343 2519
@@ -2376,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2376 goto out; 2552 goto out;
2377 } 2553 }
2378 2554
2379 INIT_LIST_HEAD(&bitmaps);
2380 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2381 bytes, min_bytes); 2556 bytes, min_bytes);
2382 if (ret) 2557 if (ret)
@@ -2472,9 +2647,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2472 spin_unlock(&ctl->tree_lock); 2647 spin_unlock(&ctl->tree_lock);
2473 2648
2474 if (bytes >= minlen) { 2649 if (bytes >= minlen) {
2475 int update_ret; 2650 struct btrfs_space_info *space_info;
2476 update_ret = btrfs_update_reserved_bytes(block_group, 2651 int update = 0;
2477 bytes, 1, 1); 2652
2653 space_info = block_group->space_info;
2654 spin_lock(&space_info->lock);
2655 spin_lock(&block_group->lock);
2656 if (!block_group->ro) {
2657 block_group->reserved += bytes;
2658 space_info->bytes_reserved += bytes;
2659 update = 1;
2660 }
2661 spin_unlock(&block_group->lock);
2662 spin_unlock(&space_info->lock);
2478 2663
2479 ret = btrfs_error_discard_extent(fs_info->extent_root, 2664 ret = btrfs_error_discard_extent(fs_info->extent_root,
2480 start, 2665 start,
@@ -2482,9 +2667,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2482 &actually_trimmed); 2667 &actually_trimmed);
2483 2668
2484 btrfs_add_free_space(block_group, start, bytes); 2669 btrfs_add_free_space(block_group, start, bytes);
2485 if (!update_ret) 2670 if (update) {
2486 btrfs_update_reserved_bytes(block_group, 2671 spin_lock(&space_info->lock);
2487 bytes, 0, 1); 2672 spin_lock(&block_group->lock);
2673 if (block_group->ro)
2674 space_info->bytes_readonly += bytes;
2675 block_group->reserved -= bytes;
2676 space_info->bytes_reserved -= bytes;
2677 spin_unlock(&space_info->lock);
2678 spin_unlock(&block_group->lock);
2679 }
2488 2680
2489 if (ret) 2681 if (ret)
2490 break; 2682 break;
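
The trim hunks above drop btrfs_update_reserved_bytes() in favour of open-coded accounting: the range about to be discarded is added to block_group->reserved and space_info->bytes_reserved with both locks held (space_info first, then the block group), and is given back after the discard, crediting bytes_readonly instead if the group went read-only in the meantime. A hedged userspace model of the reserve side, with pthread mutexes standing in for the spinlocks; nothing here is btrfs code:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    struct space_info {
            pthread_mutex_t lock;
            uint64_t bytes_reserved;
            uint64_t bytes_readonly;
    };

    struct block_group {
            pthread_mutex_t lock;
            int ro;
            uint64_t reserved;
            struct space_info *si;
    };

    /* Reserve the range being trimmed so the allocator cannot hand it out
     * while the discard is in flight; returns 1 if the caller must undo it. */
    static int trim_reserve(struct block_group *bg, uint64_t bytes)
    {
            int update = 0;

            pthread_mutex_lock(&bg->si->lock);      /* space_info first...     */
            pthread_mutex_lock(&bg->lock);          /* ...then the block group */
            if (!bg->ro) {
                    bg->reserved += bytes;
                    bg->si->bytes_reserved += bytes;
                    update = 1;
            }
            pthread_mutex_unlock(&bg->lock);
            pthread_mutex_unlock(&bg->si->lock);
            return update;
    }

    int main(void)
    {
            struct space_info si = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
            struct block_group bg = { PTHREAD_MUTEX_INITIALIZER, 0, 0, &si };

            printf("update=%d reserved=%llu\n", trim_reserve(&bg, 4096),
                   (unsigned long long)si.bytes_reserved);
            return 0;
    }
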
@@ -2643,9 +2835,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
2643 return 0; 2835 return 0;
2644 2836
2645 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 2837 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2646 if (ret < 0) 2838 if (ret) {
2839 btrfs_delalloc_release_metadata(inode, inode->i_size);
2840#ifdef DEBUG
2647 printk(KERN_ERR "btrfs: failed to write free ino cache " 2841 printk(KERN_ERR "btrfs: failed to write free ino cache "
2648 "for root %llu\n", root->root_key.objectid); 2842 "for root %llu\n", root->root_key.objectid);
2843#endif
2844 }
2649 2845
2650 iput(inode); 2846 iput(inode);
2651 return ret; 2847 return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa87..f8962a957d6 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
399 struct btrfs_path *path; 399 struct btrfs_path *path;
400 struct inode *inode; 400 struct inode *inode;
401 struct btrfs_block_rsv *rsv;
402 u64 num_bytes;
401 u64 alloc_hint = 0; 403 u64 alloc_hint = 0;
402 int ret; 404 int ret;
403 int prealloc; 405 int prealloc;
@@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
421 if (!path) 423 if (!path)
422 return -ENOMEM; 424 return -ENOMEM;
423 425
426 rsv = trans->block_rsv;
427 trans->block_rsv = &root->fs_info->trans_block_rsv;
428
429 num_bytes = trans->bytes_reserved;
430 /*
 431 * 1 item for inode item insertion if needed
432 * 3 items for inode item update (in the worst case)
433 * 1 item for free space object
434 * 3 items for pre-allocation
435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
438 trans->bytes_reserved);
439 if (ret)
440 goto out;
424again: 441again:
425 inode = lookup_free_ino_inode(root, path); 442 inode = lookup_free_ino_inode(root, path);
426 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
427 ret = PTR_ERR(inode); 444 ret = PTR_ERR(inode);
428 goto out; 445 goto out_release;
429 } 446 }
430 447
431 if (IS_ERR(inode)) { 448 if (IS_ERR(inode)) {
@@ -434,7 +451,7 @@ again:
434 451
435 ret = create_free_ino_inode(root, trans, path); 452 ret = create_free_ino_inode(root, trans, path);
436 if (ret) 453 if (ret)
437 goto out; 454 goto out_release;
438 goto again; 455 goto again;
439 } 456 }
440 457
@@ -465,21 +482,26 @@ again:
465 /* Just to make sure we have enough space */ 482 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE; 483 prealloc += 8 * PAGE_CACHE_SIZE;
467 484
468 ret = btrfs_check_data_free_space(inode, prealloc); 485 ret = btrfs_delalloc_reserve_space(inode, prealloc);
469 if (ret) 486 if (ret)
470 goto out_put; 487 goto out_put;
471 488
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 489 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint); 490 prealloc, prealloc, &alloc_hint);
474 if (ret) 491 if (ret) {
492 btrfs_delalloc_release_space(inode, prealloc);
475 goto out_put; 493 goto out_put;
494 }
476 btrfs_free_reserved_data_space(inode, prealloc); 495 btrfs_free_reserved_data_space(inode, prealloc);
477 496
497 ret = btrfs_write_out_ino_cache(root, trans, path);
478out_put: 498out_put:
479 iput(inode); 499 iput(inode);
500out_release:
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
480out: 502out:
481 if (ret == 0) 503 trans->block_rsv = rsv;
482 ret = btrfs_write_out_ino_cache(root, trans, path); 504 trans->bytes_reserved = num_bytes;
483 505
484 btrfs_free_path(path); 506 btrfs_free_path(path);
485 return ret; 507 return ret;
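
The 8 passed to btrfs_calc_trans_metadata_size() in btrfs_save_ino_cache above is simply the sum of the worst cases listed in the comment. A trivial sanity check of that arithmetic (names are illustrative, not kernel API):

#include <assert.h>

enum {
        INODE_ITEM_INSERT = 1,  /* inode item insertion, if needed */
        INODE_ITEM_UPDATE = 3,  /* inode item update, worst case */
        FREE_SPACE_OBJECT = 1,  /* free space object */
        PREALLOC_ITEMS    = 3,  /* pre-allocation */
};

int main(void)
{
        assert(INODE_ITEM_INSERT + INODE_ITEM_UPDATE +
               FREE_SPACE_OBJECT + PREALLOC_ITEMS == 8);
        return 0;
}
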
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 75686a61bd4..2c984f7d4c2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,10 +45,10 @@
45#include "btrfs_inode.h" 45#include "btrfs_inode.h"
46#include "ioctl.h" 46#include "ioctl.h"
47#include "print-tree.h" 47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h" 48#include "ordered-data.h"
50#include "xattr.h" 49#include "xattr.h"
51#include "tree-log.h" 50#include "tree-log.h"
51#include "volumes.h"
52#include "compression.h" 52#include "compression.h"
53#include "locking.h" 53#include "locking.h"
54#include "free-space-cache.h" 54#include "free-space-cache.h"
@@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode,
93 struct page *locked_page, 93 struct page *locked_page,
94 u64 start, u64 end, int *page_started, 94 u64 start, u64 end, int *page_started,
95 unsigned long *nr_written, int unlock); 95 unsigned long *nr_written, int unlock);
96static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root, struct inode *inode);
96 98
97static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 99static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
98 struct inode *inode, struct inode *dir, 100 struct inode *inode, struct inode *dir,
@@ -393,7 +395,10 @@ again:
393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 395 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
394 WARN_ON(pages); 396 WARN_ON(pages);
395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 397 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
396 BUG_ON(!pages); 398 if (!pages) {
399 /* just bail out to the uncompressed code */
400 goto cont;
401 }
397 402
398 if (BTRFS_I(inode)->force_compress) 403 if (BTRFS_I(inode)->force_compress)
399 compress_type = BTRFS_I(inode)->force_compress; 404 compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +429,7 @@ again:
424 will_compress = 1; 429 will_compress = 1;
425 } 430 }
426 } 431 }
432cont:
427 if (start == 0) { 433 if (start == 0) {
428 trans = btrfs_join_transaction(root); 434 trans = btrfs_join_transaction(root);
429 BUG_ON(IS_ERR(trans)); 435 BUG_ON(IS_ERR(trans));
@@ -820,7 +826,7 @@ static noinline int cow_file_range(struct inode *inode,
820 } 826 }
821 827
822 BUG_ON(disk_num_bytes > 828 BUG_ON(disk_num_bytes >
823 btrfs_super_total_bytes(&root->fs_info->super_copy)); 829 btrfs_super_total_bytes(root->fs_info->super_copy));
824 830
825 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 831 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
826 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 832 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1737,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1737 trans = btrfs_join_transaction(root); 1743 trans = btrfs_join_transaction(root);
1738 BUG_ON(IS_ERR(trans)); 1744 BUG_ON(IS_ERR(trans));
1739 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1745 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1740 ret = btrfs_update_inode(trans, root, inode); 1746 ret = btrfs_update_inode_fallback(trans, root, inode);
1741 BUG_ON(ret); 1747 BUG_ON(ret);
1742 } 1748 }
1743 goto out; 1749 goto out;
@@ -1787,17 +1793,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1787 1793
1788 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1794 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1789 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1795 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1790 ret = btrfs_update_inode(trans, root, inode); 1796 ret = btrfs_update_inode_fallback(trans, root, inode);
1791 BUG_ON(ret); 1797 BUG_ON(ret);
1792 } 1798 }
1793 ret = 0; 1799 ret = 0;
1794out: 1800out:
1795 if (nolock) { 1801 if (root != root->fs_info->tree_root)
1796 if (trans)
1797 btrfs_end_transaction_nolock(trans, root);
1798 } else {
1799 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1802 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1800 if (trans) 1803 if (trans) {
1804 if (nolock)
1805 btrfs_end_transaction_nolock(trans, root);
1806 else
1801 btrfs_end_transaction(trans, root); 1807 btrfs_end_transaction(trans, root);
1802 } 1808 }
1803 1809
@@ -1819,153 +1825,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1819} 1825}
1820 1826
1821/* 1827/*
1822 * When IO fails, either with EIO or csum verification fails, we
1823 * try other mirrors that might have a good copy of the data. This
1824 * io_failure_record is used to record state as we go through all the
1825 * mirrors. If another mirror has good data, the page is set up to date
1826 * and things continue. If a good mirror can't be found, the original
1827 * bio end_io callback is called to indicate things have failed.
1828 */
1829struct io_failure_record {
1830 struct page *page;
1831 u64 start;
1832 u64 len;
1833 u64 logical;
1834 unsigned long bio_flags;
1835 int last_mirror;
1836};
1837
1838static int btrfs_io_failed_hook(struct bio *failed_bio,
1839 struct page *page, u64 start, u64 end,
1840 struct extent_state *state)
1841{
1842 struct io_failure_record *failrec = NULL;
1843 u64 private;
1844 struct extent_map *em;
1845 struct inode *inode = page->mapping->host;
1846 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1847 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1848 struct bio *bio;
1849 int num_copies;
1850 int ret;
1851 int rw;
1852 u64 logical;
1853
1854 ret = get_state_private(failure_tree, start, &private);
1855 if (ret) {
1856 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1857 if (!failrec)
1858 return -ENOMEM;
1859 failrec->start = start;
1860 failrec->len = end - start + 1;
1861 failrec->last_mirror = 0;
1862 failrec->bio_flags = 0;
1863
1864 read_lock(&em_tree->lock);
1865 em = lookup_extent_mapping(em_tree, start, failrec->len);
1866 if (em->start > start || em->start + em->len < start) {
1867 free_extent_map(em);
1868 em = NULL;
1869 }
1870 read_unlock(&em_tree->lock);
1871
1872 if (IS_ERR_OR_NULL(em)) {
1873 kfree(failrec);
1874 return -EIO;
1875 }
1876 logical = start - em->start;
1877 logical = em->block_start + logical;
1878 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1879 logical = em->block_start;
1880 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1881 extent_set_compress_type(&failrec->bio_flags,
1882 em->compress_type);
1883 }
1884 failrec->logical = logical;
1885 free_extent_map(em);
1886 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1887 EXTENT_DIRTY, GFP_NOFS);
1888 set_state_private(failure_tree, start,
1889 (u64)(unsigned long)failrec);
1890 } else {
1891 failrec = (struct io_failure_record *)(unsigned long)private;
1892 }
1893 num_copies = btrfs_num_copies(
1894 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1895 failrec->logical, failrec->len);
1896 failrec->last_mirror++;
1897 if (!state) {
1898 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1899 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1900 failrec->start,
1901 EXTENT_LOCKED);
1902 if (state && state->start != failrec->start)
1903 state = NULL;
1904 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1905 }
1906 if (!state || failrec->last_mirror > num_copies) {
1907 set_state_private(failure_tree, failrec->start, 0);
1908 clear_extent_bits(failure_tree, failrec->start,
1909 failrec->start + failrec->len - 1,
1910 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1911 kfree(failrec);
1912 return -EIO;
1913 }
1914 bio = bio_alloc(GFP_NOFS, 1);
1915 bio->bi_private = state;
1916 bio->bi_end_io = failed_bio->bi_end_io;
1917 bio->bi_sector = failrec->logical >> 9;
1918 bio->bi_bdev = failed_bio->bi_bdev;
1919 bio->bi_size = 0;
1920
1921 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1922 if (failed_bio->bi_rw & REQ_WRITE)
1923 rw = WRITE;
1924 else
1925 rw = READ;
1926
1927 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1928 failrec->last_mirror,
1929 failrec->bio_flags, 0);
1930 return ret;
1931}
1932
1933/*
1934 * each time an IO finishes, we do a fast check in the IO failure tree
1935 * to see if we need to process or clean up an io_failure_record
1936 */
1937static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1938{
1939 u64 private;
1940 u64 private_failure;
1941 struct io_failure_record *failure;
1942 int ret;
1943
1944 private = 0;
1945 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1946 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1947 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1948 start, &private_failure);
1949 if (ret == 0) {
1950 failure = (struct io_failure_record *)(unsigned long)
1951 private_failure;
1952 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1953 failure->start, 0);
1954 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1955 failure->start,
1956 failure->start + failure->len - 1,
1957 EXTENT_DIRTY | EXTENT_LOCKED,
1958 GFP_NOFS);
1959 kfree(failure);
1960 }
1961 }
1962 return 0;
1963}
1964
1965/*
1966 * when reads are done, we need to check csums to verify the data is correct 1828 * when reads are done, we need to check csums to verify the data is correct
1967 * if there's a match, we allow the bio to finish. If not, we go through 1829 * if there's a match, we allow the bio to finish. If not, the code in
1968 * the io_failure_record routines to find good copies 1830 * extent_io.c will try to find good copies for us.
1969 */ 1831 */
1970static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 1832static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1971 struct extent_state *state) 1833 struct extent_state *state)
@@ -2011,10 +1873,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2011 1873
2012 kunmap_atomic(kaddr, KM_USER0); 1874 kunmap_atomic(kaddr, KM_USER0);
2013good: 1875good:
2014 /* if the io failure tree for this inode is non-empty,
2015 * check to see if we've recovered from a failed IO
2016 */
2017 btrfs_clean_io_failures(inode, start);
2018 return 0; 1876 return 0;
2019 1877
2020zeroit: 1878zeroit:
@@ -2079,89 +1937,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2079 up_read(&root->fs_info->cleanup_work_sem); 1937 up_read(&root->fs_info->cleanup_work_sem);
2080} 1938}
2081 1939
2082/*
2083 * calculate extra metadata reservation when snapshotting a subvolume
2084 * contains orphan files.
2085 */
2086void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2087 struct btrfs_pending_snapshot *pending,
2088 u64 *bytes_to_reserve)
2089{
2090 struct btrfs_root *root;
2091 struct btrfs_block_rsv *block_rsv;
2092 u64 num_bytes;
2093 int index;
2094
2095 root = pending->root;
2096 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2097 return;
2098
2099 block_rsv = root->orphan_block_rsv;
2100
2101 /* orphan block reservation for the snapshot */
2102 num_bytes = block_rsv->size;
2103
2104 /*
2105 * after the snapshot is created, COWing tree blocks may use more
2106 * space than it frees. So we should make sure there is enough
2107 * reserved space.
2108 */
2109 index = trans->transid & 0x1;
2110 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2111 num_bytes += block_rsv->size -
2112 (block_rsv->reserved + block_rsv->freed[index]);
2113 }
2114
2115 *bytes_to_reserve += num_bytes;
2116}
2117
2118void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2119 struct btrfs_pending_snapshot *pending)
2120{
2121 struct btrfs_root *root = pending->root;
2122 struct btrfs_root *snap = pending->snap;
2123 struct btrfs_block_rsv *block_rsv;
2124 u64 num_bytes;
2125 int index;
2126 int ret;
2127
2128 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2129 return;
2130
2131 /* refill source subvolume's orphan block reservation */
2132 block_rsv = root->orphan_block_rsv;
2133 index = trans->transid & 0x1;
2134 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2135 num_bytes = block_rsv->size -
2136 (block_rsv->reserved + block_rsv->freed[index]);
2137 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2138 root->orphan_block_rsv,
2139 num_bytes);
2140 BUG_ON(ret);
2141 }
2142
2143 /* setup orphan block reservation for the snapshot */
2144 block_rsv = btrfs_alloc_block_rsv(snap);
2145 BUG_ON(!block_rsv);
2146
2147 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2148 snap->orphan_block_rsv = block_rsv;
2149
2150 num_bytes = root->orphan_block_rsv->size;
2151 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2152 block_rsv, num_bytes);
2153 BUG_ON(ret);
2154
2155#if 0
2156 /* insert orphan item for the snapshot */
2157 WARN_ON(!root->orphan_item_inserted);
2158 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2159 snap->root_key.objectid);
2160 BUG_ON(ret);
2161 snap->orphan_item_inserted = 1;
2162#endif
2163}
2164
2165enum btrfs_orphan_cleanup_state { 1940enum btrfs_orphan_cleanup_state {
2166 ORPHAN_CLEANUP_STARTED = 1, 1941 ORPHAN_CLEANUP_STARTED = 1,
2167 ORPHAN_CLEANUP_DONE = 2, 1942 ORPHAN_CLEANUP_DONE = 2,
@@ -2247,9 +2022,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2247 } 2022 }
2248 spin_unlock(&root->orphan_lock); 2023 spin_unlock(&root->orphan_lock);
2249 2024
2250 if (block_rsv)
2251 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2252
2253 /* grab metadata reservation from transaction handle */ 2025 /* grab metadata reservation from transaction handle */
2254 if (reserve) { 2026 if (reserve) {
2255 ret = btrfs_orphan_reserve_metadata(trans, inode); 2027 ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2316,6 +2088,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2316 struct btrfs_key key, found_key; 2088 struct btrfs_key key, found_key;
2317 struct btrfs_trans_handle *trans; 2089 struct btrfs_trans_handle *trans;
2318 struct inode *inode; 2090 struct inode *inode;
2091 u64 last_objectid = 0;
2319 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2092 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2320 2093
2321 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2094 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2367,41 +2140,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2367 * crossing root thing. we store the inode number in the 2140 * crossing root thing. we store the inode number in the
2368 * offset of the orphan item. 2141 * offset of the orphan item.
2369 */ 2142 */
2143
2144 if (found_key.offset == last_objectid) {
2145 printk(KERN_ERR "btrfs: Error removing orphan entry, "
2146 "stopping orphan cleanup\n");
2147 ret = -EINVAL;
2148 goto out;
2149 }
2150
2151 last_objectid = found_key.offset;
2152
2370 found_key.objectid = found_key.offset; 2153 found_key.objectid = found_key.offset;
2371 found_key.type = BTRFS_INODE_ITEM_KEY; 2154 found_key.type = BTRFS_INODE_ITEM_KEY;
2372 found_key.offset = 0; 2155 found_key.offset = 0;
2373 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2156 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2374 if (IS_ERR(inode)) { 2157 ret = PTR_RET(inode);
2375 ret = PTR_ERR(inode); 2158 if (ret && ret != -ESTALE)
2376 goto out; 2159 goto out;
2377 }
2378 2160
2379 /* 2161 /*
2380 * add this inode to the orphan list so btrfs_orphan_del does 2162 * Inode is already gone but the orphan item is still there,
2381 * the proper thing when we hit it 2163 * kill the orphan item.
2382 */ 2164 */
2383 spin_lock(&root->orphan_lock); 2165 if (ret == -ESTALE) {
2384 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2166 trans = btrfs_start_transaction(root, 1);
2385 spin_unlock(&root->orphan_lock);
2386
2387 /*
2388 * if this is a bad inode, means we actually succeeded in
2389 * removing the inode, but not the orphan record, which means
2390 * we need to manually delete the orphan since iput will just
2391 * do a destroy_inode
2392 */
2393 if (is_bad_inode(inode)) {
2394 trans = btrfs_start_transaction(root, 0);
2395 if (IS_ERR(trans)) { 2167 if (IS_ERR(trans)) {
2396 ret = PTR_ERR(trans); 2168 ret = PTR_ERR(trans);
2397 goto out; 2169 goto out;
2398 } 2170 }
2399 btrfs_orphan_del(trans, inode); 2171 ret = btrfs_del_orphan_item(trans, root,
2172 found_key.objectid);
2173 BUG_ON(ret);
2400 btrfs_end_transaction(trans, root); 2174 btrfs_end_transaction(trans, root);
2401 iput(inode);
2402 continue; 2175 continue;
2403 } 2176 }
2404 2177
2178 /*
2179 * add this inode to the orphan list so btrfs_orphan_del does
2180 * the proper thing when we hit it
2181 */
2182 spin_lock(&root->orphan_lock);
2183 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2184 spin_unlock(&root->orphan_lock);
2185
 2405		/* if we have links, this was a truncate, let's do that */	 2186		/* if we have links, this was a truncate, let's do that */
2406 if (inode->i_nlink) { 2187 if (inode->i_nlink) {
2407 if (!S_ISREG(inode->i_mode)) { 2188 if (!S_ISREG(inode->i_mode)) {
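
The last_objectid check added above is a no-progress guard: if btrfs_del_orphan_item() ever fails to remove an entry, the next tree search returns the same key, and the old loop would spin forever; now the cleanup bails out with -EINVAL instead. A stripped-down, self-contained sketch of the same guard (scan_next() and handle_orphan() are made-up stand-ins for the tree search and the per-orphan work):

#include <errno.h>
#include <stdint.h>

static uint64_t orphans[] = { 257, 260, 260, 0 };  /* 260 repeats: removal failed */
static unsigned pos;

static uint64_t scan_next(void) { return orphans[pos++]; }
static int handle_orphan(uint64_t objectid) { (void)objectid; return 0; }

static int cleanup_orphans(void)
{
        uint64_t last_objectid = 0;
        uint64_t objectid;

        while ((objectid = scan_next()) != 0) {
                /* same key as last round: we made no progress, stop */
                if (objectid == last_objectid)
                        return -EINVAL;
                last_objectid = objectid;

                if (handle_orphan(objectid))
                        break;
        }
        return 0;
}
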
@@ -2420,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2420 if (ret) 2201 if (ret)
2421 goto out; 2202 goto out;
2422 } 2203 }
2204 /* release the path since we're done with it */
2205 btrfs_release_path(path);
2206
2423 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2207 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2424 2208
2425 if (root->orphan_block_rsv) 2209 if (root->orphan_block_rsv)
@@ -2647,7 +2431,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2647/* 2431/*
2648 * copy everything in the in-memory inode into the btree. 2432 * copy everything in the in-memory inode into the btree.
2649 */ 2433 */
2650noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2434static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
2651 struct btrfs_root *root, struct inode *inode) 2435 struct btrfs_root *root, struct inode *inode)
2652{ 2436{
2653 struct btrfs_inode_item *inode_item; 2437 struct btrfs_inode_item *inode_item;
@@ -2655,21 +2439,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2655 struct extent_buffer *leaf; 2439 struct extent_buffer *leaf;
2656 int ret; 2440 int ret;
2657 2441
2658 /*
2659 * If the inode is a free space inode, we can deadlock during commit
2660 * if we put it into the delayed code.
2661 *
2662 * The data relocation inode should also be directly updated
2663 * without delay
2664 */
2665 if (!btrfs_is_free_space_inode(root, inode)
2666 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2667 ret = btrfs_delayed_update_inode(trans, root, inode);
2668 if (!ret)
2669 btrfs_set_inode_last_trans(trans, inode);
2670 return ret;
2671 }
2672
2673 path = btrfs_alloc_path(); 2442 path = btrfs_alloc_path();
2674 if (!path) 2443 if (!path)
2675 return -ENOMEM; 2444 return -ENOMEM;
@@ -2698,6 +2467,43 @@ failed:
2698} 2467}
2699 2468
2700/* 2469/*
2470 * copy everything in the in-memory inode into the btree.
2471 */
2472noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2473 struct btrfs_root *root, struct inode *inode)
2474{
2475 int ret;
2476
2477 /*
2478 * If the inode is a free space inode, we can deadlock during commit
2479 * if we put it into the delayed code.
2480 *
2481 * The data relocation inode should also be directly updated
2482 * without delay
2483 */
2484 if (!btrfs_is_free_space_inode(root, inode)
2485 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2486 ret = btrfs_delayed_update_inode(trans, root, inode);
2487 if (!ret)
2488 btrfs_set_inode_last_trans(trans, inode);
2489 return ret;
2490 }
2491
2492 return btrfs_update_inode_item(trans, root, inode);
2493}
2494
2495static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
2496 struct btrfs_root *root, struct inode *inode)
2497{
2498 int ret;
2499
2500 ret = btrfs_update_inode(trans, root, inode);
2501 if (ret == -ENOSPC)
2502 return btrfs_update_inode_item(trans, root, inode);
2503 return ret;
2504}
2505
2506/*
2701 * unlink helper that gets used here in inode.c and in the tree logging 2507 * unlink helper that gets used here in inode.c and in the tree logging
 2702 * recovery code. It removes a link in a directory with a given name, and	 2508 * recovery code. It removes a link in a directory with a given name, and
2703 * also drops the back refs in the inode to the directory 2509 * also drops the back refs in the inode to the directory
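
btrfs_update_inode_fallback() above retries with the direct item update only when the delayed path reports -ENOSPC; the ordered-IO completion paths are switched to it presumably because they already carry a delalloc reservation the plain item update can consume, while the delayed-item path needs its own. The shape of the wrapper, reduced to its essence (stub functions, not kernel code):

#include <errno.h>

static int update_delayed(void) { return -ENOSPC; }  /* fast path, may lack space */
static int update_item(void)    { return 0; }        /* direct, already reserved */

static int update_with_fallback(void)
{
        int ret = update_delayed();

        if (ret == -ENOSPC)       /* only ENOSPC triggers the fallback */
                ret = update_item();
        return ret;
}
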
@@ -2835,7 +2641,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2835 u64 ino = btrfs_ino(inode); 2641 u64 ino = btrfs_ino(inode);
2836 u64 dir_ino = btrfs_ino(dir); 2642 u64 dir_ino = btrfs_ino(dir);
2837 2643
2838 trans = btrfs_start_transaction(root, 10); 2644 /*
2645 * 1 for the possible orphan item
2646 * 1 for the dir item
2647 * 1 for the dir index
2648 * 1 for the inode ref
2649 * 1 for the inode ref in the tree log
2650 * 2 for the dir entries in the log
2651 * 1 for the inode
2652 */
2653 trans = btrfs_start_transaction(root, 8);
2839 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2654 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2840 return trans; 2655 return trans;
2841 2656
@@ -2858,7 +2673,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2858 return ERR_PTR(-ENOMEM); 2673 return ERR_PTR(-ENOMEM);
2859 } 2674 }
2860 2675
2861 trans = btrfs_start_transaction(root, 0); 2676 /* 1 for the orphan item */
2677 trans = btrfs_start_transaction(root, 1);
2862 if (IS_ERR(trans)) { 2678 if (IS_ERR(trans)) {
2863 btrfs_free_path(path); 2679 btrfs_free_path(path);
2864 root->fs_info->enospc_unlink = 0; 2680 root->fs_info->enospc_unlink = 0;
@@ -2963,6 +2779,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2963 err = 0; 2779 err = 0;
2964out: 2780out:
2965 btrfs_free_path(path); 2781 btrfs_free_path(path);
2782 /* Migrate the orphan reservation over */
2783 if (!err)
2784 err = btrfs_block_rsv_migrate(trans->block_rsv,
2785 &root->fs_info->global_block_rsv,
2786 trans->bytes_reserved);
2787
2966 if (err) { 2788 if (err) {
2967 btrfs_end_transaction(trans, root); 2789 btrfs_end_transaction(trans, root);
2968 root->fs_info->enospc_unlink = 0; 2790 root->fs_info->enospc_unlink = 0;
@@ -2977,6 +2799,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2977 struct btrfs_root *root) 2799 struct btrfs_root *root)
2978{ 2800{
2979 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2801 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2802 btrfs_block_rsv_release(root, trans->block_rsv,
2803 trans->bytes_reserved);
2804 trans->block_rsv = &root->fs_info->trans_block_rsv;
2980 BUG_ON(!root->fs_info->enospc_unlink); 2805 BUG_ON(!root->fs_info->enospc_unlink);
2981 root->fs_info->enospc_unlink = 0; 2806 root->fs_info->enospc_unlink = 0;
2982 } 2807 }
@@ -3368,6 +3193,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3368 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3193 pgoff_t index = from >> PAGE_CACHE_SHIFT;
3369 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3194 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3370 struct page *page; 3195 struct page *page;
3196 gfp_t mask = btrfs_alloc_write_mask(mapping);
3371 int ret = 0; 3197 int ret = 0;
3372 u64 page_start; 3198 u64 page_start;
3373 u64 page_end; 3199 u64 page_end;
@@ -3380,7 +3206,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3380 3206
3381 ret = -ENOMEM; 3207 ret = -ENOMEM;
3382again: 3208again:
3383 page = find_or_create_page(mapping, index, GFP_NOFS); 3209 page = find_or_create_page(mapping, index, mask);
3384 if (!page) { 3210 if (!page) {
3385 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3211 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3386 goto out; 3212 goto out;
@@ -3613,6 +3439,8 @@ void btrfs_evict_inode(struct inode *inode)
3613{ 3439{
3614 struct btrfs_trans_handle *trans; 3440 struct btrfs_trans_handle *trans;
3615 struct btrfs_root *root = BTRFS_I(inode)->root; 3441 struct btrfs_root *root = BTRFS_I(inode)->root;
3442 struct btrfs_block_rsv *rsv, *global_rsv;
3443 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3616 unsigned long nr; 3444 unsigned long nr;
3617 int ret; 3445 int ret;
3618 3446
@@ -3640,22 +3468,55 @@ void btrfs_evict_inode(struct inode *inode)
3640 goto no_delete; 3468 goto no_delete;
3641 } 3469 }
3642 3470
3471 rsv = btrfs_alloc_block_rsv(root);
3472 if (!rsv) {
3473 btrfs_orphan_del(NULL, inode);
3474 goto no_delete;
3475 }
3476 rsv->size = min_size;
3477 global_rsv = &root->fs_info->global_block_rsv;
3478
3643 btrfs_i_size_write(inode, 0); 3479 btrfs_i_size_write(inode, 0);
3644 3480
3481 /*
3482 * This is a bit simpler than btrfs_truncate since
3483 *
3484 * 1) We've already reserved our space for our orphan item in the
3485 * unlink.
3486 * 2) We're going to delete the inode item, so we don't need to update
3487 * it at all.
3488 *
3489 * So we just need to reserve some slack space in case we add bytes when
3490 * doing the truncate.
3491 */
3645 while (1) { 3492 while (1) {
3646 trans = btrfs_join_transaction(root); 3493 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
3647 BUG_ON(IS_ERR(trans)); 3494
3648 trans->block_rsv = root->orphan_block_rsv; 3495 /*
3496 * Try and steal from the global reserve since we will
3497 * likely not use this space anyway, we want to try as
3498 * hard as possible to get this to work.
3499 */
3500 if (ret)
3501 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3649 3502
3650 ret = btrfs_block_rsv_check(trans, root,
3651 root->orphan_block_rsv, 0, 5);
3652 if (ret) { 3503 if (ret) {
3653 BUG_ON(ret != -EAGAIN); 3504 printk(KERN_WARNING "Could not get space for a "
3654 ret = btrfs_commit_transaction(trans, root); 3505 "delete, will truncate on mount %d\n", ret);
3655 BUG_ON(ret); 3506 btrfs_orphan_del(NULL, inode);
3656 continue; 3507 btrfs_free_block_rsv(root, rsv);
3508 goto no_delete;
3509 }
3510
3511 trans = btrfs_start_transaction(root, 0);
3512 if (IS_ERR(trans)) {
3513 btrfs_orphan_del(NULL, inode);
3514 btrfs_free_block_rsv(root, rsv);
3515 goto no_delete;
3657 } 3516 }
3658 3517
3518 trans->block_rsv = rsv;
3519
3659 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3520 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3660 if (ret != -EAGAIN) 3521 if (ret != -EAGAIN)
3661 break; 3522 break;
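
The eviction loop above now reserves its own slack per pass: refill the local rsv to min_size without flushing, fall back to migrating the bytes out of the global reserve, and only if both fail give up and leave the orphan item for the next mount to finish the truncate. A condensed userspace sketch of that decision (illustrative names, no real reservations behind them):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct rsv { uint64_t reserved; };

static int migrate(struct rsv *from, struct rsv *to, uint64_t bytes)
{
        if (from->reserved < bytes)
                return -ENOSPC;
        from->reserved -= bytes;
        to->reserved   += bytes;
        return 0;
}

static int reserve_slack(struct rsv *rsv, struct rsv *global,
                         uint64_t min_size, int refill_ret)
{
        int ret = refill_ret;            /* result of the no-flush refill */

        if (ret)                         /* steal from the global reserve */
                ret = migrate(global, rsv, min_size);
        if (ret)
                fprintf(stderr, "no space for delete, truncate deferred to next mount\n");
        return ret;
}
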
@@ -3664,14 +3525,17 @@ void btrfs_evict_inode(struct inode *inode)
3664 btrfs_end_transaction(trans, root); 3525 btrfs_end_transaction(trans, root);
3665 trans = NULL; 3526 trans = NULL;
3666 btrfs_btree_balance_dirty(root, nr); 3527 btrfs_btree_balance_dirty(root, nr);
3667
3668 } 3528 }
3669 3529
3530 btrfs_free_block_rsv(root, rsv);
3531
3670 if (ret == 0) { 3532 if (ret == 0) {
3533 trans->block_rsv = root->orphan_block_rsv;
3671 ret = btrfs_orphan_del(trans, inode); 3534 ret = btrfs_orphan_del(trans, inode);
3672 BUG_ON(ret); 3535 BUG_ON(ret);
3673 } 3536 }
3674 3537
3538 trans->block_rsv = &root->fs_info->trans_block_rsv;
3675 if (!(root == root->fs_info->tree_root || 3539 if (!(root == root->fs_info->tree_root ||
3676 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3540 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3677 btrfs_return_ino(root, btrfs_ino(inode)); 3541 btrfs_return_ino(root, btrfs_ino(inode));
@@ -5795,8 +5659,7 @@ again:
5795 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5659 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5796 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5660 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5797 if (!ret) 5661 if (!ret)
5798 ret = btrfs_update_inode(trans, root, inode); 5662 err = btrfs_update_inode_fallback(trans, root, inode);
5799 err = ret;
5800 goto out; 5663 goto out;
5801 } 5664 }
5802 5665
@@ -5834,7 +5697,7 @@ again:
5834 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5697 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5835 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5698 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5836 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) 5699 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5837 btrfs_update_inode(trans, root, inode); 5700 btrfs_update_inode_fallback(trans, root, inode);
5838 ret = 0; 5701 ret = 0;
5839out_unlock: 5702out_unlock:
5840 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5703 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
@@ -6289,7 +6152,7 @@ int btrfs_readpage(struct file *file, struct page *page)
6289{ 6152{
6290 struct extent_io_tree *tree; 6153 struct extent_io_tree *tree;
6291 tree = &BTRFS_I(page->mapping->host)->io_tree; 6154 tree = &BTRFS_I(page->mapping->host)->io_tree;
6292 return extent_read_full_page(tree, page, btrfs_get_extent); 6155 return extent_read_full_page(tree, page, btrfs_get_extent, 0);
6293} 6156}
6294 6157
6295static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6158static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -6541,6 +6404,7 @@ static int btrfs_truncate(struct inode *inode)
6541 struct btrfs_trans_handle *trans; 6404 struct btrfs_trans_handle *trans;
6542 unsigned long nr; 6405 unsigned long nr;
6543 u64 mask = root->sectorsize - 1; 6406 u64 mask = root->sectorsize - 1;
6407 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6544 6408
6545 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6409 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6546 if (ret) 6410 if (ret)
@@ -6588,19 +6452,23 @@ static int btrfs_truncate(struct inode *inode)
6588 rsv = btrfs_alloc_block_rsv(root); 6452 rsv = btrfs_alloc_block_rsv(root);
6589 if (!rsv) 6453 if (!rsv)
6590 return -ENOMEM; 6454 return -ENOMEM;
6591 btrfs_add_durable_block_rsv(root->fs_info, rsv); 6455 rsv->size = min_size;
6592 6456
6457 /*
6458 * 1 for the truncate slack space
6459 * 1 for the orphan item we're going to add
6460 * 1 for the orphan item deletion
6461 * 1 for updating the inode.
6462 */
6593 trans = btrfs_start_transaction(root, 4); 6463 trans = btrfs_start_transaction(root, 4);
6594 if (IS_ERR(trans)) { 6464 if (IS_ERR(trans)) {
6595 err = PTR_ERR(trans); 6465 err = PTR_ERR(trans);
6596 goto out; 6466 goto out;
6597 } 6467 }
6598 6468
6599 /* 6469 /* Migrate the slack space for the truncate to our reserve */
6600 * Reserve space for the truncate process. Truncate should be adding 6470 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6601 * space, but if there are snapshots it may end up using space. 6471 min_size);
6602 */
6603 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6604 BUG_ON(ret); 6472 BUG_ON(ret);
6605 6473
6606 ret = btrfs_orphan_add(trans, inode); 6474 ret = btrfs_orphan_add(trans, inode);
@@ -6609,21 +6477,6 @@ static int btrfs_truncate(struct inode *inode)
6609 goto out; 6477 goto out;
6610 } 6478 }
6611 6479
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /*
6617 * Ok so we've already migrated our bytes over for the truncate, so here
6618 * just reserve the one slot we need for updating the inode.
6619 */
6620 trans = btrfs_start_transaction(root, 1);
6621 if (IS_ERR(trans)) {
6622 err = PTR_ERR(trans);
6623 goto out;
6624 }
6625 trans->block_rsv = rsv;
6626
6627 /* 6480 /*
6628 * setattr is responsible for setting the ordered_data_close flag, 6481 * setattr is responsible for setting the ordered_data_close flag,
6629 * but that is only tested during the last file release. That 6482 * but that is only tested during the last file release. That
@@ -6645,20 +6498,30 @@ static int btrfs_truncate(struct inode *inode)
6645 btrfs_add_ordered_operation(trans, root, inode); 6498 btrfs_add_ordered_operation(trans, root, inode);
6646 6499
6647 while (1) { 6500 while (1) {
6501 ret = btrfs_block_rsv_refill(root, rsv, min_size);
6502 if (ret) {
6503 /*
6504 * This can only happen with the original transaction we
6505 * started above, every other time we shouldn't have a
6506 * transaction started yet.
6507 */
6508 if (ret == -EAGAIN)
6509 goto end_trans;
6510 err = ret;
6511 break;
6512 }
6513
6648 if (!trans) { 6514 if (!trans) {
6649 trans = btrfs_start_transaction(root, 3); 6515 /* Just need the 1 for updating the inode */
6516 trans = btrfs_start_transaction(root, 1);
6650 if (IS_ERR(trans)) { 6517 if (IS_ERR(trans)) {
6651 err = PTR_ERR(trans); 6518 err = PTR_ERR(trans);
6652 goto out; 6519 goto out;
6653 } 6520 }
6654
6655 ret = btrfs_truncate_reserve_metadata(trans, root,
6656 rsv);
6657 BUG_ON(ret);
6658
6659 trans->block_rsv = rsv;
6660 } 6521 }
6661 6522
6523 trans->block_rsv = rsv;
6524
6662 ret = btrfs_truncate_inode_items(trans, root, inode, 6525 ret = btrfs_truncate_inode_items(trans, root, inode,
6663 inode->i_size, 6526 inode->i_size,
6664 BTRFS_EXTENT_DATA_KEY); 6527 BTRFS_EXTENT_DATA_KEY);
@@ -6673,7 +6536,7 @@ static int btrfs_truncate(struct inode *inode)
6673 err = ret; 6536 err = ret;
6674 break; 6537 break;
6675 } 6538 }
6676 6539end_trans:
6677 nr = trans->blocks_used; 6540 nr = trans->blocks_used;
6678 btrfs_end_transaction(trans, root); 6541 btrfs_end_transaction(trans, root);
6679 trans = NULL; 6542 trans = NULL;
@@ -6693,14 +6556,16 @@ static int btrfs_truncate(struct inode *inode)
6693 ret = btrfs_orphan_del(NULL, inode); 6556 ret = btrfs_orphan_del(NULL, inode);
6694 } 6557 }
6695 6558
6696 trans->block_rsv = &root->fs_info->trans_block_rsv; 6559 if (trans) {
6697 ret = btrfs_update_inode(trans, root, inode); 6560 trans->block_rsv = &root->fs_info->trans_block_rsv;
6698 if (ret && !err) 6561 ret = btrfs_update_inode(trans, root, inode);
6699 err = ret; 6562 if (ret && !err)
6563 err = ret;
6700 6564
6701 nr = trans->blocks_used; 6565 nr = trans->blocks_used;
6702 ret = btrfs_end_transaction_throttle(trans, root); 6566 ret = btrfs_end_transaction_throttle(trans, root);
6703 btrfs_btree_balance_dirty(root, nr); 6567 btrfs_btree_balance_dirty(root, nr);
6568 }
6704 6569
6705out: 6570out:
6706 btrfs_free_block_rsv(root, rsv); 6571 btrfs_free_block_rsv(root, rsv);
@@ -6755,9 +6620,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6755 ei->last_sub_trans = 0; 6620 ei->last_sub_trans = 0;
6756 ei->logged_trans = 0; 6621 ei->logged_trans = 0;
6757 ei->delalloc_bytes = 0; 6622 ei->delalloc_bytes = 0;
6758 ei->reserved_bytes = 0;
6759 ei->disk_i_size = 0; 6623 ei->disk_i_size = 0;
6760 ei->flags = 0; 6624 ei->flags = 0;
6625 ei->csum_bytes = 0;
6761 ei->index_cnt = (u64)-1; 6626 ei->index_cnt = (u64)-1;
6762 ei->last_unlink_trans = 0; 6627 ei->last_unlink_trans = 0;
6763 6628
@@ -6769,6 +6634,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6769 ei->orphan_meta_reserved = 0; 6634 ei->orphan_meta_reserved = 0;
6770 ei->dummy_inode = 0; 6635 ei->dummy_inode = 0;
6771 ei->in_defrag = 0; 6636 ei->in_defrag = 0;
6637 ei->delalloc_meta_reserved = 0;
6772 ei->force_compress = BTRFS_COMPRESS_NONE; 6638 ei->force_compress = BTRFS_COMPRESS_NONE;
6773 6639
6774 ei->delayed_node = NULL; 6640 ei->delayed_node = NULL;
@@ -6803,6 +6669,8 @@ void btrfs_destroy_inode(struct inode *inode)
6803 WARN_ON(inode->i_data.nrpages); 6669 WARN_ON(inode->i_data.nrpages);
6804 WARN_ON(BTRFS_I(inode)->outstanding_extents); 6670 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6805 WARN_ON(BTRFS_I(inode)->reserved_extents); 6671 WARN_ON(BTRFS_I(inode)->reserved_extents);
6672 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
6673 WARN_ON(BTRFS_I(inode)->csum_bytes);
6806 6674
6807 /* 6675 /*
6808 * This can happen where we create an inode, but somebody else also 6676 * This can happen where we create an inode, but somebody else also
@@ -6926,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt,
6926 struct dentry *dentry, struct kstat *stat) 6794 struct dentry *dentry, struct kstat *stat)
6927{ 6795{
6928 struct inode *inode = dentry->d_inode; 6796 struct inode *inode = dentry->d_inode;
6797 u32 blocksize = inode->i_sb->s_blocksize;
6798
6929 generic_fillattr(inode, stat); 6799 generic_fillattr(inode, stat);
6930 stat->dev = BTRFS_I(inode)->root->anon_dev; 6800 stat->dev = BTRFS_I(inode)->root->anon_dev;
6931 stat->blksize = PAGE_CACHE_SIZE; 6801 stat->blksize = PAGE_CACHE_SIZE;
6932 stat->blocks = (inode_get_bytes(inode) + 6802 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
6933 BTRFS_I(inode)->delalloc_bytes) >> 9; 6803 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
6934 return 0; 6804 return 0;
6935} 6805}
6936 6806
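
The new stat->blocks computation above rounds both components up to the filesystem block size before converting to 512-byte units, so even one dirty, not-yet-allocated byte accounts for a whole block. A quick worked example (userspace, 4 KiB blocks assumed):

#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
        uint64_t blocksize = 4096;
        uint64_t on_disk   = 6000;   /* inode_get_bytes() */
        uint64_t delalloc  = 1;      /* delalloc_bytes: dirty, unallocated */

        uint64_t blocks = (ALIGN_UP(on_disk, blocksize) +
                           ALIGN_UP(delalloc, blocksize)) >> 9;

        printf("%llu\n", (unsigned long long)blocks);  /* (8192 + 4096) / 512 = 24 */
        return 0;
}
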
@@ -7420,7 +7290,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
7420 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 7290 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
7421 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 7291 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
7422 .writepage_start_hook = btrfs_writepage_start_hook, 7292 .writepage_start_hook = btrfs_writepage_start_hook,
7423 .readpage_io_failed_hook = btrfs_io_failed_hook,
7424 .set_bit_hook = btrfs_set_bit_hook, 7293 .set_bit_hook = btrfs_set_bit_hook,
7425 .clear_bit_hook = btrfs_clear_bit_hook, 7294 .clear_bit_hook = btrfs_clear_bit_hook,
7426 .merge_extent_hook = btrfs_merge_extent_hook, 7295 .merge_extent_hook = btrfs_merge_extent_hook,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dae5dfe41ba..72d461656f6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -51,6 +51,7 @@
51#include "volumes.h" 51#include "volumes.h"
52#include "locking.h" 52#include "locking.h"
53#include "inode-map.h" 53#include "inode-map.h"
54#include "backref.h"
54 55
55/* Mask out flags that are inappropriate for the given type of inode. */ 56/* Mask out flags that are inappropriate for the given type of inode. */
56static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 57static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
117/* 118/*
118 * Inherit flags from the parent inode. 119 * Inherit flags from the parent inode.
119 * 120 *
120 * Unlike extN we don't have any flags we don't want to inherit currently. 121 * Currently only the compression flags and the cow flags are inherited.
121 */ 122 */
122void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 123void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
123{ 124{
@@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
128 129
129 flags = BTRFS_I(dir)->flags; 130 flags = BTRFS_I(dir)->flags;
130 131
131 if (S_ISREG(inode->i_mode)) 132 if (flags & BTRFS_INODE_NOCOMPRESS) {
132 flags &= ~BTRFS_INODE_DIRSYNC; 133 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
133 else if (!S_ISDIR(inode->i_mode)) 134 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
134 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); 135 } else if (flags & BTRFS_INODE_COMPRESS) {
136 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
137 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
138 }
139
140 if (flags & BTRFS_INODE_NODATACOW)
141 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
135 142
136 BTRFS_I(inode)->flags = flags;
137 btrfs_update_iflags(inode); 143 btrfs_update_iflags(inode);
138} 144}
139 145
@@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
277 struct fstrim_range range; 283 struct fstrim_range range;
278 u64 minlen = ULLONG_MAX; 284 u64 minlen = ULLONG_MAX;
279 u64 num_devices = 0; 285 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
280 int ret; 287 int ret;
281 288
282 if (!capable(CAP_SYS_ADMIN)) 289 if (!capable(CAP_SYS_ADMIN))
@@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
295 } 302 }
296 } 303 }
297 rcu_read_unlock(); 304 rcu_read_unlock();
305
298 if (!num_devices) 306 if (!num_devices)
299 return -EOPNOTSUPP; 307 return -EOPNOTSUPP;
300
301 if (copy_from_user(&range, arg, sizeof(range))) 308 if (copy_from_user(&range, arg, sizeof(range)))
302 return -EFAULT; 309 return -EFAULT;
310 if (range.start > total_bytes)
311 return -EINVAL;
303 312
313 range.len = min(range.len, total_bytes - range.start);
304 range.minlen = max(range.minlen, minlen); 314 range.minlen = max(range.minlen, minlen);
305 ret = btrfs_trim_fs(root, &range); 315 ret = btrfs_trim_fs(root, &range);
306 if (ret < 0) 316 if (ret < 0)
@@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
760 int ret = 1; 770 int ret = 1;
761 771
762 /* 772 /*
763 * make sure that once we start defragging and extent, we keep on 773 * make sure that once we start defragging an extent, we keep on
764 * defragging it 774 * defragging it
765 */ 775 */
766 if (start < *defrag_end) 776 if (start < *defrag_end)
@@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
805 * extent will force at least part of that big extent to be defragged. 815 * extent will force at least part of that big extent to be defragged.
806 */ 816 */
807 if (ret) { 817 if (ret) {
808 *last_len += len;
809 *defrag_end = extent_map_end(em); 818 *defrag_end = extent_map_end(em);
810 } else { 819 } else {
811 *last_len = 0; 820 *last_len = 0;
@@ -843,6 +852,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
843 int i_done; 852 int i_done;
844 struct btrfs_ordered_extent *ordered; 853 struct btrfs_ordered_extent *ordered;
845 struct extent_state *cached_state = NULL; 854 struct extent_state *cached_state = NULL;
855 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
846 856
847 if (isize == 0) 857 if (isize == 0)
848 return 0; 858 return 0;
@@ -860,7 +870,7 @@ again:
860 for (i = 0; i < num_pages; i++) { 870 for (i = 0; i < num_pages; i++) {
861 struct page *page; 871 struct page *page;
862 page = find_or_create_page(inode->i_mapping, 872 page = find_or_create_page(inode->i_mapping,
863 start_index + i, GFP_NOFS); 873 start_index + i, mask);
864 if (!page) 874 if (!page)
865 break; 875 break;
866 876
@@ -972,18 +982,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
972 struct btrfs_super_block *disk_super; 982 struct btrfs_super_block *disk_super;
973 struct file_ra_state *ra = NULL; 983 struct file_ra_state *ra = NULL;
974 unsigned long last_index; 984 unsigned long last_index;
985 u64 isize = i_size_read(inode);
975 u64 features; 986 u64 features;
976 u64 last_len = 0; 987 u64 last_len = 0;
977 u64 skip = 0; 988 u64 skip = 0;
978 u64 defrag_end = 0; 989 u64 defrag_end = 0;
979 u64 newer_off = range->start; 990 u64 newer_off = range->start;
980 int newer_left = 0;
981 unsigned long i; 991 unsigned long i;
992 unsigned long ra_index = 0;
982 int ret; 993 int ret;
983 int defrag_count = 0; 994 int defrag_count = 0;
984 int compress_type = BTRFS_COMPRESS_ZLIB; 995 int compress_type = BTRFS_COMPRESS_ZLIB;
985 int extent_thresh = range->extent_thresh; 996 int extent_thresh = range->extent_thresh;
986 int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 997 int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
998 int cluster = max_cluster;
987 u64 new_align = ~((u64)128 * 1024 - 1); 999 u64 new_align = ~((u64)128 * 1024 - 1);
988 struct page **pages = NULL; 1000 struct page **pages = NULL;
989 1001
@@ -997,7 +1009,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
997 compress_type = range->compress_type; 1009 compress_type = range->compress_type;
998 } 1010 }
999 1011
1000 if (inode->i_size == 0) 1012 if (isize == 0)
1001 return 0; 1013 return 0;
1002 1014
1003 /* 1015 /*
@@ -1013,7 +1025,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1013 ra = &file->f_ra; 1025 ra = &file->f_ra;
1014 } 1026 }
1015 1027
1016 pages = kmalloc(sizeof(struct page *) * newer_cluster, 1028 pages = kmalloc(sizeof(struct page *) * max_cluster,
1017 GFP_NOFS); 1029 GFP_NOFS);
1018 if (!pages) { 1030 if (!pages) {
1019 ret = -ENOMEM; 1031 ret = -ENOMEM;
@@ -1022,10 +1034,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1022 1034
1023 /* find the last page to defrag */ 1035 /* find the last page to defrag */
1024 if (range->start + range->len > range->start) { 1036 if (range->start + range->len > range->start) {
1025 last_index = min_t(u64, inode->i_size - 1, 1037 last_index = min_t(u64, isize - 1,
1026 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; 1038 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
1027 } else { 1039 } else {
1028 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1040 last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1029 } 1041 }
1030 1042
1031 if (newer_than) { 1043 if (newer_than) {
@@ -1038,14 +1050,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1038 * the extents in the file evenly spaced 1050 * the extents in the file evenly spaced
1039 */ 1051 */
1040 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1052 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1041 newer_left = newer_cluster;
1042 } else 1053 } else
1043 goto out_ra; 1054 goto out_ra;
1044 } else { 1055 } else {
1045 i = range->start >> PAGE_CACHE_SHIFT; 1056 i = range->start >> PAGE_CACHE_SHIFT;
1046 } 1057 }
1047 if (!max_to_defrag) 1058 if (!max_to_defrag)
1048 max_to_defrag = last_index - 1; 1059 max_to_defrag = last_index;
1049 1060
1050 /* 1061 /*
 1051	 * make writeback start from i, so the defrag range can be	 1062	 * make writeback start from i, so the defrag range can be
@@ -1079,18 +1090,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1079 i = max(i + 1, next); 1090 i = max(i + 1, next);
1080 continue; 1091 continue;
1081 } 1092 }
1093
1094 if (!newer_than) {
1095 cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
1096 PAGE_CACHE_SHIFT) - i;
1097 cluster = min(cluster, max_cluster);
1098 } else {
1099 cluster = max_cluster;
1100 }
1101
1082 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1102 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
1083 BTRFS_I(inode)->force_compress = compress_type; 1103 BTRFS_I(inode)->force_compress = compress_type;
1084 1104
1085 btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); 1105 if (i + cluster > ra_index) {
1106 ra_index = max(i, ra_index);
1107 btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
1108 cluster);
1109 ra_index += max_cluster;
1110 }
1086 1111
1087 ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); 1112 ret = cluster_pages_for_defrag(inode, pages, i, cluster);
1088 if (ret < 0) 1113 if (ret < 0)
1089 goto out_ra; 1114 goto out_ra;
1090 1115
1091 defrag_count += ret; 1116 defrag_count += ret;
1092 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1117 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
1093 i += ret;
1094 1118
1095 if (newer_than) { 1119 if (newer_than) {
1096 if (newer_off == (u64)-1) 1120 if (newer_off == (u64)-1)
@@ -1105,12 +1129,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1105 if (!ret) { 1129 if (!ret) {
1106 range->start = newer_off; 1130 range->start = newer_off;
1107 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1131 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1108 newer_left = newer_cluster;
1109 } else { 1132 } else {
1110 break; 1133 break;
1111 } 1134 }
1112 } else { 1135 } else {
1113 i++; 1136 if (ret > 0) {
1137 i += ret;
1138 last_len += ret << PAGE_CACHE_SHIFT;
1139 } else {
1140 i++;
1141 last_len = 0;
1142 }
1114 } 1143 }
1115 } 1144 }
1116 1145
@@ -1136,16 +1165,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1136 mutex_unlock(&inode->i_mutex); 1165 mutex_unlock(&inode->i_mutex);
1137 } 1166 }
1138 1167
1139 disk_super = &root->fs_info->super_copy; 1168 disk_super = root->fs_info->super_copy;
1140 features = btrfs_super_incompat_flags(disk_super); 1169 features = btrfs_super_incompat_flags(disk_super);
1141 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1170 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1142 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1171 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1143 btrfs_set_super_incompat_flags(disk_super, features); 1172 btrfs_set_super_incompat_flags(disk_super, features);
1144 } 1173 }
1145 1174
1146 if (!file) 1175 ret = defrag_count;
1147 kfree(ra);
1148 return defrag_count;
1149 1176
1150out_ra: 1177out_ra:
1151 if (!file) 1178 if (!file)
@@ -1189,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1189 *devstr = '\0'; 1216 *devstr = '\0';
1190 devstr = vol_args->name; 1217 devstr = vol_args->name;
1191 devid = simple_strtoull(devstr, &end, 10); 1218 devid = simple_strtoull(devstr, &end, 10);
1192 printk(KERN_INFO "resizing devid %llu\n", 1219 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1193 (unsigned long long)devid); 1220 (unsigned long long)devid);
1194 } 1221 }
1195 device = btrfs_find_device(root, devid, NULL, NULL); 1222 device = btrfs_find_device(root, devid, NULL, NULL);
1196 if (!device) { 1223 if (!device) {
1197 printk(KERN_INFO "resizer unable to find device %llu\n", 1224 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1198 (unsigned long long)devid); 1225 (unsigned long long)devid);
1199 ret = -EINVAL; 1226 ret = -EINVAL;
1200 goto out_unlock; 1227 goto out_unlock;
@@ -1240,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1240 do_div(new_size, root->sectorsize); 1267 do_div(new_size, root->sectorsize);
1241 new_size *= root->sectorsize; 1268 new_size *= root->sectorsize;
1242 1269
1243 printk(KERN_INFO "new size for %s is %llu\n", 1270 printk(KERN_INFO "btrfs: new size for %s is %llu\n",
1244 device->name, (unsigned long long)new_size); 1271 device->name, (unsigned long long)new_size);
1245 1272
1246 if (new_size > old_size) { 1273 if (new_size > old_size) {
@@ -1251,7 +1278,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1251 } 1278 }
1252 ret = btrfs_grow_device(trans, device, new_size); 1279 ret = btrfs_grow_device(trans, device, new_size);
1253 btrfs_commit_transaction(trans, root); 1280 btrfs_commit_transaction(trans, root);
1254 } else { 1281 } else if (new_size < old_size) {
1255 ret = btrfs_shrink_device(device, new_size); 1282 ret = btrfs_shrink_device(device, new_size);
1256 } 1283 }
1257 1284
@@ -2587,7 +2614,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2587 return PTR_ERR(trans); 2614 return PTR_ERR(trans);
2588 } 2615 }
2589 2616
2590 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2617 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
2591 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 2618 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
2592 dir_id, "default", 7, 1); 2619 dir_id, "default", 7, 1);
2593 if (IS_ERR_OR_NULL(di)) { 2620 if (IS_ERR_OR_NULL(di)) {
@@ -2603,7 +2630,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2603 btrfs_mark_buffer_dirty(path->nodes[0]); 2630 btrfs_mark_buffer_dirty(path->nodes[0]);
2604 btrfs_free_path(path); 2631 btrfs_free_path(path);
2605 2632
2606 disk_super = &root->fs_info->super_copy; 2633 disk_super = root->fs_info->super_copy;
2607 features = btrfs_super_incompat_flags(disk_super); 2634 features = btrfs_super_incompat_flags(disk_super);
2608 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { 2635 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2609 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; 2636 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@ -2864,6 +2891,147 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
2864 return ret; 2891 return ret;
2865} 2892}
2866 2893
2894static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2895{
2896 int ret = 0;
2897 int i;
2898 u64 rel_ptr;
2899 int size;
2900 struct btrfs_ioctl_ino_path_args *ipa = NULL;
2901 struct inode_fs_paths *ipath = NULL;
2902 struct btrfs_path *path;
2903
2904 if (!capable(CAP_SYS_ADMIN))
2905 return -EPERM;
2906
2907 path = btrfs_alloc_path();
2908 if (!path) {
2909 ret = -ENOMEM;
2910 goto out;
2911 }
2912
2913 ipa = memdup_user(arg, sizeof(*ipa));
2914 if (IS_ERR(ipa)) {
2915 ret = PTR_ERR(ipa);
2916 ipa = NULL;
2917 goto out;
2918 }
2919
2920 size = min_t(u32, ipa->size, 4096);
2921 ipath = init_ipath(size, root, path);
2922 if (IS_ERR(ipath)) {
2923 ret = PTR_ERR(ipath);
2924 ipath = NULL;
2925 goto out;
2926 }
2927
2928 ret = paths_from_inode(ipa->inum, ipath);
2929 if (ret < 0)
2930 goto out;
2931
2932 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2933 rel_ptr = ipath->fspath->val[i] -
2934 (u64)(unsigned long)ipath->fspath->val;
2935 ipath->fspath->val[i] = rel_ptr;
2936 }
2937
2938 ret = copy_to_user((void *)(unsigned long)ipa->fspath,
2939 (void *)(unsigned long)ipath->fspath, size);
2940 if (ret) {
2941 ret = -EFAULT;
2942 goto out;
2943 }
2944
2945out:
2946 btrfs_free_path(path);
2947 free_ipath(ipath);
2948 kfree(ipa);
2949
2950 return ret;
2951}
2952
2953static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
2954{
2955 struct btrfs_data_container *inodes = ctx;
2956 const size_t c = 3 * sizeof(u64);
2957
2958 if (inodes->bytes_left >= c) {
2959 inodes->bytes_left -= c;
2960 inodes->val[inodes->elem_cnt] = inum;
2961 inodes->val[inodes->elem_cnt + 1] = offset;
2962 inodes->val[inodes->elem_cnt + 2] = root;
2963 inodes->elem_cnt += 3;
2964 } else {
2965 inodes->bytes_missing += c - inodes->bytes_left;
2966 inodes->bytes_left = 0;
2967 inodes->elem_missed += 3;
2968 }
2969
2970 return 0;
2971}
2972
2973static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2974 void __user *arg)
2975{
2976 int ret = 0;
2977 int size;
2978 u64 extent_offset;
2979 struct btrfs_ioctl_logical_ino_args *loi;
2980 struct btrfs_data_container *inodes = NULL;
2981 struct btrfs_path *path = NULL;
2982 struct btrfs_key key;
2983
2984 if (!capable(CAP_SYS_ADMIN))
2985 return -EPERM;
2986
2987 loi = memdup_user(arg, sizeof(*loi));
2988 if (IS_ERR(loi)) {
2989 ret = PTR_ERR(loi);
2990 loi = NULL;
2991 goto out;
2992 }
2993
2994 path = btrfs_alloc_path();
2995 if (!path) {
2996 ret = -ENOMEM;
2997 goto out;
2998 }
2999
3000 size = min_t(u32, loi->size, 4096);
3001 inodes = init_data_container(size);
3002 if (IS_ERR(inodes)) {
3003 ret = PTR_ERR(inodes);
3004 inodes = NULL;
3005 goto out;
3006 }
3007
3008 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3009
3010 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3011 ret = -ENOENT;
3012 if (ret < 0)
3013 goto out;
3014
3015 extent_offset = loi->logical - key.objectid;
3016 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3017 extent_offset, build_ino_list, inodes);
3018
3019 if (ret < 0)
3020 goto out;
3021
3022 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3023 (void *)(unsigned long)inodes, size);
3024 if (ret)
3025 ret = -EFAULT;
3026
3027out:
3028 btrfs_free_path(path);
3029 kfree(inodes);
3030 kfree(loi);
3031
3032 return ret;
3033}
3034
2867long btrfs_ioctl(struct file *file, unsigned int 3035long btrfs_ioctl(struct file *file, unsigned int
2868 cmd, unsigned long arg) 3036 cmd, unsigned long arg)
2869{ 3037{
@@ -2921,6 +3089,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2921 return btrfs_ioctl_tree_search(file, argp); 3089 return btrfs_ioctl_tree_search(file, argp);
2922 case BTRFS_IOC_INO_LOOKUP: 3090 case BTRFS_IOC_INO_LOOKUP:
2923 return btrfs_ioctl_ino_lookup(file, argp); 3091 return btrfs_ioctl_ino_lookup(file, argp);
3092 case BTRFS_IOC_INO_PATHS:
3093 return btrfs_ioctl_ino_to_path(root, argp);
3094 case BTRFS_IOC_LOGICAL_INO:
3095 return btrfs_ioctl_logical_to_ino(root, argp);
2924 case BTRFS_IOC_SPACE_INFO: 3096 case BTRFS_IOC_SPACE_INFO:
2925 return btrfs_ioctl_space_info(root, argp); 3097 return btrfs_ioctl_space_info(root, argp);
2926 case BTRFS_IOC_SYNC: 3098 case BTRFS_IOC_SYNC:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ad1ea789fcb..252ae9915de 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args {
193 struct btrfs_ioctl_space_info spaces[0]; 193 struct btrfs_ioctl_space_info spaces[0];
194}; 194};
195 195
196struct btrfs_data_container {
197 __u32 bytes_left; /* out -- bytes not needed to deliver output */
198 __u32 bytes_missing; /* out -- additional bytes needed for result */
199 __u32 elem_cnt; /* out */
200 __u32 elem_missed; /* out */
201 __u64 val[0]; /* out */
202};
203
204struct btrfs_ioctl_ino_path_args {
205 __u64 inum; /* in */
206 __u32 size; /* in */
207 __u64 reserved[4];
208 /* struct btrfs_data_container *fspath; out */
209 __u64 fspath; /* out */
210};
211
212struct btrfs_ioctl_logical_ino_args {
213 __u64 logical; /* in */
214 __u32 size; /* in */
215 __u64 reserved[4];
216 /* struct btrfs_data_container *inodes; out */
217 __u64 inodes;
218};
219
196#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 220#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
197 struct btrfs_ioctl_vol_args) 221 struct btrfs_ioctl_vol_args)
198#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 222#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args {
248 struct btrfs_ioctl_dev_info_args) 272 struct btrfs_ioctl_dev_info_args)
249#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
250 struct btrfs_ioctl_fs_info_args) 274 struct btrfs_ioctl_fs_info_args)
275#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
276 struct btrfs_ioctl_ino_path_args)
277#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
278 struct btrfs_ioctl_ino_path_args)
279
251#endif 280#endif
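
Similarly, a hedged user-space sketch (editorial, not part of the patch) of the inode-to-path direction: btrfs_ioctl_ino_to_path() rewrites each fspath->val[i] as an offset relative to the start of the val[] array before copying it out, so user space finds the NUL-terminated path strings at that offset from the val base. The helper name below is illustrative only.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <errno.h>
#include <sys/ioctl.h>
/* assumes the BTRFS_IOC_INO_PATHS definitions from fs/btrfs/ioctl.h above */

static int ino_to_paths(int fd, uint64_t inum)
{
	uint64_t buf[512] = { 0 };	/* 4096 bytes, u64-aligned */
	struct btrfs_data_container *fspath = (struct btrfs_data_container *)buf;
	struct btrfs_ioctl_ino_path_args ipa;
	uint32_t i;

	memset(&ipa, 0, sizeof(ipa));
	ipa.inum = inum;		/* inode number to resolve */
	ipa.size = sizeof(buf);
	ipa.fspath = (uintptr_t)buf;

	if (ioctl(fd, BTRFS_IOC_INO_PATHS, &ipa) < 0)
		return -errno;

	/* each val[i] is now an offset from the start of val[] */
	for (i = 0; i < fspath->elem_cnt; i++)
		printf("%s\n", (char *)fspath->val + fspath->val[i]);
	if (fspath->elem_missed)
		printf("%u paths did not fit into the buffer\n",
		       fspath->elem_missed);
	return 0;
}
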
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e..f38e452486b 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
159{ 159{
160 int i; 160 int i;
161 u32 type; 161 u32 type, nr;
162 u32 nr = btrfs_header_nritems(l);
163 struct btrfs_item *item; 162 struct btrfs_item *item;
164 struct btrfs_root_item *ri; 163 struct btrfs_root_item *ri;
165 struct btrfs_dir_item *di; 164 struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
172 struct btrfs_key key; 171 struct btrfs_key key;
173 struct btrfs_key found_key; 172 struct btrfs_key found_key;
174 173
174 if (!l)
175 return;
176
177 nr = btrfs_header_nritems(l);
178
175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 179 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
176 (unsigned long long)btrfs_header_bytenr(l), nr, 180 (unsigned long long)btrfs_header_bytenr(l), nr,
177 btrfs_leaf_free_space(root, l)); 181 btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 00000000000..2373b39a132
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,951 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include "ctree.h"
27#include "volumes.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31#undef DEBUG
32
33/*
34 * This is the implementation for the generic read ahead framework.
35 *
36 * To trigger a readahead, btrfs_reada_add must be called. It will start
37 * a read ahead for the given range [start, end) on tree root. The returned
38 * handle can either be used to wait on the readahead to finish
39 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
40 *
41 * The read ahead works as follows:
42 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
43 * reada_start_machine will then search for extents to prefetch and trigger
44 * some reads. When a read finishes for a node, all contained node/leaf
45 * pointers that lie in the given range will also be enqueued. The reads will
46 * be triggered in sequential order, thus giving a big win over a naive
47 * enumeration. It will also make use of multi-device layouts. Each disk
 48 * will have its own read pointer and all disks will be utilized in parallel.
 49 * Also, no two disks will read both sides of a mirror simultaneously, as this
50 * would waste seeking capacity. Instead both disks will read different parts
51 * of the filesystem.
52 * Any number of readaheads can be started in parallel. The read order will be
53 * determined globally, i.e. 2 parallel readaheads will normally finish faster
 54 * than 2 readaheads started one after another.
55 */
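
As a rough illustration of the interface described above (editorial sketch, not part of the patch; the key range here is just a placeholder covering the whole tree), a caller would do something like:

static void reada_whole_tree_example(struct btrfs_root *root)
{
	struct reada_control *rc;
	struct btrfs_key key_start = { .objectid = 0, .type = 0, .offset = 0 };
	struct btrfs_key key_end = {
		.objectid = (u64)-1, .type = (u8)-1, .offset = (u64)-1
	};

	rc = btrfs_reada_add(root, &key_start, &key_end);	/* kick it off */
	if (IS_ERR(rc))
		return;

	/* either block until everything requested has been read ... */
	btrfs_reada_wait(rc);

	/*
	 * ... or, instead of waiting, drop the handle and let the
	 * readahead finish in the background:
	 *
	 *	btrfs_reada_detach(rc);
	 */
}
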
56
57#define MAX_MIRRORS 2
58#define MAX_IN_FLIGHT 6
59
60struct reada_extctl {
61 struct list_head list;
62 struct reada_control *rc;
63 u64 generation;
64};
65
66struct reada_extent {
67 u64 logical;
68 struct btrfs_key top;
69 u32 blocksize;
70 int err;
71 struct list_head extctl;
72 struct kref refcnt;
73 spinlock_t lock;
74 struct reada_zone *zones[MAX_MIRRORS];
75 int nzones;
76 struct btrfs_device *scheduled_for;
77};
78
79struct reada_zone {
80 u64 start;
81 u64 end;
82 u64 elems;
83 struct list_head list;
84 spinlock_t lock;
85 int locked;
86 struct btrfs_device *device;
87 struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
88 int ndevs;
89 struct kref refcnt;
90};
91
92struct reada_machine_work {
93 struct btrfs_work work;
94 struct btrfs_fs_info *fs_info;
95};
96
97static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
98static void reada_control_release(struct kref *kref);
99static void reada_zone_release(struct kref *kref);
100static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102
103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation);
105
106/* recurses */
107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
109 u64 start, int err)
110{
111 int level = 0;
112 int nritems;
113 int i;
114 u64 bytenr;
115 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121
122 if (eb)
123 level = btrfs_header_level(eb);
124
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 kref_get(&re->refcnt);
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock);
136 /*
137 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore
139 */
140 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock);
144
145 if (err == 0) {
146 nritems = level ? btrfs_header_nritems(eb) : 0;
147 generation = btrfs_header_generation(eb);
148 /*
149 * FIXME: currently we just set nritems to 0 if this is a leaf,
 150 * effectively ignoring the content. As a next step we could
 151 * trigger more readahead depending on the content, e.g.
152 * fetch the checksums for the extents in the leaf.
153 */
154 } else {
155 /*
 156 * this is the error case: the extent buffer has not been
 157 * read correctly. We won't access anything from it and
 158 * just clean up our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164
165 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec;
167 u64 n_gen;
168 struct btrfs_key key;
169 struct btrfs_key next_key;
170
171 btrfs_node_key_to_cpu(eb, &key, i);
172 if (i + 1 < nritems)
173 btrfs_node_key_to_cpu(eb, &next_key, i + 1);
174 else
175 next_key = re->top;
176 bytenr = btrfs_node_blockptr(eb, i);
177 n_gen = btrfs_node_ptr_generation(eb, i);
178
179 list_for_each_entry(rec, &list, list) {
180 struct reada_control *rc = rec->rc;
181
182 /*
183 * if the generation doesn't match, just ignore this
184 * extctl. This will probably cut off a branch from
185 * prefetch. Alternatively one could start a new (sub-)
186 * prefetch for this branch, starting again from root.
187 * FIXME: move the generation check out of this loop
188 */
189#ifdef DEBUG
190 if (rec->generation != generation) {
191 printk(KERN_DEBUG "generation mismatch for "
192 "(%llu,%d,%llu) %llu != %llu\n",
193 key.objectid, key.type, key.offset,
194 rec->generation, generation);
195 }
196#endif
197 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key,
201 level - 1, n_gen);
202 }
203 }
204 /*
205 * free extctl records
206 */
207 while (!list_empty(&list)) {
208 struct reada_control *rc;
209 struct reada_extctl *rec;
210
211 rec = list_first_entry(&list, struct reada_extctl, list);
212 list_del(&rec->list);
213 rc = rec->rc;
214 kfree(rec);
215
216 kref_get(&rc->refcnt);
217 if (atomic_dec_and_test(&rc->elems)) {
218 kref_put(&rc->refcnt, reada_control_release);
219 wake_up(&rc->wait);
220 }
221 kref_put(&rc->refcnt, reada_control_release);
222
223 reada_extent_put(fs_info, re); /* one ref for each entry */
224 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228
229 return 0;
230}
231
232/*
 233 * start is passed separately in case eb is NULL, which may be the case with
234 * failed I/O
235 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
237 u64 start, int err)
238{
239 int ret;
240
241 ret = __readahead_hook(root, eb, start, err);
242
243 reada_start_machine(root->fs_info);
244
245 return ret;
246}
247
248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
249 struct btrfs_device *dev, u64 logical,
250 struct btrfs_bio *bbio)
251{
252 int ret;
253 int looped = 0;
254 struct reada_zone *zone;
255 struct btrfs_block_group_cache *cache = NULL;
256 u64 start;
257 u64 end;
258 int i;
259
260again:
261 zone = NULL;
262 spin_lock(&fs_info->reada_lock);
263 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
264 logical >> PAGE_CACHE_SHIFT, 1);
265 if (ret == 1)
266 kref_get(&zone->refcnt);
267 spin_unlock(&fs_info->reada_lock);
268
269 if (ret == 1) {
270 if (logical >= zone->start && logical < zone->end)
271 return zone;
272 spin_lock(&fs_info->reada_lock);
273 kref_put(&zone->refcnt, reada_zone_release);
274 spin_unlock(&fs_info->reada_lock);
275 }
276
277 if (looped)
278 return NULL;
279
280 cache = btrfs_lookup_block_group(fs_info, logical);
281 if (!cache)
282 return NULL;
283
284 start = cache->key.objectid;
285 end = start + cache->key.offset - 1;
286 btrfs_put_block_group(cache);
287
288 zone = kzalloc(sizeof(*zone), GFP_NOFS);
289 if (!zone)
290 return NULL;
291
292 zone->start = start;
293 zone->end = end;
294 INIT_LIST_HEAD(&zone->list);
295 spin_lock_init(&zone->lock);
296 zone->locked = 0;
297 kref_init(&zone->refcnt);
298 zone->elems = 0;
299 zone->device = dev; /* our device always sits at index 0 */
300 for (i = 0; i < bbio->num_stripes; ++i) {
301 /* bounds have already been checked */
302 zone->devs[i] = bbio->stripes[i].dev;
303 }
304 zone->ndevs = bbio->num_stripes;
305
306 spin_lock(&fs_info->reada_lock);
307 ret = radix_tree_insert(&dev->reada_zones,
308 (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
309 zone);
310 spin_unlock(&fs_info->reada_lock);
311
312 if (ret) {
313 kfree(zone);
314 looped = 1;
315 goto again;
316 }
317
318 return zone;
319}
320
321static struct reada_extent *reada_find_extent(struct btrfs_root *root,
322 u64 logical,
323 struct btrfs_key *top, int level)
324{
325 int ret;
326 int looped = 0;
327 struct reada_extent *re = NULL;
328 struct btrfs_fs_info *fs_info = root->fs_info;
329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
330 struct btrfs_bio *bbio = NULL;
331 struct btrfs_device *dev;
332 u32 blocksize;
333 u64 length;
334 int nzones = 0;
335 int i;
336 unsigned long index = logical >> PAGE_CACHE_SHIFT;
337
338again:
339 spin_lock(&fs_info->reada_lock);
340 re = radix_tree_lookup(&fs_info->reada_tree, index);
341 if (re)
342 kref_get(&re->refcnt);
343 spin_unlock(&fs_info->reada_lock);
344
345 if (re || looped)
346 return re;
347
348 re = kzalloc(sizeof(*re), GFP_NOFS);
349 if (!re)
350 return NULL;
351
352 blocksize = btrfs_level_size(root, level);
353 re->logical = logical;
354 re->blocksize = blocksize;
355 re->top = *top;
356 INIT_LIST_HEAD(&re->extctl);
357 spin_lock_init(&re->lock);
358 kref_init(&re->refcnt);
359
360 /*
361 * map block
362 */
363 length = blocksize;
364 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
365 if (ret || !bbio || length < blocksize)
366 goto error;
367
368 if (bbio->num_stripes > MAX_MIRRORS) {
369 printk(KERN_ERR "btrfs readahead: more than %d copies not "
370 "supported", MAX_MIRRORS);
371 goto error;
372 }
373
374 for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
375 struct reada_zone *zone;
376
377 dev = bbio->stripes[nzones].dev;
378 zone = reada_find_zone(fs_info, dev, logical, bbio);
379 if (!zone)
380 break;
381
382 re->zones[nzones] = zone;
383 spin_lock(&zone->lock);
384 if (!zone->elems)
385 kref_get(&zone->refcnt);
386 ++zone->elems;
387 spin_unlock(&zone->lock);
388 spin_lock(&fs_info->reada_lock);
389 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock);
391 }
392 re->nzones = nzones;
393 if (nzones == 0) {
394 /* not a single zone found, error and out */
395 goto error;
396 }
397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 spin_lock(&fs_info->reada_lock);
400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
401 if (ret) {
402 spin_unlock(&fs_info->reada_lock);
403 if (ret != -ENOMEM) {
404 /* someone inserted the extent in the meantime */
405 looped = 1;
406 }
407 goto error;
408 }
409 for (i = 0; i < nzones; ++i) {
410 dev = bbio->stripes[i].dev;
411 ret = radix_tree_insert(&dev->reada_extents, index, re);
412 if (ret) {
413 while (--i >= 0) {
414 dev = bbio->stripes[i].dev;
415 BUG_ON(dev == NULL);
416 radix_tree_delete(&dev->reada_extents, index);
417 }
418 BUG_ON(fs_info == NULL);
419 radix_tree_delete(&fs_info->reada_tree, index);
420 spin_unlock(&fs_info->reada_lock);
421 goto error;
422 }
423 }
424 spin_unlock(&fs_info->reada_lock);
425
426 kfree(bbio);
427 return re;
428
429error:
430 while (nzones) {
431 struct reada_zone *zone;
432
433 --nzones;
434 zone = re->zones[nzones];
435 kref_get(&zone->refcnt);
436 spin_lock(&zone->lock);
437 --zone->elems;
438 if (zone->elems == 0) {
439 /*
440 * no fs_info->reada_lock needed, as this can't be
441 * the last ref
442 */
443 kref_put(&zone->refcnt, reada_zone_release);
444 }
445 spin_unlock(&zone->lock);
446
447 spin_lock(&fs_info->reada_lock);
448 kref_put(&zone->refcnt, reada_zone_release);
449 spin_unlock(&fs_info->reada_lock);
450 }
451 kfree(bbio);
452 kfree(re);
453 if (looped)
454 goto again;
455 return NULL;
456}
457
458static void reada_kref_dummy(struct kref *kr)
459{
460}
461
462static void reada_extent_put(struct btrfs_fs_info *fs_info,
463 struct reada_extent *re)
464{
465 int i;
466 unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
467
468 spin_lock(&fs_info->reada_lock);
469 if (!kref_put(&re->refcnt, reada_kref_dummy)) {
470 spin_unlock(&fs_info->reada_lock);
471 return;
472 }
473
474 radix_tree_delete(&fs_info->reada_tree, index);
475 for (i = 0; i < re->nzones; ++i) {
476 struct reada_zone *zone = re->zones[i];
477
478 radix_tree_delete(&zone->device->reada_extents, index);
479 }
480
481 spin_unlock(&fs_info->reada_lock);
482
483 for (i = 0; i < re->nzones; ++i) {
484 struct reada_zone *zone = re->zones[i];
485
486 kref_get(&zone->refcnt);
487 spin_lock(&zone->lock);
488 --zone->elems;
489 if (zone->elems == 0) {
490 /* no fs_info->reada_lock needed, as this can't be
491 * the last ref */
492 kref_put(&zone->refcnt, reada_zone_release);
493 }
494 spin_unlock(&zone->lock);
495
496 spin_lock(&fs_info->reada_lock);
497 kref_put(&zone->refcnt, reada_zone_release);
498 spin_unlock(&fs_info->reada_lock);
499 }
500 if (re->scheduled_for)
501 atomic_dec(&re->scheduled_for->reada_in_flight);
502
503 kfree(re);
504}
505
506static void reada_zone_release(struct kref *kref)
507{
508 struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
509
510 radix_tree_delete(&zone->device->reada_zones,
511 zone->end >> PAGE_CACHE_SHIFT);
512
513 kfree(zone);
514}
515
516static void reada_control_release(struct kref *kref)
517{
518 struct reada_control *rc = container_of(kref, struct reada_control,
519 refcnt);
520
521 kfree(rc);
522}
523
524static int reada_add_block(struct reada_control *rc, u64 logical,
525 struct btrfs_key *top, int level, u64 generation)
526{
527 struct btrfs_root *root = rc->root;
528 struct reada_extent *re;
529 struct reada_extctl *rec;
530
531 re = reada_find_extent(root, logical, top, level); /* takes one ref */
532 if (!re)
533 return -1;
534
535 rec = kzalloc(sizeof(*rec), GFP_NOFS);
536 if (!rec) {
537 reada_extent_put(root->fs_info, re);
538 return -1;
539 }
540
541 rec->rc = rc;
542 rec->generation = generation;
543 atomic_inc(&rc->elems);
544
545 spin_lock(&re->lock);
546 list_add_tail(&rec->list, &re->extctl);
547 spin_unlock(&re->lock);
548
549 /* leave the ref on the extent */
550
551 return 0;
552}
553
554/*
555 * called with fs_info->reada_lock held
556 */
557static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
558{
559 int i;
560 unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
561
562 for (i = 0; i < zone->ndevs; ++i) {
563 struct reada_zone *peer;
564 peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
565 if (peer && peer->device != zone->device)
566 peer->locked = lock;
567 }
568}
569
570/*
571 * called with fs_info->reada_lock held
572 */
573static int reada_pick_zone(struct btrfs_device *dev)
574{
575 struct reada_zone *top_zone = NULL;
576 struct reada_zone *top_locked_zone = NULL;
577 u64 top_elems = 0;
578 u64 top_locked_elems = 0;
579 unsigned long index = 0;
580 int ret;
581
582 if (dev->reada_curr_zone) {
583 reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
584 kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
585 dev->reada_curr_zone = NULL;
586 }
587 /* pick the zone with the most elements */
588 while (1) {
589 struct reada_zone *zone;
590
591 ret = radix_tree_gang_lookup(&dev->reada_zones,
592 (void **)&zone, index, 1);
593 if (ret == 0)
594 break;
595 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
596 if (zone->locked) {
597 if (zone->elems > top_locked_elems) {
598 top_locked_elems = zone->elems;
599 top_locked_zone = zone;
600 }
601 } else {
602 if (zone->elems > top_elems) {
603 top_elems = zone->elems;
604 top_zone = zone;
605 }
606 }
607 }
608 if (top_zone)
609 dev->reada_curr_zone = top_zone;
610 else if (top_locked_zone)
611 dev->reada_curr_zone = top_locked_zone;
612 else
613 return 0;
614
615 dev->reada_next = dev->reada_curr_zone->start;
616 kref_get(&dev->reada_curr_zone->refcnt);
617 reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
618
619 return 1;
620}
621
622static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
623 struct btrfs_device *dev)
624{
625 struct reada_extent *re = NULL;
626 int mirror_num = 0;
627 struct extent_buffer *eb = NULL;
628 u64 logical;
629 u32 blocksize;
630 int ret;
631 int i;
632 int need_kick = 0;
633
634 spin_lock(&fs_info->reada_lock);
635 if (dev->reada_curr_zone == NULL) {
636 ret = reada_pick_zone(dev);
637 if (!ret) {
638 spin_unlock(&fs_info->reada_lock);
639 return 0;
640 }
641 }
642 /*
643 * FIXME currently we issue the reads one extent at a time. If we have
 644 * a contiguous block of extents, we could also coalesce them or use
 645 * plugging to speed things up.
646 */
647 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
648 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
649 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
650 ret = reada_pick_zone(dev);
651 if (!ret) {
652 spin_unlock(&fs_info->reada_lock);
653 return 0;
654 }
655 re = NULL;
656 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
657 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
658 }
659 if (ret == 0) {
660 spin_unlock(&fs_info->reada_lock);
661 return 0;
662 }
663 dev->reada_next = re->logical + re->blocksize;
664 kref_get(&re->refcnt);
665
666 spin_unlock(&fs_info->reada_lock);
667
668 /*
669 * find mirror num
670 */
671 for (i = 0; i < re->nzones; ++i) {
672 if (re->zones[i]->device == dev) {
673 mirror_num = i + 1;
674 break;
675 }
676 }
677 logical = re->logical;
678 blocksize = re->blocksize;
679
680 spin_lock(&re->lock);
681 if (re->scheduled_for == NULL) {
682 re->scheduled_for = dev;
683 need_kick = 1;
684 }
685 spin_unlock(&re->lock);
686
687 reada_extent_put(fs_info, re);
688
689 if (!need_kick)
690 return 0;
691
692 atomic_inc(&dev->reada_in_flight);
693 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
694 mirror_num, &eb);
695 if (ret)
696 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
697 else if (eb)
698 __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
699
700 if (eb)
701 free_extent_buffer(eb);
702
703 return 1;
704
705}
706
707static void reada_start_machine_worker(struct btrfs_work *work)
708{
709 struct reada_machine_work *rmw;
710 struct btrfs_fs_info *fs_info;
711
712 rmw = container_of(work, struct reada_machine_work, work);
713 fs_info = rmw->fs_info;
714
715 kfree(rmw);
716
717 __reada_start_machine(fs_info);
718}
719
720static void __reada_start_machine(struct btrfs_fs_info *fs_info)
721{
722 struct btrfs_device *device;
723 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
724 u64 enqueued;
725 u64 total = 0;
726 int i;
727
728 do {
729 enqueued = 0;
730 list_for_each_entry(device, &fs_devices->devices, dev_list) {
731 if (atomic_read(&device->reada_in_flight) <
732 MAX_IN_FLIGHT)
733 enqueued += reada_start_machine_dev(fs_info,
734 device);
735 }
736 total += enqueued;
737 } while (enqueued && total < 10000);
738
739 if (enqueued == 0)
740 return;
741
742 /*
743 * If everything is already in the cache, this is effectively single
744 * threaded. To a) not hold the caller for too long and b) to utilize
 745 * more cores, we break out of the loop above after 10000 iterations and
 746 * enqueue the rest to workers to finish it. This distributes the load to
747 * the cores.
748 */
749 for (i = 0; i < 2; ++i)
750 reada_start_machine(fs_info);
751}
752
753static void reada_start_machine(struct btrfs_fs_info *fs_info)
754{
755 struct reada_machine_work *rmw;
756
757 rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
758 if (!rmw) {
759 /* FIXME we cannot handle this properly right now */
760 BUG();
761 }
762 rmw->work.func = reada_start_machine_worker;
763 rmw->fs_info = fs_info;
764
765 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
766}
767
768#ifdef DEBUG
769static void dump_devs(struct btrfs_fs_info *fs_info, int all)
770{
771 struct btrfs_device *device;
772 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
773 unsigned long index;
774 int ret;
775 int i;
776 int j;
777 int cnt;
778
779 spin_lock(&fs_info->reada_lock);
780 list_for_each_entry(device, &fs_devices->devices, dev_list) {
781 printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
782 atomic_read(&device->reada_in_flight));
783 index = 0;
784 while (1) {
785 struct reada_zone *zone;
786 ret = radix_tree_gang_lookup(&device->reada_zones,
787 (void **)&zone, index, 1);
788 if (ret == 0)
789 break;
790 printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
791 "%d devs", zone->start, zone->end, zone->elems,
792 zone->locked);
793 for (j = 0; j < zone->ndevs; ++j) {
794 printk(KERN_CONT " %lld",
795 zone->devs[j]->devid);
796 }
797 if (device->reada_curr_zone == zone)
798 printk(KERN_CONT " curr off %llu",
799 device->reada_next - zone->start);
800 printk(KERN_CONT "\n");
801 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
802 }
803 cnt = 0;
804 index = 0;
805 while (all) {
806 struct reada_extent *re = NULL;
807
808 ret = radix_tree_gang_lookup(&device->reada_extents,
809 (void **)&re, index, 1);
810 if (ret == 0)
811 break;
812 printk(KERN_DEBUG
813 " re: logical %llu size %u empty %d for %lld",
814 re->logical, re->blocksize,
815 list_empty(&re->extctl), re->scheduled_for ?
816 re->scheduled_for->devid : -1);
817
818 for (i = 0; i < re->nzones; ++i) {
819 printk(KERN_CONT " zone %llu-%llu devs",
820 re->zones[i]->start,
821 re->zones[i]->end);
822 for (j = 0; j < re->zones[i]->ndevs; ++j) {
823 printk(KERN_CONT " %lld",
824 re->zones[i]->devs[j]->devid);
825 }
826 }
827 printk(KERN_CONT "\n");
828 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
829 if (++cnt > 15)
830 break;
831 }
832 }
833
834 index = 0;
835 cnt = 0;
836 while (all) {
837 struct reada_extent *re = NULL;
838
839 ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
840 index, 1);
841 if (ret == 0)
842 break;
843 if (!re->scheduled_for) {
844 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
845 continue;
846 }
847 printk(KERN_DEBUG
848 "re: logical %llu size %u list empty %d for %lld",
849 re->logical, re->blocksize, list_empty(&re->extctl),
850 re->scheduled_for ? re->scheduled_for->devid : -1);
851 for (i = 0; i < re->nzones; ++i) {
852 printk(KERN_CONT " zone %llu-%llu devs",
853 re->zones[i]->start,
854 re->zones[i]->end);
855 for (i = 0; i < re->nzones; ++i) {
856 printk(KERN_CONT " zone %llu-%llu devs",
857 re->zones[i]->start,
858 re->zones[i]->end);
859 for (j = 0; j < re->zones[i]->ndevs; ++j) {
860 printk(KERN_CONT " %lld",
861 re->zones[i]->devs[j]->devid);
862 }
863 }
864 }
865 printk(KERN_CONT "\n");
866 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
867 }
868 spin_unlock(&fs_info->reada_lock);
869}
870#endif
871
872/*
873 * interface
874 */
875struct reada_control *btrfs_reada_add(struct btrfs_root *root,
876 struct btrfs_key *key_start, struct btrfs_key *key_end)
877{
878 struct reada_control *rc;
879 u64 start;
880 u64 generation;
881 int level;
882 struct extent_buffer *node;
883 static struct btrfs_key max_key = {
884 .objectid = (u64)-1,
885 .type = (u8)-1,
886 .offset = (u64)-1
887 };
888
889 rc = kzalloc(sizeof(*rc), GFP_NOFS);
890 if (!rc)
891 return ERR_PTR(-ENOMEM);
892
893 rc->root = root;
894 rc->key_start = *key_start;
895 rc->key_end = *key_end;
896 atomic_set(&rc->elems, 0);
897 init_waitqueue_head(&rc->wait);
898 kref_init(&rc->refcnt);
899 kref_get(&rc->refcnt); /* one ref for having elements */
900
901 node = btrfs_root_node(root);
902 start = node->start;
903 level = btrfs_header_level(node);
904 generation = btrfs_header_generation(node);
905 free_extent_buffer(node);
906
907 reada_add_block(rc, start, &max_key, level, generation);
908
909 reada_start_machine(root->fs_info);
910
911 return rc;
912}
913
914#ifdef DEBUG
915int btrfs_reada_wait(void *handle)
916{
917 struct reada_control *rc = handle;
918
919 while (atomic_read(&rc->elems)) {
920 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
921 5 * HZ);
922 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
923 }
924
925 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
926
927 kref_put(&rc->refcnt, reada_control_release);
928
929 return 0;
930}
931#else
932int btrfs_reada_wait(void *handle)
933{
934 struct reada_control *rc = handle;
935
936 while (atomic_read(&rc->elems)) {
937 wait_event(rc->wait, atomic_read(&rc->elems) == 0);
938 }
939
940 kref_put(&rc->refcnt, reada_control_release);
941
942 return 0;
943}
944#endif
945
946void btrfs_reada_detach(void *handle)
947{
948 struct reada_control *rc = handle;
949
950 kref_put(&rc->refcnt, reada_control_release);
951}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273..dff29d5e151 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1174 list_add_tail(&new_edge->list[UPPER], 1174 list_add_tail(&new_edge->list[UPPER],
1175 &new_node->lower); 1175 &new_node->lower);
1176 } 1176 }
1177 } else {
1178 list_add_tail(&new_node->lower, &cache->leaves);
1177 } 1179 }
1178 1180
1179 rb_node = tree_insert(&cache->rb_root, new_node->bytenr, 1181 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
@@ -2041,8 +2043,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2041 BUG_ON(IS_ERR(trans)); 2043 BUG_ON(IS_ERR(trans));
2042 trans->block_rsv = rc->block_rsv; 2044 trans->block_rsv = rc->block_rsv;
2043 2045
2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2046 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
2045 min_reserved, 0);
2046 if (ret) { 2047 if (ret) {
2047 BUG_ON(ret != -EAGAIN); 2048 BUG_ON(ret != -EAGAIN);
2048 ret = btrfs_commit_transaction(trans, root); 2049 ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2153,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2152again: 2153again:
2153 if (!err) { 2154 if (!err) {
2154 num_bytes = rc->merging_rsv_size; 2155 num_bytes = rc->merging_rsv_size;
2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2156 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2156 num_bytes);
2157 if (ret) 2157 if (ret)
2158 err = ret; 2158 err = ret;
2159 } 2159 }
@@ -2427,7 +2427,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2428 2428
2429 trans->block_rsv = rc->block_rsv; 2429 trans->block_rsv = rc->block_rsv;
2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); 2430 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2431 if (ret) { 2431 if (ret) {
2432 if (ret == -EAGAIN) 2432 if (ret == -EAGAIN)
2433 rc->commit_transaction = 1; 2433 rc->commit_transaction = 1;
@@ -2922,6 +2922,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2922 unsigned long last_index; 2922 unsigned long last_index;
2923 struct page *page; 2923 struct page *page;
2924 struct file_ra_state *ra; 2924 struct file_ra_state *ra;
2925 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2925 int nr = 0; 2926 int nr = 0;
2926 int ret = 0; 2927 int ret = 0;
2927 2928
@@ -2956,7 +2957,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2956 ra, NULL, index, 2957 ra, NULL, index,
2957 last_index + 1 - index); 2958 last_index + 1 - index);
2958 page = find_or_create_page(inode->i_mapping, index, 2959 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS); 2960 mask);
2960 if (!page) { 2961 if (!page) {
2961 btrfs_delalloc_release_metadata(inode, 2962 btrfs_delalloc_release_metadata(inode,
2962 PAGE_CACHE_SIZE); 2963 PAGE_CACHE_SIZE);
@@ -3323,8 +3324,11 @@ static int find_data_references(struct reloc_control *rc,
3323 } 3324 }
3324 3325
3325 key.objectid = ref_objectid; 3326 key.objectid = ref_objectid;
3326 key.offset = ref_offset;
3327 key.type = BTRFS_EXTENT_DATA_KEY; 3327 key.type = BTRFS_EXTENT_DATA_KEY;
3328 if (ref_offset > ((u64)-1 << 32))
3329 key.offset = 0;
3330 else
3331 key.offset = ref_offset;
3328 3332
3329 path->search_commit_root = 1; 3333 path->search_commit_root = 1;
3330 path->skip_locking = 1; 3334 path->skip_locking = 1;
@@ -3645,14 +3649,11 @@ int prepare_to_relocate(struct reloc_control *rc)
3645 * btrfs_init_reloc_root will use them when there 3649 * btrfs_init_reloc_root will use them when there
3646 * is no reservation in transaction handle. 3650 * is no reservation in transaction handle.
3647 */ 3651 */
3648 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3652 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3649 rc->extent_root->nodesize * 256); 3653 rc->extent_root->nodesize * 256);
3650 if (ret) 3654 if (ret)
3651 return ret; 3655 return ret;
3652 3656
3653 rc->block_rsv->refill_used = 1;
3654 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3655
3656 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3657 memset(&rc->cluster, 0, sizeof(rc->cluster));
3657 rc->search_start = rc->block_group->key.objectid; 3658 rc->search_start = rc->block_group->key.objectid;
3658 rc->extents_found = 0; 3659 rc->extents_found = 0;
@@ -3777,8 +3778,7 @@ restart:
3777 } 3778 }
3778 } 3779 }
3779 3780
3780 ret = btrfs_block_rsv_check(trans, rc->extent_root, 3781 ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3781 rc->block_rsv, 0, 5);
3782 if (ret < 0) { 3782 if (ret < 0) {
3783 if (ret != -EAGAIN) { 3783 if (ret != -EAGAIN) {
3784 err = ret; 3784 err = ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5..c27bcb67f33 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
17 */ 17 */
18 18
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "volumes.h" 22#include "volumes.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "ordered-data.h" 24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
24 28
25/* 29/*
26 * This is only the first step towards a full-features scrub. It reads all 30 * This is only the first step towards a full-features scrub. It reads all
@@ -29,15 +33,12 @@
29 * any can be found. 33 * any can be found.
30 * 34 *
31 * Future enhancements: 35 * Future enhancements:
32 * - To enhance the performance, better read-ahead strategies for the
33 * extent-tree can be employed.
34 * - In case an unrepairable extent is encountered, track which files are 36 * - In case an unrepairable extent is encountered, track which files are
35 * affected and report them 37 * affected and report them
36 * - In case of a read error on files with nodatasum, map the file and read 38 * - In case of a read error on files with nodatasum, map the file and read
37 * the extent to trigger a writeback of the good copy 39 * the extent to trigger a writeback of the good copy
38 * - track and record media errors, throw out bad devices 40 * - track and record media errors, throw out bad devices
39 * - add a mode to also read unallocated space 41 * - add a mode to also read unallocated space
40 * - make the prefetch cancellable
41 */ 42 */
42 43
43struct scrub_bio; 44struct scrub_bio;
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
63struct scrub_page { 64struct scrub_page {
64 u64 flags; /* extent flags */ 65 u64 flags; /* extent flags */
65 u64 generation; 66 u64 generation;
66 u64 mirror_num; 67 int mirror_num;
67 int have_csum; 68 int have_csum;
68 u8 csum[BTRFS_CSUM_SIZE]; 69 u8 csum[BTRFS_CSUM_SIZE];
69}; 70};
@@ -87,6 +88,7 @@ struct scrub_dev {
87 int first_free; 88 int first_free;
88 int curr; 89 int curr;
89 atomic_t in_flight; 90 atomic_t in_flight;
91 atomic_t fixup_cnt;
90 spinlock_t list_lock; 92 spinlock_t list_lock;
91 wait_queue_head_t list_wait; 93 wait_queue_head_t list_wait;
92 u16 csum_size; 94 u16 csum_size;
@@ -100,6 +102,27 @@ struct scrub_dev {
100 spinlock_t stat_lock; 102 spinlock_t stat_lock;
101}; 103};
102 104
105struct scrub_fixup_nodatasum {
106 struct scrub_dev *sdev;
107 u64 logical;
108 struct btrfs_root *root;
109 struct btrfs_work work;
110 int mirror_num;
111};
112
113struct scrub_warning {
114 struct btrfs_path *path;
115 u64 extent_item_size;
116 char *scratch_buf;
117 char *msg_buf;
118 const char *errstr;
119 sector_t sector;
120 u64 logical;
121 struct btrfs_device *dev;
122 int msg_bufsize;
123 int scratch_bufsize;
124};
125
103static void scrub_free_csums(struct scrub_dev *sdev) 126static void scrub_free_csums(struct scrub_dev *sdev)
104{ 127{
105 while (!list_empty(&sdev->csum_list)) { 128 while (!list_empty(&sdev->csum_list)) {
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
175 198
176 if (i != SCRUB_BIOS_PER_DEV-1) 199 if (i != SCRUB_BIOS_PER_DEV-1)
177 sdev->bios[i]->next_free = i + 1; 200 sdev->bios[i]->next_free = i + 1;
178 else 201 else
179 sdev->bios[i]->next_free = -1; 202 sdev->bios[i]->next_free = -1;
180 } 203 }
181 sdev->first_free = 0; 204 sdev->first_free = 0;
182 sdev->curr = -1; 205 sdev->curr = -1;
183 atomic_set(&sdev->in_flight, 0); 206 atomic_set(&sdev->in_flight, 0);
207 atomic_set(&sdev->fixup_cnt, 0);
184 atomic_set(&sdev->cancel_req, 0); 208 atomic_set(&sdev->cancel_req, 0);
185 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); 209 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
186 INIT_LIST_HEAD(&sdev->csum_list); 210 INIT_LIST_HEAD(&sdev->csum_list);
187 211
188 spin_lock_init(&sdev->list_lock); 212 spin_lock_init(&sdev->list_lock);
@@ -195,24 +219,366 @@ nomem:
195 return ERR_PTR(-ENOMEM); 219 return ERR_PTR(-ENOMEM);
196} 220}
197 221
222static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
223{
224 u64 isize;
225 u32 nlink;
226 int ret;
227 int i;
228 struct extent_buffer *eb;
229 struct btrfs_inode_item *inode_item;
230 struct scrub_warning *swarn = ctx;
231 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
232 struct inode_fs_paths *ipath = NULL;
233 struct btrfs_root *local_root;
234 struct btrfs_key root_key;
235
236 root_key.objectid = root;
237 root_key.type = BTRFS_ROOT_ITEM_KEY;
238 root_key.offset = (u64)-1;
239 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
240 if (IS_ERR(local_root)) {
241 ret = PTR_ERR(local_root);
242 goto err;
243 }
244
245 ret = inode_item_info(inum, 0, local_root, swarn->path);
246 if (ret) {
247 btrfs_release_path(swarn->path);
248 goto err;
249 }
250
251 eb = swarn->path->nodes[0];
252 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
253 struct btrfs_inode_item);
254 isize = btrfs_inode_size(eb, inode_item);
255 nlink = btrfs_inode_nlink(eb, inode_item);
256 btrfs_release_path(swarn->path);
257
258 ipath = init_ipath(4096, local_root, swarn->path);
259 if (IS_ERR(ipath)) {
260 ret = PTR_ERR(ipath);
261 ipath = NULL;
262 goto err;
263 }
264 ret = paths_from_inode(inum, ipath);
265
266 if (ret < 0)
267 goto err;
268
269 /*
 270 * we deliberately ignore the fact that ipath might have been too small to
271 * hold all of the paths here
272 */
273 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
274 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
275 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
276 "length %llu, links %u (path: %s)\n", swarn->errstr,
277 swarn->logical, swarn->dev->name,
278 (unsigned long long)swarn->sector, root, inum, offset,
279 min(isize - offset, (u64)PAGE_SIZE), nlink,
280 (char *)(unsigned long)ipath->fspath->val[i]);
281
282 free_ipath(ipath);
283 return 0;
284
285err:
286 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
287 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
288 "resolving failed with ret=%d\n", swarn->errstr,
289 swarn->logical, swarn->dev->name,
290 (unsigned long long)swarn->sector, root, inum, offset, ret);
291
292 free_ipath(ipath);
293 return 0;
294}
295
296static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
297 int ix)
298{
299 struct btrfs_device *dev = sbio->sdev->dev;
300 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
301 struct btrfs_path *path;
302 struct btrfs_key found_key;
303 struct extent_buffer *eb;
304 struct btrfs_extent_item *ei;
305 struct scrub_warning swarn;
306 u32 item_size;
307 int ret;
308 u64 ref_root;
309 u8 ref_level;
310 unsigned long ptr = 0;
311 const int bufsize = 4096;
312 u64 extent_offset;
313
314 path = btrfs_alloc_path();
315
316 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
317 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
318 swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
319 swarn.logical = sbio->logical + ix * PAGE_SIZE;
320 swarn.errstr = errstr;
321 swarn.dev = dev;
322 swarn.msg_bufsize = bufsize;
323 swarn.scratch_bufsize = bufsize;
324
325 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
326 goto out;
327
328 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
329 if (ret < 0)
330 goto out;
331
332 extent_offset = swarn.logical - found_key.objectid;
333 swarn.extent_item_size = found_key.offset;
334
335 eb = path->nodes[0];
336 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
337 item_size = btrfs_item_size_nr(eb, path->slots[0]);
338
339 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
340 do {
341 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
342 &ref_root, &ref_level);
343 printk(KERN_WARNING "%s at logical %llu on dev %s, "
344 "sector %llu: metadata %s (level %d) in tree "
345 "%llu\n", errstr, swarn.logical, dev->name,
346 (unsigned long long)swarn.sector,
347 ref_level ? "node" : "leaf",
348 ret < 0 ? -1 : ref_level,
349 ret < 0 ? -1 : ref_root);
350 } while (ret != 1);
351 } else {
352 swarn.path = path;
353 iterate_extent_inodes(fs_info, path, found_key.objectid,
354 extent_offset,
355 scrub_print_warning_inode, &swarn);
356 }
357
358out:
359 btrfs_free_path(path);
360 kfree(swarn.scratch_buf);
361 kfree(swarn.msg_buf);
362}
363
364static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
365{
366 struct page *page = NULL;
367 unsigned long index;
368 struct scrub_fixup_nodatasum *fixup = ctx;
369 int ret;
370 int corrected = 0;
371 struct btrfs_key key;
372 struct inode *inode = NULL;
373 u64 end = offset + PAGE_SIZE - 1;
374 struct btrfs_root *local_root;
375
376 key.objectid = root;
377 key.type = BTRFS_ROOT_ITEM_KEY;
378 key.offset = (u64)-1;
379 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
380 if (IS_ERR(local_root))
381 return PTR_ERR(local_root);
382
383 key.type = BTRFS_INODE_ITEM_KEY;
384 key.objectid = inum;
385 key.offset = 0;
386 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
387 if (IS_ERR(inode))
388 return PTR_ERR(inode);
389
390 index = offset >> PAGE_CACHE_SHIFT;
391
392 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
393 if (!page) {
394 ret = -ENOMEM;
395 goto out;
396 }
397
398 if (PageUptodate(page)) {
399 struct btrfs_mapping_tree *map_tree;
400 if (PageDirty(page)) {
401 /*
402 * we need to write the data to the defect sector. the
403 * data that was in that sector is not in memory,
404 * because the page was modified. we must not write the
405 * modified page to that sector.
406 *
407 * TODO: what could be done here: wait for the delalloc
408 * runner to write out that page (might involve
409 * COW) and see whether the sector is still
410 * referenced afterwards.
411 *
 412 * For the time being, we'll treat this error as
 413 * uncorrectable, although there is a chance that a
414 * later scrub will find the bad sector again and that
415 * there's no dirty page in memory, then.
416 */
417 ret = -EIO;
418 goto out;
419 }
420 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
421 ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
422 fixup->logical, page,
423 fixup->mirror_num);
424 unlock_page(page);
425 corrected = !ret;
426 } else {
427 /*
428 * we need to get good data first. the general readpage path
429 * will call repair_io_failure for us, we just have to make
430 * sure we read the bad mirror.
431 */
432 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
433 EXTENT_DAMAGED, GFP_NOFS);
434 if (ret) {
435 /* set_extent_bits should give proper error */
436 WARN_ON(ret > 0);
437 if (ret > 0)
438 ret = -EFAULT;
439 goto out;
440 }
441
442 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
443 btrfs_get_extent,
444 fixup->mirror_num);
445 wait_on_page_locked(page);
446
447 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
448 end, EXTENT_DAMAGED, 0, NULL);
449 if (!corrected)
450 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
451 EXTENT_DAMAGED, GFP_NOFS);
452 }
453
454out:
455 if (page)
456 put_page(page);
457 if (inode)
458 iput(inode);
459
460 if (ret < 0)
461 return ret;
462
463 if (ret == 0 && corrected) {
464 /*
465 * we only need to call readpage for one of the inodes belonging
466 * to this extent. so make iterate_extent_inodes stop
467 */
468 return 1;
469 }
470
471 return -EIO;
472}
473
474static void scrub_fixup_nodatasum(struct btrfs_work *work)
475{
476 int ret;
477 struct scrub_fixup_nodatasum *fixup;
478 struct scrub_dev *sdev;
479 struct btrfs_trans_handle *trans = NULL;
480 struct btrfs_fs_info *fs_info;
481 struct btrfs_path *path;
482 int uncorrectable = 0;
483
484 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
485 sdev = fixup->sdev;
486 fs_info = fixup->root->fs_info;
487
488 path = btrfs_alloc_path();
489 if (!path) {
490 spin_lock(&sdev->stat_lock);
491 ++sdev->stat.malloc_errors;
492 spin_unlock(&sdev->stat_lock);
493 uncorrectable = 1;
494 goto out;
495 }
496
497 trans = btrfs_join_transaction(fixup->root);
498 if (IS_ERR(trans)) {
499 uncorrectable = 1;
500 goto out;
501 }
502
503 /*
504 * the idea is to trigger a regular read through the standard path. we
505 * read a page from the (failed) logical address by specifying the
506 * corresponding copynum of the failed sector. thus, that readpage is
507 * expected to fail.
508 * that is the point where on-the-fly error correction will kick in
509 * (once it's finished) and rewrite the failed sector if a good copy
510 * can be found.
511 */
512 ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
513 path, scrub_fixup_readpage,
514 fixup);
515 if (ret < 0) {
516 uncorrectable = 1;
517 goto out;
518 }
519 WARN_ON(ret != 1);
520
521 spin_lock(&sdev->stat_lock);
522 ++sdev->stat.corrected_errors;
523 spin_unlock(&sdev->stat_lock);
524
525out:
526 if (trans && !IS_ERR(trans))
527 btrfs_end_transaction(trans, fixup->root);
528 if (uncorrectable) {
529 spin_lock(&sdev->stat_lock);
530 ++sdev->stat.uncorrectable_errors;
531 spin_unlock(&sdev->stat_lock);
532 printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
533 "(nodatasum) error at logical %llu\n",
534 fixup->logical);
535 }
536
537 btrfs_free_path(path);
538 kfree(fixup);
539
 540 /* see the caller for why we pretend to be paused in the scrub counters */
541 mutex_lock(&fs_info->scrub_lock);
542 atomic_dec(&fs_info->scrubs_running);
543 atomic_dec(&fs_info->scrubs_paused);
544 mutex_unlock(&fs_info->scrub_lock);
545 atomic_dec(&sdev->fixup_cnt);
546 wake_up(&fs_info->scrub_pause_wait);
547 wake_up(&sdev->list_wait);
548}
549
198/* 550/*
199 * scrub_recheck_error gets called when either verification of the page 551 * scrub_recheck_error gets called when either verification of the page
200 * failed or the bio failed to read, e.g. with EIO. In the latter case, 552 * failed or the bio failed to read, e.g. with EIO. In the latter case,
201 * recheck_error gets called for every page in the bio, even though only 553 * recheck_error gets called for every page in the bio, even though only
202 * one may be bad 554 * one may be bad
203 */ 555 */
204static void scrub_recheck_error(struct scrub_bio *sbio, int ix) 556static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
205{ 557{
558 struct scrub_dev *sdev = sbio->sdev;
559 u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
560 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
561 DEFAULT_RATELIMIT_BURST);
562
206 if (sbio->err) { 563 if (sbio->err) {
207 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, 564 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
208 (sbio->physical + ix * PAGE_SIZE) >> 9,
209 sbio->bio->bi_io_vec[ix].bv_page) == 0) { 565 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
210 if (scrub_fixup_check(sbio, ix) == 0) 566 if (scrub_fixup_check(sbio, ix) == 0)
211 return; 567 return 0;
212 } 568 }
569 if (__ratelimit(&_rs))
570 scrub_print_warning("i/o error", sbio, ix);
571 } else {
572 if (__ratelimit(&_rs))
573 scrub_print_warning("checksum error", sbio, ix);
213 } 574 }
214 575
576 spin_lock(&sdev->stat_lock);
577 ++sdev->stat.read_errors;
578 spin_unlock(&sdev->stat_lock);
579
215 scrub_fixup(sbio, ix); 580 scrub_fixup(sbio, ix);
581 return 1;
216} 582}
217 583
218static int scrub_fixup_check(struct scrub_bio *sbio, int ix) 584static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@ -250,7 +616,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
250 struct scrub_dev *sdev = sbio->sdev; 616 struct scrub_dev *sdev = sbio->sdev;
251 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 617 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
252 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 618 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
253 struct btrfs_multi_bio *multi = NULL; 619 struct btrfs_bio *bbio = NULL;
620 struct scrub_fixup_nodatasum *fixup;
254 u64 logical = sbio->logical + ix * PAGE_SIZE; 621 u64 logical = sbio->logical + ix * PAGE_SIZE;
255 u64 length; 622 u64 length;
256 int i; 623 int i;
@@ -259,38 +626,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
259 626
260 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && 627 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
261 (sbio->spag[ix].have_csum == 0)) { 628 (sbio->spag[ix].have_csum == 0)) {
629 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
630 if (!fixup)
631 goto uncorrectable;
632 fixup->sdev = sdev;
633 fixup->logical = logical;
634 fixup->root = fs_info->extent_root;
635 fixup->mirror_num = sbio->spag[ix].mirror_num;
262 /* 636 /*
263 * nodatasum, don't try to fix anything 637 * increment scrubs_running to prevent cancel requests from
264 * FIXME: we can do better, open the inode and trigger a 638 * completing as long as a fixup worker is running. we must also
265 * writeback 639 * increment scrubs_paused to prevent deadlocking on pause
640 * requests used for transactions commits (as the worker uses a
641 * transaction context). it is safe to regard the fixup worker
642 * as paused for all matters practical. effectively, we only
643 * avoid cancellation requests from completing.
266 */ 644 */
267 goto uncorrectable; 645 mutex_lock(&fs_info->scrub_lock);
646 atomic_inc(&fs_info->scrubs_running);
647 atomic_inc(&fs_info->scrubs_paused);
648 mutex_unlock(&fs_info->scrub_lock);
649 atomic_inc(&sdev->fixup_cnt);
650 fixup->work.func = scrub_fixup_nodatasum;
651 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
652 return;
268 } 653 }
269 654
270 length = PAGE_SIZE; 655 length = PAGE_SIZE;
271 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, 656 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
272 &multi, 0); 657 &bbio, 0);
273 if (ret || !multi || length < PAGE_SIZE) { 658 if (ret || !bbio || length < PAGE_SIZE) {
274 printk(KERN_ERR 659 printk(KERN_ERR
275 "scrub_fixup: btrfs_map_block failed us for %llu\n", 660 "scrub_fixup: btrfs_map_block failed us for %llu\n",
276 (unsigned long long)logical); 661 (unsigned long long)logical);
277 WARN_ON(1); 662 WARN_ON(1);
663 kfree(bbio);
278 return; 664 return;
279 } 665 }
280 666
281 if (multi->num_stripes == 1) 667 if (bbio->num_stripes == 1)
282 /* there aren't any replicas */ 668 /* there aren't any replicas */
283 goto uncorrectable; 669 goto uncorrectable;
284 670
285 /* 671 /*
286 * first find a good copy 672 * first find a good copy
287 */ 673 */
288 for (i = 0; i < multi->num_stripes; ++i) { 674 for (i = 0; i < bbio->num_stripes; ++i) {
289 if (i == sbio->spag[ix].mirror_num) 675 if (i + 1 == sbio->spag[ix].mirror_num)
290 continue; 676 continue;
291 677
292 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, 678 if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
293 multi->stripes[i].physical >> 9, 679 bbio->stripes[i].physical >> 9,
294 sbio->bio->bi_io_vec[ix].bv_page)) { 680 sbio->bio->bi_io_vec[ix].bv_page)) {
295 /* I/O-error, this is not a good copy */ 681 /* I/O-error, this is not a good copy */
296 continue; 682 continue;
@@ -299,7 +685,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
299 if (scrub_fixup_check(sbio, ix) == 0) 685 if (scrub_fixup_check(sbio, ix) == 0)
300 break; 686 break;
301 } 687 }
302 if (i == multi->num_stripes) 688 if (i == bbio->num_stripes)
303 goto uncorrectable; 689 goto uncorrectable;
304 690
305 if (!sdev->readonly) { 691 if (!sdev->readonly) {
@@ -314,25 +700,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
314 } 700 }
315 } 701 }
316 702
317 kfree(multi); 703 kfree(bbio);
318 spin_lock(&sdev->stat_lock); 704 spin_lock(&sdev->stat_lock);
319 ++sdev->stat.corrected_errors; 705 ++sdev->stat.corrected_errors;
320 spin_unlock(&sdev->stat_lock); 706 spin_unlock(&sdev->stat_lock);
321 707
322 if (printk_ratelimit()) 708 printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
323 printk(KERN_ERR "btrfs: fixed up at %llu\n", 709 (unsigned long long)logical);
324 (unsigned long long)logical);
325 return; 710 return;
326 711
327uncorrectable: 712uncorrectable:
328 kfree(multi); 713 kfree(bbio);
329 spin_lock(&sdev->stat_lock); 714 spin_lock(&sdev->stat_lock);
330 ++sdev->stat.uncorrectable_errors; 715 ++sdev->stat.uncorrectable_errors;
331 spin_unlock(&sdev->stat_lock); 716 spin_unlock(&sdev->stat_lock);
332 717
333 if (printk_ratelimit()) 718 printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
334 printk(KERN_ERR "btrfs: unable to fixup at %llu\n", 719 "logical %llu\n", (unsigned long long)logical);
335 (unsigned long long)logical);
336} 720}
337 721
338static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, 722static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@@ -382,8 +766,14 @@ static void scrub_checksum(struct btrfs_work *work)
382 int ret; 766 int ret;
383 767
384 if (sbio->err) { 768 if (sbio->err) {
769 ret = 0;
385 for (i = 0; i < sbio->count; ++i) 770 for (i = 0; i < sbio->count; ++i)
386 scrub_recheck_error(sbio, i); 771 ret |= scrub_recheck_error(sbio, i);
772 if (!ret) {
773 spin_lock(&sdev->stat_lock);
774 ++sdev->stat.unverified_errors;
775 spin_unlock(&sdev->stat_lock);
776 }
387 777
388 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); 778 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
389 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 779 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -396,10 +786,6 @@ static void scrub_checksum(struct btrfs_work *work)
396 bi->bv_offset = 0; 786 bi->bv_offset = 0;
397 bi->bv_len = PAGE_SIZE; 787 bi->bv_len = PAGE_SIZE;
398 } 788 }
399
400 spin_lock(&sdev->stat_lock);
401 ++sdev->stat.read_errors;
402 spin_unlock(&sdev->stat_lock);
403 goto out; 789 goto out;
404 } 790 }
405 for (i = 0; i < sbio->count; ++i) { 791 for (i = 0; i < sbio->count; ++i) {
@@ -420,8 +806,14 @@ static void scrub_checksum(struct btrfs_work *work)
420 WARN_ON(1); 806 WARN_ON(1);
421 } 807 }
422 kunmap_atomic(buffer, KM_USER0); 808 kunmap_atomic(buffer, KM_USER0);
423 if (ret) 809 if (ret) {
424 scrub_recheck_error(sbio, i); 810 ret = scrub_recheck_error(sbio, i);
811 if (!ret) {
812 spin_lock(&sdev->stat_lock);
813 ++sdev->stat.unverified_errors;
814 spin_unlock(&sdev->stat_lock);
815 }
816 }
425 } 817 }
426 818
427out: 819out:
@@ -557,57 +949,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
557static int scrub_submit(struct scrub_dev *sdev) 949static int scrub_submit(struct scrub_dev *sdev)
558{ 950{
559 struct scrub_bio *sbio; 951 struct scrub_bio *sbio;
560 struct bio *bio;
561 int i;
562 952
563 if (sdev->curr == -1) 953 if (sdev->curr == -1)
564 return 0; 954 return 0;
565 955
566 sbio = sdev->bios[sdev->curr]; 956 sbio = sdev->bios[sdev->curr];
567
568 bio = bio_alloc(GFP_NOFS, sbio->count);
569 if (!bio)
570 goto nomem;
571
572 bio->bi_private = sbio;
573 bio->bi_end_io = scrub_bio_end_io;
574 bio->bi_bdev = sdev->dev->bdev;
575 bio->bi_sector = sbio->physical >> 9;
576
577 for (i = 0; i < sbio->count; ++i) {
578 struct page *page;
579 int ret;
580
581 page = alloc_page(GFP_NOFS);
582 if (!page)
583 goto nomem;
584
585 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
586 if (!ret) {
587 __free_page(page);
588 goto nomem;
589 }
590 }
591
592 sbio->err = 0; 957 sbio->err = 0;
593 sdev->curr = -1; 958 sdev->curr = -1;
594 atomic_inc(&sdev->in_flight); 959 atomic_inc(&sdev->in_flight);
595 960
596 submit_bio(READ, bio); 961 submit_bio(READ, sbio->bio);
597 962
598 return 0; 963 return 0;
599
600nomem:
601 scrub_free_bio(bio);
602
603 return -ENOMEM;
604} 964}
605 965
606static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 966static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
607 u64 physical, u64 flags, u64 gen, u64 mirror_num, 967 u64 physical, u64 flags, u64 gen, int mirror_num,
608 u8 *csum, int force) 968 u8 *csum, int force)
609{ 969{
610 struct scrub_bio *sbio; 970 struct scrub_bio *sbio;
971 struct page *page;
972 int ret;
611 973
612again: 974again:
613 /* 975 /*
@@ -628,12 +990,22 @@ again:
628 } 990 }
629 sbio = sdev->bios[sdev->curr]; 991 sbio = sdev->bios[sdev->curr];
630 if (sbio->count == 0) { 992 if (sbio->count == 0) {
993 struct bio *bio;
994
631 sbio->physical = physical; 995 sbio->physical = physical;
632 sbio->logical = logical; 996 sbio->logical = logical;
997 bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
998 if (!bio)
999 return -ENOMEM;
1000
1001 bio->bi_private = sbio;
1002 bio->bi_end_io = scrub_bio_end_io;
1003 bio->bi_bdev = sdev->dev->bdev;
1004 bio->bi_sector = sbio->physical >> 9;
1005 sbio->err = 0;
1006 sbio->bio = bio;
633 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || 1007 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
634 sbio->logical + sbio->count * PAGE_SIZE != logical) { 1008 sbio->logical + sbio->count * PAGE_SIZE != logical) {
635 int ret;
636
637 ret = scrub_submit(sdev); 1009 ret = scrub_submit(sdev);
638 if (ret) 1010 if (ret)
639 return ret; 1011 return ret;
@@ -643,6 +1015,20 @@ again:
643 sbio->spag[sbio->count].generation = gen; 1015 sbio->spag[sbio->count].generation = gen;
644 sbio->spag[sbio->count].have_csum = 0; 1016 sbio->spag[sbio->count].have_csum = 0;
645 sbio->spag[sbio->count].mirror_num = mirror_num; 1017 sbio->spag[sbio->count].mirror_num = mirror_num;
1018
1019 page = alloc_page(GFP_NOFS);
1020 if (!page)
1021 return -ENOMEM;
1022
1023 ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
1024 if (!ret) {
1025 __free_page(page);
1026 ret = scrub_submit(sdev);
1027 if (ret)
1028 return ret;
1029 goto again;
1030 }
1031
646 if (csum) { 1032 if (csum) {
647 sbio->spag[sbio->count].have_csum = 1; 1033 sbio->spag[sbio->count].have_csum = 1;
648 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); 1034 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
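With the bio now allocated lazily when the first page of a batch arrives, and pages appended one at a time, scrub_submit() loses its own allocation and error path: if bio_add_page() refuses a page, the current batch is flushed via scrub_submit() and the add is retried through the again label. A tiny runnable sketch of that fill-until-full, flush-and-retry loop, using hypothetical batch_add()/batch_flush() helpers in place of the bio API:

#include <stdio.h>

#define BATCH_MAX 4

struct batch {
        int items[BATCH_MAX];
        int count;
};

/* Mirrors the bio_add_page() contract: 0 means "no room, flush and retry",
 * non-zero means the item was accepted into the current batch. */
static int batch_add(struct batch *b, int item)
{
        if (b->count == BATCH_MAX)
                return 0;
        b->items[b->count++] = item;
        return 1;
}

static void batch_flush(struct batch *b)
{
        printf("submitting %d items\n", b->count);      /* stands in for submit_bio() */
        b->count = 0;
}

int main(void)
{
        struct batch b = { .count = 0 };

        for (int i = 0; i < 10; i++) {
again:
                if (!batch_add(&b, i)) {
                        batch_flush(&b);        /* like calling scrub_submit() on a full bio */
                        goto again;             /* retry the same page afterwards */
                }
        }
        if (b.count)
                batch_flush(&b);                /* submit the final partial batch */
        return 0;
}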
@@ -701,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
701 1087
702/* scrub extent tries to collect up to 64 kB for each bio */ 1088/* scrub extent tries to collect up to 64 kB for each bio */
703static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 1089static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
704 u64 physical, u64 flags, u64 gen, u64 mirror_num) 1090 u64 physical, u64 flags, u64 gen, int mirror_num)
705{ 1091{
706 int ret; 1092 int ret;
707 u8 csum[BTRFS_CSUM_SIZE]; 1093 u8 csum[BTRFS_CSUM_SIZE];
@@ -741,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
741 int slot; 1127 int slot;
742 int i; 1128 int i;
743 u64 nstripes; 1129 u64 nstripes;
744 int start_stripe;
745 struct extent_buffer *l; 1130 struct extent_buffer *l;
746 struct btrfs_key key; 1131 struct btrfs_key key;
747 u64 physical; 1132 u64 physical;
748 u64 logical; 1133 u64 logical;
749 u64 generation; 1134 u64 generation;
750 u64 mirror_num; 1135 int mirror_num;
1136 struct reada_control *reada1;
1137 struct reada_control *reada2;
1138 struct btrfs_key key_start;
1139 struct btrfs_key key_end;
751 1140
752 u64 increment = map->stripe_len; 1141 u64 increment = map->stripe_len;
753 u64 offset; 1142 u64 offset;
@@ -758,102 +1147,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
758 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 1147 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
759 offset = map->stripe_len * num; 1148 offset = map->stripe_len * num;
760 increment = map->stripe_len * map->num_stripes; 1149 increment = map->stripe_len * map->num_stripes;
761 mirror_num = 0; 1150 mirror_num = 1;
762 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 1151 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
763 int factor = map->num_stripes / map->sub_stripes; 1152 int factor = map->num_stripes / map->sub_stripes;
764 offset = map->stripe_len * (num / map->sub_stripes); 1153 offset = map->stripe_len * (num / map->sub_stripes);
765 increment = map->stripe_len * factor; 1154 increment = map->stripe_len * factor;
766 mirror_num = num % map->sub_stripes; 1155 mirror_num = num % map->sub_stripes + 1;
767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 1156 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
768 increment = map->stripe_len; 1157 increment = map->stripe_len;
769 mirror_num = num % map->num_stripes; 1158 mirror_num = num % map->num_stripes + 1;
770 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 1159 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
771 increment = map->stripe_len; 1160 increment = map->stripe_len;
772 mirror_num = num % map->num_stripes; 1161 mirror_num = num % map->num_stripes + 1;
773 } else { 1162 } else {
774 increment = map->stripe_len; 1163 increment = map->stripe_len;
775 mirror_num = 0; 1164 mirror_num = 1;
776 } 1165 }
777 1166
778 path = btrfs_alloc_path(); 1167 path = btrfs_alloc_path();
779 if (!path) 1168 if (!path)
780 return -ENOMEM; 1169 return -ENOMEM;
781 1170
782 path->reada = 2;
783 path->search_commit_root = 1; 1171 path->search_commit_root = 1;
784 path->skip_locking = 1; 1172 path->skip_locking = 1;
785 1173
786 /* 1174 /*
787 * find all extents for each stripe and just read them to get 1175 * trigger the readahead for extent tree csum tree and wait for
788 * them into the page cache 1176 * completion. During readahead, the scrub is officially paused
789 * FIXME: we can do better. build a more intelligent prefetching 1177 * to not hold off transaction commits
790 */ 1178 */
791 logical = base + offset; 1179 logical = base + offset;
792 physical = map->stripes[num].physical;
793 ret = 0;
794 for (i = 0; i < nstripes; ++i) {
795 key.objectid = logical;
796 key.type = BTRFS_EXTENT_ITEM_KEY;
797 key.offset = (u64)0;
798
799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
800 if (ret < 0)
801 goto out_noplug;
802
803 /*
804 * we might miss half an extent here, but that doesn't matter,
805 * as it's only the prefetch
806 */
807 while (1) {
808 l = path->nodes[0];
809 slot = path->slots[0];
810 if (slot >= btrfs_header_nritems(l)) {
811 ret = btrfs_next_leaf(root, path);
812 if (ret == 0)
813 continue;
814 if (ret < 0)
815 goto out_noplug;
816 1180
817 break; 1181 wait_event(sdev->list_wait,
818 } 1182 atomic_read(&sdev->in_flight) == 0);
819 btrfs_item_key_to_cpu(l, &key, slot); 1183 atomic_inc(&fs_info->scrubs_paused);
1184 wake_up(&fs_info->scrub_pause_wait);
820 1185
821 if (key.objectid >= logical + map->stripe_len) 1186 /* FIXME it might be better to start readahead at commit root */
822 break; 1187 key_start.objectid = logical;
1188 key_start.type = BTRFS_EXTENT_ITEM_KEY;
1189 key_start.offset = (u64)0;
1190 key_end.objectid = base + offset + nstripes * increment;
1191 key_end.type = BTRFS_EXTENT_ITEM_KEY;
1192 key_end.offset = (u64)0;
1193 reada1 = btrfs_reada_add(root, &key_start, &key_end);
1194
1195 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1196 key_start.type = BTRFS_EXTENT_CSUM_KEY;
1197 key_start.offset = logical;
1198 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1199 key_end.type = BTRFS_EXTENT_CSUM_KEY;
1200 key_end.offset = base + offset + nstripes * increment;
1201 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1202
1203 if (!IS_ERR(reada1))
1204 btrfs_reada_wait(reada1);
1205 if (!IS_ERR(reada2))
1206 btrfs_reada_wait(reada2);
823 1207
824 path->slots[0]++; 1208 mutex_lock(&fs_info->scrub_lock);
825 } 1209 while (atomic_read(&fs_info->scrub_pause_req)) {
826 btrfs_release_path(path); 1210 mutex_unlock(&fs_info->scrub_lock);
827 logical += increment; 1211 wait_event(fs_info->scrub_pause_wait,
828 physical += map->stripe_len; 1212 atomic_read(&fs_info->scrub_pause_req) == 0);
829 cond_resched(); 1213 mutex_lock(&fs_info->scrub_lock);
830 } 1214 }
1215 atomic_dec(&fs_info->scrubs_paused);
1216 mutex_unlock(&fs_info->scrub_lock);
1217 wake_up(&fs_info->scrub_pause_wait);
831 1218
832 /* 1219 /*
833 * collect all data csums for the stripe to avoid seeking during 1220 * collect all data csums for the stripe to avoid seeking during
834 * the scrub. This might currently (crc32) end up being about 1MB 1221 * the scrub. This might currently (crc32) end up being about 1MB
835 */ 1222 */
836 start_stripe = 0;
837 blk_start_plug(&plug); 1223 blk_start_plug(&plug);
838again:
839 logical = base + offset + start_stripe * increment;
840 for (i = start_stripe; i < nstripes; ++i) {
841 ret = btrfs_lookup_csums_range(csum_root, logical,
842 logical + map->stripe_len - 1,
843 &sdev->csum_list, 1);
844 if (ret)
845 goto out;
846 1224
847 logical += increment;
848 cond_resched();
849 }
850 /* 1225 /*
851 * now find all extents for each stripe and scrub them 1226 * now find all extents for each stripe and scrub them
852 */ 1227 */
853 logical = base + offset + start_stripe * increment; 1228 logical = base + offset;
854 physical = map->stripes[num].physical + start_stripe * map->stripe_len; 1229 physical = map->stripes[num].physical;
855 ret = 0; 1230 ret = 0;
856 for (i = start_stripe; i < nstripes; ++i) { 1231 for (i = 0; i < nstripes; ++i) {
857 /* 1232 /*
858 * canceled? 1233 * canceled?
859 */ 1234 */
@@ -882,11 +1257,14 @@ again:
882 atomic_dec(&fs_info->scrubs_paused); 1257 atomic_dec(&fs_info->scrubs_paused);
883 mutex_unlock(&fs_info->scrub_lock); 1258 mutex_unlock(&fs_info->scrub_lock);
884 wake_up(&fs_info->scrub_pause_wait); 1259 wake_up(&fs_info->scrub_pause_wait);
885 scrub_free_csums(sdev);
886 start_stripe = i;
887 goto again;
888 } 1260 }
889 1261
1262 ret = btrfs_lookup_csums_range(csum_root, logical,
1263 logical + map->stripe_len - 1,
1264 &sdev->csum_list, 1);
1265 if (ret)
1266 goto out;
1267
890 key.objectid = logical; 1268 key.objectid = logical;
891 key.type = BTRFS_EXTENT_ITEM_KEY; 1269 key.type = BTRFS_EXTENT_ITEM_KEY;
892 key.offset = (u64)0; 1270 key.offset = (u64)0;
@@ -982,7 +1360,6 @@ next:
982 1360
983out: 1361out:
984 blk_finish_plug(&plug); 1362 blk_finish_plug(&plug);
985out_noplug:
986 btrfs_free_path(path); 1363 btrfs_free_path(path);
987 return ret < 0 ? ret : 0; 1364 return ret < 0 ? ret : 0;
988} 1365}
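Besides the readahead-based prefetch (two btrfs_reada_add() requests over the extent tree and csum tree ranges, then btrfs_reada_wait() on whichever of the two actually started), the stripe setup above turns mirror_num into a 1-based int; 0 apparently keeps its "let the mapping code pick a copy" meaning, so RAID1/RAID10/DUP workers now target copy (num % stripes) + 1. A minimal sketch of that mapping, with an example stripe count:

#include <stdio.h>

/* mirror_num after this change is 1-based; 0 is reserved, so each scrub
 * worker addresses an explicit copy.  num_stripes is just an example value. */
int main(void)
{
        int num_stripes = 2;            /* e.g. RAID1 with two copies */

        for (int num = 0; num < 4; num++) {
                int mirror_num = num % num_stripes + 1;
                printf("scrub worker %d reads mirror %d\n", num, mirror_num);
        }
        return 0;
}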
@@ -1253,10 +1630,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1253 ret = scrub_enumerate_chunks(sdev, start, end); 1630 ret = scrub_enumerate_chunks(sdev, start, end);
1254 1631
1255 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 1632 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1256
1257 atomic_dec(&fs_info->scrubs_running); 1633 atomic_dec(&fs_info->scrubs_running);
1258 wake_up(&fs_info->scrub_pause_wait); 1634 wake_up(&fs_info->scrub_pause_wait);
1259 1635
1636 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
1637
1260 if (progress) 1638 if (progress)
1261 memcpy(progress, &sdev->stat, sizeof(*progress)); 1639 memcpy(progress, &sdev->stat, sizeof(*progress));
1262 1640
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d..e28ad4baf48 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,7 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h>
43#include "compat.h" 44#include "compat.h"
44#include "delayed-inode.h" 45#include "delayed-inode.h"
45#include "ctree.h" 46#include "ctree.h"
@@ -58,6 +59,7 @@
58#include <trace/events/btrfs.h> 59#include <trace/events/btrfs.h>
59 60
60static const struct super_operations btrfs_super_ops; 61static const struct super_operations btrfs_super_ops;
62static struct file_system_type btrfs_fs_type;
61 63
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 64static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16]) 65 char nbuf[16])
@@ -162,7 +164,7 @@ enum {
162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err, 167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
166}; 168};
167 169
168static match_table_t tokens = { 170static match_table_t tokens = {
@@ -195,6 +197,8 @@ static match_table_t tokens = {
195 {Opt_subvolrootid, "subvolrootid=%d"}, 197 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"}, 198 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"}, 199 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "nospace_cache"},
201 {Opt_recovery, "recovery"},
198 {Opt_err, NULL}, 202 {Opt_err, NULL},
199}; 203};
200 204
@@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206{ 210{
207 struct btrfs_fs_info *info = root->fs_info; 211 struct btrfs_fs_info *info = root->fs_info;
208 substring_t args[MAX_OPT_ARGS]; 212 substring_t args[MAX_OPT_ARGS];
209 char *p, *num, *orig; 213 char *p, *num, *orig = NULL;
214 u64 cache_gen;
210 int intarg; 215 int intarg;
211 int ret = 0; 216 int ret = 0;
212 char *compress_type; 217 char *compress_type;
213 bool compress_force = false; 218 bool compress_force = false;
214 219
220 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
221 if (cache_gen)
222 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
223
215 if (!options) 224 if (!options)
216 return 0; 225 goto out;
217 226
218 /* 227 /*
219 * strsep changes the string, duplicate it because parse_options 228 * strsep changes the string, duplicate it because parse_options
@@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
360 btrfs_set_opt(info->mount_opt, DISCARD); 369 btrfs_set_opt(info->mount_opt, DISCARD);
361 break; 370 break;
362 case Opt_space_cache: 371 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 372 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break; 373 break;
374 case Opt_no_space_cache:
375 printk(KERN_INFO "btrfs: disabling disk space caching\n");
376 btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
377 break;
366 case Opt_inode_cache: 378 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 379 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 380 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
381 printk(KERN_INFO "btrfs: enabling auto defrag"); 393 printk(KERN_INFO "btrfs: enabling auto defrag");
382 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 394 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
383 break; 395 break;
396 case Opt_recovery:
397 printk(KERN_INFO "btrfs: enabling auto recovery");
398 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break;
384 case Opt_err: 400 case Opt_err:
385 printk(KERN_INFO "btrfs: unrecognized mount option " 401 printk(KERN_INFO "btrfs: unrecognized mount option "
386 "'%s'\n", p); 402 "'%s'\n", p);
@@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
391 } 407 }
392 } 408 }
393out: 409out:
410 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
411 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
394 kfree(orig); 412 kfree(orig);
395 return ret; 413 return ret;
396} 414}
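The option handling above makes the free space cache the default whenever the super block carries a non-zero cache_generation, adds nospace_cache to turn it back off, and prints the "enabled" line once after parsing instead of on every space_cache occurrence. A minimal userspace sketch of that default-then-override flow; the option strings and the generation value are made up for the example:

#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned long long cache_generation = 1234;     /* pretend on-disk value */
        int space_cache = cache_generation != 0;        /* default follows the super block */
        const char *opts[] = { "compress", "nospace_cache", "autodefrag" };

        for (unsigned i = 0; i < sizeof(opts) / sizeof(opts[0]); i++) {
                if (!strcmp(opts[i], "space_cache"))
                        space_cache = 1;
                else if (!strcmp(opts[i], "nospace_cache"))
                        space_cache = 0;
        }

        if (space_cache)
                printf("btrfs: disk space caching is enabled\n");
        return 0;
}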
@@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
406 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) 424 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
407{ 425{
408 substring_t args[MAX_OPT_ARGS]; 426 substring_t args[MAX_OPT_ARGS];
409 char *opts, *orig, *p; 427 char *device_name, *opts, *orig, *p;
410 int error = 0; 428 int error = 0;
411 int intarg; 429 int intarg;
412 430
413 if (!options) 431 if (!options)
414 goto out; 432 return 0;
415 433
416 /* 434 /*
417 * strsep changes the string, duplicate it because parse_options 435 * strsep changes the string, duplicate it because parse_options
@@ -430,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
430 token = match_token(p, tokens, args); 448 token = match_token(p, tokens, args);
431 switch (token) { 449 switch (token) {
432 case Opt_subvol: 450 case Opt_subvol:
451 kfree(*subvol_name);
433 *subvol_name = match_strdup(&args[0]); 452 *subvol_name = match_strdup(&args[0]);
434 break; 453 break;
435 case Opt_subvolid: 454 case Opt_subvolid:
@@ -457,29 +476,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
457 } 476 }
458 break; 477 break;
459 case Opt_device: 478 case Opt_device:
460 error = btrfs_scan_one_device(match_strdup(&args[0]), 479 device_name = match_strdup(&args[0]);
480 if (!device_name) {
481 error = -ENOMEM;
482 goto out;
483 }
484 error = btrfs_scan_one_device(device_name,
461 flags, holder, fs_devices); 485 flags, holder, fs_devices);
486 kfree(device_name);
462 if (error) 487 if (error)
463 goto out_free_opts; 488 goto out;
464 break; 489 break;
465 default: 490 default:
466 break; 491 break;
467 } 492 }
468 } 493 }
469 494
470 out_free_opts: 495out:
471 kfree(orig); 496 kfree(orig);
472 out:
473 /*
474 * If no subvolume name is specified we use the default one. Allocate
475 * a copy of the string "." here so that code later in the
476 * mount path doesn't care if it's the default volume or another one.
477 */
478 if (!*subvol_name) {
479 *subvol_name = kstrdup(".", GFP_KERNEL);
480 if (!*subvol_name)
481 return -ENOMEM;
482 }
483 return error; 497 return error;
484} 498}
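Two small leak fixes sit in this hunk: a repeated subvol= frees the previous string before duplicating the new one, and the device= value is duplicated into a NULL-checked device_name that is freed right after btrfs_scan_one_device() returns, success or not (the old code leaked the match_strdup() result). A compact userspace sketch of both patterns, with scan_one_device() as a hypothetical stand-in:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int scan_one_device(const char *name)
{
        printf("scanning %s\n", name);
        return 0;
}

int main(void)
{
        const char *args[] = { "subvol=a", "device=/dev/sdb", "subvol=b" };
        char *subvol_name = NULL;
        int err = 0;

        for (unsigned i = 0; i < sizeof(args) / sizeof(args[0]) && !err; i++) {
                if (!strncmp(args[i], "subvol=", 7)) {
                        free(subvol_name);              /* drop any earlier subvol= value */
                        subvol_name = strdup(args[i] + 7);
                        if (!subvol_name)
                                err = -ENOMEM;
                } else if (!strncmp(args[i], "device=", 7)) {
                        char *device_name = strdup(args[i] + 7);

                        if (!device_name) {
                                err = -ENOMEM;
                                continue;               /* loop condition ends the walk */
                        }
                        err = scan_one_device(device_name);
                        free(device_name);              /* freed on success and on failure */
                }
        }

        printf("subvol=%s err=%d\n", subvol_name ? subvol_name : "(none)", err);
        free(subvol_name);
        return 0;
}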
485 499
@@ -492,7 +506,6 @@ static struct dentry *get_default_root(struct super_block *sb,
492 struct btrfs_path *path; 506 struct btrfs_path *path;
493 struct btrfs_key location; 507 struct btrfs_key location;
494 struct inode *inode; 508 struct inode *inode;
495 struct dentry *dentry;
496 u64 dir_id; 509 u64 dir_id;
497 int new = 0; 510 int new = 0;
498 511
@@ -517,7 +530,7 @@ static struct dentry *get_default_root(struct super_block *sb,
517 * will mount by default if we haven't been given a specific subvolume 530 * will mount by default if we haven't been given a specific subvolume
518 * to mount. 531 * to mount.
519 */ 532 */
520 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 533 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
521 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 534 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
522 if (IS_ERR(di)) { 535 if (IS_ERR(di)) {
523 btrfs_free_path(path); 536 btrfs_free_path(path);
@@ -566,29 +579,7 @@ setup_root:
566 return dget(sb->s_root); 579 return dget(sb->s_root);
567 } 580 }
568 581
569 if (new) { 582 return d_obtain_alias(inode);
570 const struct qstr name = { .name = "/", .len = 1 };
571
572 /*
573 * New inode, we need to make the dentry a sibling of s_root so
574 * everything gets cleaned up properly on unmount.
575 */
576 dentry = d_alloc(sb->s_root, &name);
577 if (!dentry) {
578 iput(inode);
579 return ERR_PTR(-ENOMEM);
580 }
581 d_splice_alias(inode, dentry);
582 } else {
583 /*
584 * We found the inode in cache, just find a dentry for it and
585 * put the reference to the inode we just got.
586 */
587 dentry = d_find_alias(inode);
588 iput(inode);
589 }
590
591 return dentry;
592} 583}
593 584
594static int btrfs_fill_super(struct super_block *sb, 585static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +710,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
719 seq_puts(seq, ",noacl"); 710 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE)) 711 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache"); 712 seq_puts(seq, ",space_cache");
713 else
714 seq_puts(seq, ",nospace_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE)) 715 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache"); 716 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 717 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +746,111 @@ static int btrfs_set_super(struct super_block *s, void *data)
753 return set_anon_super(s, data); 746 return set_anon_super(s, data);
754} 747}
755 748
749/*
750 * subvolumes are identified by ino 256
751 */
752static inline int is_subvolume_inode(struct inode *inode)
753{
754 if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
755 return 1;
756 return 0;
757}
758
759/*
760 * This will strip out the subvol=%s argument for an argument string and add
761 * subvolid=0 to make sure we get the actual tree root for path walking to the
762 * subvol we want.
763 */
764static char *setup_root_args(char *args)
765{
766 unsigned copied = 0;
767 unsigned len = strlen(args) + 2;
768 char *pos;
769 char *ret;
770
771 /*
772 * We need the same args as before, but minus
773 *
774 * subvol=a
775 *
776 * and add
777 *
778 * subvolid=0
779 *
780 * which is a difference of 2 characters, so we allocate strlen(args) +
781 * 2 characters.
782 */
783 ret = kzalloc(len * sizeof(char), GFP_NOFS);
784 if (!ret)
785 return NULL;
786 pos = strstr(args, "subvol=");
787
788 /* This shouldn't happen, but just in case.. */
789 if (!pos) {
790 kfree(ret);
791 return NULL;
792 }
793
794 /*
795 * The subvol=<> arg is not at the front of the string, copy everybody
796 * up to that into ret.
797 */
798 if (pos != args) {
799 *pos = '\0';
800 strcpy(ret, args);
801 copied += strlen(args);
802 pos++;
803 }
804
805 strncpy(ret + copied, "subvolid=0", len - copied);
806
807 /* Length of subvolid=0 */
808 copied += 10;
809
810 /*
811 * If there is no , after the subvol= option then we know there's no
812 * other options and we can just return.
813 */
814 pos = strchr(pos, ',');
815 if (!pos)
816 return ret;
817
818 /* Copy the rest of the arguments into our buffer */
819 strncpy(ret + copied, pos, len - copied);
820 copied += strlen(pos);
821
822 return ret;
823}
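setup_root_args() is almost plain string handling, so a userspace rendition of the same rewrite is easy to try out; the only liberty taken here is a more generous allocation (strlen(args) + strlen("subvolid=0") + 1), so the sketch does not need the tight "+ 2 characters" reasoning used above:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Userspace rendition of the rewrite above: drop "subvol=<name>" from the
 * option string and splice in "subvolid=0" so the inner mount resolves the
 * real tree root.  The allocation is a deliberately generous upper bound. */
static char *setup_root_args(const char *args)
{
        char *ret = calloc(strlen(args) + strlen("subvolid=0") + 1, 1);
        const char *pos, *rest;

        if (!ret)
                return NULL;
        pos = strstr(args, "subvol=");
        if (!pos) {                     /* mirror the kernel's "shouldn't happen" bail-out */
                free(ret);
                return NULL;
        }

        memcpy(ret, args, pos - args);  /* everything before subvol= */
        strcat(ret, "subvolid=0");
        rest = strchr(pos, ',');        /* any options that followed subvol=<name> */
        if (rest)
                strcat(ret, rest);
        return ret;
}

int main(void)
{
        char *s = setup_root_args("compress,subvol=home,space_cache");
        printf("%s\n", s ? s : "(error)");      /* compress,subvolid=0,space_cache */
        free(s);
        return 0;
}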
824
825static struct dentry *mount_subvol(const char *subvol_name, int flags,
826 const char *device_name, char *data)
827{
828 struct dentry *root;
829 struct vfsmount *mnt;
830 char *newargs;
831
832 newargs = setup_root_args(data);
833 if (!newargs)
834 return ERR_PTR(-ENOMEM);
835 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
836 newargs);
837 kfree(newargs);
838 if (IS_ERR(mnt))
839 return ERR_CAST(mnt);
840
841 root = mount_subtree(mnt, subvol_name);
842
843 if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
844 struct super_block *s = root->d_sb;
845 dput(root);
846 root = ERR_PTR(-EINVAL);
847 deactivate_locked_super(s);
848 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
849 subvol_name);
850 }
851
852 return root;
853}
756 854
757/* 855/*
758 * Find a superblock for the given device / mount point. 856 * Find a superblock for the given device / mount point.
@@ -767,7 +865,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
767 struct super_block *s; 865 struct super_block *s;
768 struct dentry *root; 866 struct dentry *root;
769 struct btrfs_fs_devices *fs_devices = NULL; 867 struct btrfs_fs_devices *fs_devices = NULL;
770 struct btrfs_root *tree_root = NULL;
771 struct btrfs_fs_info *fs_info = NULL; 868 struct btrfs_fs_info *fs_info = NULL;
772 fmode_t mode = FMODE_READ; 869 fmode_t mode = FMODE_READ;
773 char *subvol_name = NULL; 870 char *subvol_name = NULL;
@@ -781,21 +878,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
781 error = btrfs_parse_early_options(data, mode, fs_type, 878 error = btrfs_parse_early_options(data, mode, fs_type,
782 &subvol_name, &subvol_objectid, 879 &subvol_name, &subvol_objectid,
783 &subvol_rootid, &fs_devices); 880 &subvol_rootid, &fs_devices);
784 if (error) 881 if (error) {
882 kfree(subvol_name);
785 return ERR_PTR(error); 883 return ERR_PTR(error);
884 }
786 885
787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 886 if (subvol_name) {
788 if (error) 887 root = mount_subvol(subvol_name, flags, device_name, data);
789 goto error_free_subvol_name; 888 kfree(subvol_name);
889 return root;
890 }
790 891
791 error = btrfs_open_devices(fs_devices, mode, fs_type); 892 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
792 if (error) 893 if (error)
793 goto error_free_subvol_name; 894 return ERR_PTR(error);
794
795 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
796 error = -EACCES;
797 goto error_close_devices;
798 }
799 895
800 /* 896 /*
801 * Setup a dummy root and fs_info for test/set super. This is because 897 * Setup a dummy root and fs_info for test/set super. This is because
@@ -804,19 +900,40 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
804 * then open_ctree will properly initialize everything later. 900 * then open_ctree will properly initialize everything later.
805 */ 901 */
806 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); 902 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
807 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 903 if (!fs_info)
808 if (!fs_info || !tree_root) { 904 return ERR_PTR(-ENOMEM);
905
906 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
907 if (!fs_info->tree_root) {
809 error = -ENOMEM; 908 error = -ENOMEM;
810 goto error_close_devices; 909 goto error_fs_info;
811 } 910 }
812 fs_info->tree_root = tree_root; 911 fs_info->tree_root->fs_info = fs_info;
813 fs_info->fs_devices = fs_devices; 912 fs_info->fs_devices = fs_devices;
814 tree_root->fs_info = fs_info; 913
914 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
915 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
916 if (!fs_info->super_copy || !fs_info->super_for_commit) {
917 error = -ENOMEM;
918 goto error_fs_info;
919 }
920
921 error = btrfs_open_devices(fs_devices, mode, fs_type);
922 if (error)
923 goto error_fs_info;
924
925 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
926 error = -EACCES;
927 goto error_close_devices;
928 }
815 929
816 bdev = fs_devices->latest_bdev; 930 bdev = fs_devices->latest_bdev;
817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 931 s = sget(fs_type, btrfs_test_super, btrfs_set_super,
818 if (IS_ERR(s)) 932 fs_info->tree_root);
819 goto error_s; 933 if (IS_ERR(s)) {
934 error = PTR_ERR(s);
935 goto error_close_devices;
936 }
820 937
821 if (s->s_root) { 938 if (s->s_root) {
822 if ((flags ^ s->s_flags) & MS_RDONLY) { 939 if ((flags ^ s->s_flags) & MS_RDONLY) {
@@ -826,75 +943,35 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
826 } 943 }
827 944
828 btrfs_close_devices(fs_devices); 945 btrfs_close_devices(fs_devices);
829 kfree(fs_info); 946 free_fs_info(fs_info);
830 kfree(tree_root);
831 } else { 947 } else {
832 char b[BDEVNAME_SIZE]; 948 char b[BDEVNAME_SIZE];
833 949
834 s->s_flags = flags | MS_NOSEC; 950 s->s_flags = flags | MS_NOSEC;
835 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 951 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
952 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
836 error = btrfs_fill_super(s, fs_devices, data, 953 error = btrfs_fill_super(s, fs_devices, data,
837 flags & MS_SILENT ? 1 : 0); 954 flags & MS_SILENT ? 1 : 0);
838 if (error) { 955 if (error) {
839 deactivate_locked_super(s); 956 deactivate_locked_super(s);
840 goto error_free_subvol_name; 957 return ERR_PTR(error);
841 } 958 }
842 959
843 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
844 s->s_flags |= MS_ACTIVE; 960 s->s_flags |= MS_ACTIVE;
845 } 961 }
846 962
847 /* if they gave us a subvolume name bind mount into that */ 963 root = get_default_root(s, subvol_objectid);
848 if (strcmp(subvol_name, ".")) { 964 if (IS_ERR(root)) {
849 struct dentry *new_root; 965 deactivate_locked_super(s);
850 966 return root;
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
858 mutex_lock(&root->d_inode->i_mutex);
859 new_root = lookup_one_len(subvol_name, root,
860 strlen(subvol_name));
861 mutex_unlock(&root->d_inode->i_mutex);
862
863 if (IS_ERR(new_root)) {
864 dput(root);
865 deactivate_locked_super(s);
866 error = PTR_ERR(new_root);
867 goto error_free_subvol_name;
868 }
869 if (!new_root->d_inode) {
870 dput(root);
871 dput(new_root);
872 deactivate_locked_super(s);
873 error = -ENXIO;
874 goto error_free_subvol_name;
875 }
876 dput(root);
877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
885 } 967 }
886 968
887 kfree(subvol_name);
888 return root; 969 return root;
889 970
890error_s:
891 error = PTR_ERR(s);
892error_close_devices: 971error_close_devices:
893 btrfs_close_devices(fs_devices); 972 btrfs_close_devices(fs_devices);
894 kfree(fs_info); 973error_fs_info:
895 kfree(tree_root); 974 free_fs_info(fs_info);
896error_free_subvol_name:
897 kfree(subvol_name);
898 return ERR_PTR(error); 975 return ERR_PTR(error);
899} 976}
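The mount path now builds the whole fs_info (a dummy tree_root plus the freshly dynamic super_copy/super_for_commit buffers) before touching devices or sget(), and every failure funnels into labels that free exactly what exists so far, with free_fs_info() releasing the sub-buffers. A generic sketch of that allocate-up-front, single-unwind shape; struct members and helper names are illustrative only:

#include <stdio.h>
#include <stdlib.h>

struct fs_info {
        void *tree_root;
        void *super_copy;
        void *super_for_commit;
};

static void free_fs_info(struct fs_info *fi)
{
        if (!fi)
                return;
        free(fi->tree_root);
        free(fi->super_copy);
        free(fi->super_for_commit);
        free(fi);
}

static int open_devices(void) { return 0; }     /* pretend device open succeeds */
static void close_devices(void) { }

int main(void)
{
        struct fs_info *fi = calloc(1, sizeof(*fi));
        int err;

        if (!fi)
                return 1;
        fi->tree_root = calloc(1, 64);
        fi->super_copy = calloc(1, 4096);
        fi->super_for_commit = calloc(1, 4096);
        if (!fi->tree_root || !fi->super_copy || !fi->super_for_commit)
                goto error_fs_info;

        err = open_devices();
        if (err)
                goto error_fs_info;

        /* sget()/fill_super() equivalents would run here; failures after the
         * devices are open must unwind through close_devices() as well. */
        close_devices();
        free_fs_info(fi);
        puts("mounted");
        return 0;

error_fs_info:
        free_fs_info(fi);
        return 1;
}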
900 977
@@ -919,7 +996,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
919 if (root->fs_info->fs_devices->rw_devices == 0) 996 if (root->fs_info->fs_devices->rw_devices == 0)
920 return -EACCES; 997 return -EACCES;
921 998
922 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 999 if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
923 return -EINVAL; 1000 return -EINVAL;
924 1001
925 ret = btrfs_cleanup_fs_roots(root->fs_info); 1002 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -980,7 +1057,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
980 int i = 0, nr_devices; 1057 int i = 0, nr_devices;
981 int ret; 1058 int ret;
982 1059
983 nr_devices = fs_info->fs_devices->rw_devices; 1060 nr_devices = fs_info->fs_devices->open_devices;
984 BUG_ON(!nr_devices); 1061 BUG_ON(!nr_devices);
985 1062
986 devices_info = kmalloc(sizeof(*devices_info) * nr_devices, 1063 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
@@ -1002,8 +1079,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1002 else 1079 else
1003 min_stripe_size = BTRFS_STRIPE_LEN; 1080 min_stripe_size = BTRFS_STRIPE_LEN;
1004 1081
1005 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 1082 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1006 if (!device->in_fs_metadata) 1083 if (!device->in_fs_metadata || !device->bdev)
1007 continue; 1084 continue;
1008 1085
1009 avail_space = device->total_bytes - device->bytes_used; 1086 avail_space = device->total_bytes - device->bytes_used;
@@ -1085,7 +1162,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1085static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1162static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1086{ 1163{
1087 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1164 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
1088 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1165 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1089 struct list_head *head = &root->fs_info->space_info; 1166 struct list_head *head = &root->fs_info->space_info;
1090 struct btrfs_space_info *found; 1167 struct btrfs_space_info *found;
1091 u64 total_used = 0; 1168 u64 total_used = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a15..81376d94cd3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
55 struct btrfs_transaction *cur_trans; 55 struct btrfs_transaction *cur_trans;
56 56
57 spin_lock(&root->fs_info->trans_lock); 57 spin_lock(&root->fs_info->trans_lock);
58loop:
58 if (root->fs_info->trans_no_join) { 59 if (root->fs_info->trans_no_join) {
59 if (!nofail) { 60 if (!nofail) {
60 spin_unlock(&root->fs_info->trans_lock); 61 spin_unlock(&root->fs_info->trans_lock);
@@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
75 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 76 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
76 if (!cur_trans) 77 if (!cur_trans)
77 return -ENOMEM; 78 return -ENOMEM;
79
78 spin_lock(&root->fs_info->trans_lock); 80 spin_lock(&root->fs_info->trans_lock);
79 if (root->fs_info->running_transaction) { 81 if (root->fs_info->running_transaction) {
82 /*
83 * someone started a transaction after we unlocked. Make sure
84 * to redo the trans_no_join checks above
85 */
80 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 86 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
81 cur_trans = root->fs_info->running_transaction; 87 cur_trans = root->fs_info->running_transaction;
82 atomic_inc(&cur_trans->use_count); 88 goto loop;
83 atomic_inc(&cur_trans->num_writers);
84 cur_trans->num_joined++;
85 spin_unlock(&root->fs_info->trans_lock);
86 return 0;
87 } 89 }
90
88 atomic_set(&cur_trans->num_writers, 1); 91 atomic_set(&cur_trans->num_writers, 1);
89 cur_trans->num_joined = 0; 92 cur_trans->num_joined = 0;
90 init_waitqueue_head(&cur_trans->writer_wait); 93 init_waitqueue_head(&cur_trans->writer_wait);
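The join_transaction() rework above is the classic drop-the-lock-to-allocate pattern: the GFP_NOFS allocation happens without trans_lock held, and when the lock is retaken and somebody else has meanwhile started a transaction, the allocation is freed and control jumps back to loop so the trans_no_join check is re-run too (the old code joined directly and skipped that recheck). A userspace sketch of the shape, with a pthread mutex standing in for the spinlock and simple globals for the shared state:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int blocked;             /* stands in for fs_info->trans_no_join */
static void *running;           /* stands in for fs_info->running_transaction */

static void *join_transaction(void)
{
        void *mine;

        pthread_mutex_lock(&lock);
loop:
        if (blocked) {
                pthread_mutex_unlock(&lock);
                return NULL;                    /* caller would wait and retry */
        }
        if (running) {
                void *cur = running;            /* join the existing transaction */
                pthread_mutex_unlock(&lock);
                return cur;
        }
        pthread_mutex_unlock(&lock);

        mine = malloc(64);                      /* allocation done without the lock */
        if (!mine)
                return NULL;

        pthread_mutex_lock(&lock);
        if (running) {
                free(mine);                     /* raced with another starter */
                goto loop;                      /* redo the blocked check as well */
        }
        running = mine;
        pthread_mutex_unlock(&lock);
        return mine;
}

int main(void)
{
        printf("transaction %p\n", join_transaction());
        return 0;
}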
@@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
275 */ 278 */
276 if (num_items > 0 && root != root->fs_info->chunk_root) { 279 if (num_items > 0 && root != root->fs_info->chunk_root) {
277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 280 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278 ret = btrfs_block_rsv_add(NULL, root, 281 ret = btrfs_block_rsv_add(root,
279 &root->fs_info->trans_block_rsv, 282 &root->fs_info->trans_block_rsv,
280 num_bytes); 283 num_bytes);
281 if (ret) 284 if (ret)
@@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 421 struct btrfs_root *root)
419{ 422{
420 int ret; 423 int ret;
421 ret = btrfs_block_rsv_check(trans, root, 424
422 &root->fs_info->global_block_rsv, 0, 5); 425 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
423 return ret ? 1 : 0; 426 return ret ? 1 : 0;
424} 427}
425 428
@@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
427 struct btrfs_root *root) 430 struct btrfs_root *root)
428{ 431{
429 struct btrfs_transaction *cur_trans = trans->transaction; 432 struct btrfs_transaction *cur_trans = trans->transaction;
433 struct btrfs_block_rsv *rsv = trans->block_rsv;
430 int updates; 434 int updates;
431 435
432 smp_mb(); 436 smp_mb();
433 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 437 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
434 return 1; 438 return 1;
435 439
440 /*
441 * We need to do this in case we're deleting csums so the global block
442 * rsv get's used instead of the csum block rsv.
443 */
444 trans->block_rsv = NULL;
445
436 updates = trans->delayed_ref_updates; 446 updates = trans->delayed_ref_updates;
437 trans->delayed_ref_updates = 0; 447 trans->delayed_ref_updates = 0;
438 if (updates) 448 if (updates)
439 btrfs_run_delayed_refs(trans, root, updates); 449 btrfs_run_delayed_refs(trans, root, updates);
440 450
451 trans->block_rsv = rsv;
452
441 return should_end_transaction(trans, root); 453 return should_end_transaction(trans, root);
442} 454}
443 455
@@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
453 return 0; 465 return 0;
454 } 466 }
455 467
468 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL;
456 while (count < 4) { 470 while (count < 4) {
457 unsigned long cur = trans->delayed_ref_updates; 471 unsigned long cur = trans->delayed_ref_updates;
458 trans->delayed_ref_updates = 0; 472 trans->delayed_ref_updates = 0;
@@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
473 count++; 487 count++;
474 } 488 }
475 489
476 btrfs_trans_release_metadata(trans, root);
477
478 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 490 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
479 should_end_transaction(trans, root)) { 491 should_end_transaction(trans, root)) {
480 trans->transaction->blocked = 1; 492 trans->transaction->blocked = 1;
@@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562int btrfs_write_marked_extents(struct btrfs_root *root, 574int btrfs_write_marked_extents(struct btrfs_root *root,
563 struct extent_io_tree *dirty_pages, int mark) 575 struct extent_io_tree *dirty_pages, int mark)
564{ 576{
565 int ret;
566 int err = 0; 577 int err = 0;
567 int werr = 0; 578 int werr = 0;
568 struct page *page; 579 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
569 struct inode *btree_inode = root->fs_info->btree_inode;
570 u64 start = 0; 580 u64 start = 0;
571 u64 end; 581 u64 end;
572 unsigned long index;
573
574 while (1) {
575 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
576 mark);
577 if (ret)
578 break;
579 while (start <= end) {
580 cond_resched();
581
582 index = start >> PAGE_CACHE_SHIFT;
583 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
584 page = find_get_page(btree_inode->i_mapping, index);
585 if (!page)
586 continue;
587
588 btree_lock_page_hook(page);
589 if (!page->mapping) {
590 unlock_page(page);
591 page_cache_release(page);
592 continue;
593 }
594 582
595 if (PageWriteback(page)) { 583 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
596 if (PageDirty(page)) 584 mark)) {
597 wait_on_page_writeback(page); 585 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
598 else { 586 GFP_NOFS);
599 unlock_page(page); 587 err = filemap_fdatawrite_range(mapping, start, end);
600 page_cache_release(page); 588 if (err)
601 continue; 589 werr = err;
602 } 590 cond_resched();
603 } 591 start = end + 1;
604 err = write_one_page(page, 0);
605 if (err)
606 werr = err;
607 page_cache_release(page);
608 }
609 } 592 }
610 if (err) 593 if (err)
611 werr = err; 594 werr = err;
@@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
621int btrfs_wait_marked_extents(struct btrfs_root *root, 604int btrfs_wait_marked_extents(struct btrfs_root *root,
622 struct extent_io_tree *dirty_pages, int mark) 605 struct extent_io_tree *dirty_pages, int mark)
623{ 606{
624 int ret;
625 int err = 0; 607 int err = 0;
626 int werr = 0; 608 int werr = 0;
627 struct page *page; 609 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
628 struct inode *btree_inode = root->fs_info->btree_inode;
629 u64 start = 0; 610 u64 start = 0;
630 u64 end; 611 u64 end;
631 unsigned long index;
632 612
633 while (1) { 613 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
634 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 614 EXTENT_NEED_WAIT)) {
635 mark); 615 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
636 if (ret) 616 err = filemap_fdatawait_range(mapping, start, end);
637 break; 617 if (err)
638 618 werr = err;
639 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 619 cond_resched();
640 while (start <= end) { 620 start = end + 1;
641 index = start >> PAGE_CACHE_SHIFT;
642 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
643 page = find_get_page(btree_inode->i_mapping, index);
644 if (!page)
645 continue;
646 if (PageDirty(page)) {
647 btree_lock_page_hook(page);
648 wait_on_page_writeback(page);
649 err = write_one_page(page, 0);
650 if (err)
651 werr = err;
652 }
653 wait_on_page_writeback(page);
654 page_cache_release(page);
655 cond_resched();
656 }
657 } 621 }
658 if (err) 622 if (err)
659 werr = err; 623 werr = err;
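Both helpers above stop walking the btree inode's pages by hand: the write side tags each dirty range EXTENT_NEED_WAIT and kicks filemap_fdatawrite_range(), and the wait side later clears the tag and calls filemap_fdatawait_range(). The closest userspace analogue is sync_file_range(), which likewise lets you start writeback on a byte range in one pass and wait for it in a second; a small Linux-only sketch, with an arbitrary file name and ranges:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        int fd = open("marked-extents.tmp", O_RDWR | O_CREAT | O_TRUNC, 0644);
        off_t ranges[][2] = { { 0, 4096 }, { 1 << 20, 4096 } };
        char page[4096];

        if (fd < 0)
                return 1;
        memset(page, 0xaa, sizeof(page));
        for (unsigned i = 0; i < 2; i++)                /* dirty two separate ranges */
                pwrite(fd, page, sizeof(page), ranges[i][0]);

        for (unsigned i = 0; i < 2; i++)                /* start writeback, don't wait yet */
                sync_file_range(fd, ranges[i][0], ranges[i][1],
                                SYNC_FILE_RANGE_WRITE);
        for (unsigned i = 0; i < 2; i++)                /* second pass: wait for completion */
                sync_file_range(fd, ranges[i][0], ranges[i][1],
                                SYNC_FILE_RANGE_WAIT_AFTER);

        close(fd);
        unlink("marked-extents.tmp");
        return 0;
}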
@@ -673,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
673 637
674 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 638 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
675 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 639 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
676 return ret || ret2; 640
641 if (ret)
642 return ret;
643 if (ret2)
644 return ret2;
645 return 0;
677} 646}
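The return change above is worth calling out: ret || ret2 collapses any errno into 1, while the rewritten form hands the first real error code back to the caller. In miniature:

#include <stdio.h>

static int combine_old(int a, int b) { return a || b; }         /* -EIO becomes 1 */
static int combine_new(int a, int b) { return a ? a : b; }      /* the errno survives */

int main(void)
{
        printf("old: %d  new: %d\n", combine_old(-5, 0), combine_new(-5, 0));
        return 0;
}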
678 647
679int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 648int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@@ -816,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
816 785
817 btrfs_save_ino_cache(root, trans); 786 btrfs_save_ino_cache(root, trans);
818 787
788 /* see comments in should_cow_block() */
789 root->force_cow = 0;
790 smp_wmb();
791
819 if (root->commit_root != root->node) { 792 if (root->commit_root != root->node) {
820 mutex_lock(&root->fs_commit_mutex); 793 mutex_lock(&root->fs_commit_mutex);
821 switch_commit_root(root); 794 switch_commit_root(root);
@@ -911,11 +884,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
911 } 884 }
912 885
913 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 886 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
914 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
915 887
916 if (to_reserve > 0) { 888 if (to_reserve > 0) {
917 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 889 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
918 to_reserve); 890 to_reserve);
919 if (ret) { 891 if (ret) {
920 pending->error = ret; 892 pending->error = ret;
921 goto fail; 893 goto fail;
@@ -979,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
979 btrfs_tree_unlock(old); 951 btrfs_tree_unlock(old);
980 free_extent_buffer(old); 952 free_extent_buffer(old);
981 953
954 /* see comments in should_cow_block() */
955 root->force_cow = 1;
956 smp_wmb();
957
982 btrfs_set_root_node(new_root_item, tmp); 958 btrfs_set_root_node(new_root_item, tmp);
983 /* record when the snapshot was created in key.offset */ 959 /* record when the snapshot was created in key.offset */
984 key.offset = trans->transid; 960 key.offset = trans->transid;
@@ -1002,7 +978,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1002 BUG_ON(IS_ERR(pending->snap)); 978 BUG_ON(IS_ERR(pending->snap));
1003 979
1004 btrfs_reloc_post_snapshot(trans, pending); 980 btrfs_reloc_post_snapshot(trans, pending);
1005 btrfs_orphan_post_snapshot(trans, pending);
1006fail: 981fail:
1007 kfree(new_root_item); 982 kfree(new_root_item);
1008 trans->block_rsv = rsv; 983 trans->block_rsv = rsv;
@@ -1032,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root)
1032 struct btrfs_root_item *root_item; 1007 struct btrfs_root_item *root_item;
1033 struct btrfs_super_block *super; 1008 struct btrfs_super_block *super;
1034 1009
1035 super = &root->fs_info->super_copy; 1010 super = root->fs_info->super_copy;
1036 1011
1037 root_item = &root->fs_info->chunk_root->root_item; 1012 root_item = &root->fs_info->chunk_root->root_item;
1038 super->chunk_root = root_item->bytenr; 1013 super->chunk_root = root_item->bytenr;
@@ -1043,7 +1018,7 @@ static void update_super_roots(struct btrfs_root *root)
1043 super->root = root_item->bytenr; 1018 super->root = root_item->bytenr;
1044 super->generation = root_item->generation; 1019 super->generation = root_item->generation;
1045 super->root_level = root_item->level; 1020 super->root_level = root_item->level;
1046 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 1021 if (btrfs_test_opt(root, SPACE_CACHE))
1047 super->cache_generation = root_item->generation; 1022 super->cache_generation = root_item->generation;
1048} 1023}
1049 1024
@@ -1168,14 +1143,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1168 1143
1169 btrfs_run_ordered_operations(root, 0); 1144 btrfs_run_ordered_operations(root, 0);
1170 1145
1146 btrfs_trans_release_metadata(trans, root);
1147 trans->block_rsv = NULL;
1148
1171 /* make a pass through all the delayed refs we have so far 1149 /* make a pass through all the delayed refs we have so far
1172 * any running procs may add more while we are here 1150 * any running procs may add more while we are here
1173 */ 1151 */
1174 ret = btrfs_run_delayed_refs(trans, root, 0); 1152 ret = btrfs_run_delayed_refs(trans, root, 0);
1175 BUG_ON(ret); 1153 BUG_ON(ret);
1176 1154
1177 btrfs_trans_release_metadata(trans, root);
1178
1179 cur_trans = trans->transaction; 1155 cur_trans = trans->transaction;
1180 /* 1156 /*
1181 * set the flushing flag so procs in this transaction have to 1157 * set the flushing flag so procs in this transaction have to
@@ -1341,12 +1317,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1341 update_super_roots(root); 1317 update_super_roots(root);
1342 1318
1343 if (!root->fs_info->log_root_recovering) { 1319 if (!root->fs_info->log_root_recovering) {
1344 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1320 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
1345 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); 1321 btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1346 } 1322 }
1347 1323
1348 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, 1324 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1349 sizeof(root->fs_info->super_copy)); 1325 sizeof(*root->fs_info->super_copy));
1350 1326
1351 trans->transaction->blocked = 0; 1327 trans->transaction->blocked = 0;
1352 spin_lock(&root->fs_info->trans_lock); 1328 spin_lock(&root->fs_info->trans_lock);
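The memcpy adjustment above is a direct consequence of super_copy becoming a pointer: sizeof(root->fs_info->super_copy) would now measure the pointer, not the superblock buffer, so the copy length has to come from sizeof(*...). A two-struct illustration; the type names are invented for the sketch:

#include <stdio.h>

struct super_copy_buf { char data[4096]; };

struct fs_info_embedded { struct super_copy_buf super_copy; };
struct fs_info_dynamic  { struct super_copy_buf *super_copy; };

int main(void)
{
        printf("embedded member: %zu bytes\n",
               sizeof(((struct fs_info_embedded *)0)->super_copy));
        printf("pointer member:  %zu bytes\n",
               sizeof(((struct fs_info_dynamic *)0)->super_copy));
        printf("pointed-to buf:  %zu bytes\n",
               sizeof(*((struct fs_info_dynamic *)0)->super_copy));
        return 0;
}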
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 0618aa39740..3568374d419 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
276 struct walk_control *wc, u64 gen) 276 struct walk_control *wc, u64 gen)
277{ 277{
278 if (wc->pin) 278 if (wc->pin)
279 btrfs_pin_extent(log->fs_info->extent_root, 279 btrfs_pin_extent_for_log_replay(wc->trans,
280 eb->start, eb->len, 0); 280 log->fs_info->extent_root,
281 eb->start, eb->len);
281 282
282 if (btrfs_buffer_uptodate(eb, gen)) { 283 if (btrfs_buffer_uptodate(eb, gen)) {
283 if (wc->write) 284 if (wc->write)
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1760 1761
1761 WARN_ON(root_owner != 1762 WARN_ON(root_owner !=
1762 BTRFS_TREE_LOG_OBJECTID); 1763 BTRFS_TREE_LOG_OBJECTID);
1763 ret = btrfs_free_reserved_extent(root, 1764 ret = btrfs_free_and_pin_reserved_extent(root,
1764 bytenr, blocksize); 1765 bytenr, blocksize);
1765 BUG_ON(ret); 1766 BUG_ON(ret);
1766 } 1767 }
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1828 btrfs_tree_unlock(next); 1829 btrfs_tree_unlock(next);
1829 1830
1830 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1831 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1831 ret = btrfs_free_reserved_extent(root, 1832 ret = btrfs_free_and_pin_reserved_extent(root,
1832 path->nodes[*level]->start, 1833 path->nodes[*level]->start,
1833 path->nodes[*level]->len); 1834 path->nodes[*level]->len);
1834 BUG_ON(ret); 1835 BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1897 1898
1898 WARN_ON(log->root_key.objectid != 1899 WARN_ON(log->root_key.objectid !=
1899 BTRFS_TREE_LOG_OBJECTID); 1900 BTRFS_TREE_LOG_OBJECTID);
1900 ret = btrfs_free_reserved_extent(log, next->start, 1901 ret = btrfs_free_and_pin_reserved_extent(log, next->start,
1901 next->len); 1902 next->len);
1902 BUG_ON(ret); 1903 BUG_ON(ret);
1903 } 1904 }
@@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2013 /* wait for previous tree log sync to complete */ 2014 /* wait for previous tree log sync to complete */
2014 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2015 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2015 wait_log_commit(trans, root, root->log_transid - 1); 2016 wait_log_commit(trans, root, root->log_transid - 1);
2016
2017 while (1) { 2017 while (1) {
2018 unsigned long batch = root->log_batch; 2018 unsigned long batch = root->log_batch;
2019 if (root->log_multiple_pids) { 2019 /* when we're on an ssd, just kick the log commit out */
2020 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2020 mutex_unlock(&root->log_mutex); 2021 mutex_unlock(&root->log_mutex);
2021 schedule_timeout_uninterruptible(1); 2022 schedule_timeout_uninterruptible(1);
2022 mutex_lock(&root->log_mutex); 2023 mutex_lock(&root->log_mutex);
@@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2117 BUG_ON(ret); 2118 BUG_ON(ret);
2118 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2119 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2119 2120
2120 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2121 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2121 log_root_tree->node->start); 2122 log_root_tree->node->start);
2122 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2123 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2123 btrfs_header_level(log_root_tree->node)); 2124 btrfs_header_level(log_root_tree->node));
2124 2125
2125 log_root_tree->log_batch = 0; 2126 log_root_tree->log_batch = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da6..0a8c8f8304b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path,
366 } 366 }
367 INIT_LIST_HEAD(&device->dev_alloc_list); 367 INIT_LIST_HEAD(&device->dev_alloc_list);
368 368
369 /* init readahead state */
370 spin_lock_init(&device->reada_lock);
371 device->reada_curr_zone = NULL;
372 atomic_set(&device->reada_in_flight, 0);
373 device->reada_next = 0;
374 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
375 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
376
369 mutex_lock(&fs_devices->device_list_mutex); 377 mutex_lock(&fs_devices->device_list_mutex);
370 list_add_rcu(&device->dev_list, &fs_devices->devices); 378 list_add_rcu(&device->dev_list, &fs_devices->devices);
371 mutex_unlock(&fs_devices->device_list_mutex); 379 mutex_unlock(&fs_devices->device_list_mutex);
@@ -597,10 +605,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
597 set_blocksize(bdev, 4096); 605 set_blocksize(bdev, 4096);
598 606
599 bh = btrfs_read_dev_super(bdev); 607 bh = btrfs_read_dev_super(bdev);
600 if (!bh) { 608 if (!bh)
601 ret = -EINVAL;
602 goto error_close; 609 goto error_close;
603 }
604 610
605 disk_super = (struct btrfs_super_block *)bh->b_data; 611 disk_super = (struct btrfs_super_block *)bh->b_data;
606 devid = btrfs_stack_device_id(&disk_super->dev_item); 612 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +661,7 @@ error:
655 continue; 661 continue;
656 } 662 }
657 if (fs_devices->open_devices == 0) { 663 if (fs_devices->open_devices == 0) {
658 ret = -EIO; 664 ret = -EINVAL;
659 goto out; 665 goto out;
660 } 666 }
661 fs_devices->seeding = seeding; 667 fs_devices->seeding = seeding;
@@ -993,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
993 key.objectid = device->devid; 999 key.objectid = device->devid;
994 key.offset = start; 1000 key.offset = start;
995 key.type = BTRFS_DEV_EXTENT_KEY; 1001 key.type = BTRFS_DEV_EXTENT_KEY;
996 1002again:
997 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1003 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
998 if (ret > 0) { 1004 if (ret > 0) {
999 ret = btrfs_previous_item(root, path, key.objectid, 1005 ret = btrfs_previous_item(root, path, key.objectid,
@@ -1006,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1006 struct btrfs_dev_extent); 1012 struct btrfs_dev_extent);
1007 BUG_ON(found_key.offset > start || found_key.offset + 1013 BUG_ON(found_key.offset > start || found_key.offset +
1008 btrfs_dev_extent_length(leaf, extent) < start); 1014 btrfs_dev_extent_length(leaf, extent) < start);
1015 key = found_key;
1016 btrfs_release_path(path);
1017 goto again;
1009 } else if (ret == 0) { 1018 } else if (ret == 0) {
1010 leaf = path->nodes[0]; 1019 leaf = path->nodes[0];
1011 extent = btrfs_item_ptr(leaf, path->slots[0], 1020 extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1013,8 +1022,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1013 } 1022 }
1014 BUG_ON(ret); 1023 BUG_ON(ret);
1015 1024
1016 if (device->bytes_used > 0) 1025 if (device->bytes_used > 0) {
1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1026 u64 len = btrfs_dev_extent_length(leaf, extent);
1027 device->bytes_used -= len;
1028 spin_lock(&root->fs_info->free_chunk_lock);
1029 root->fs_info->free_chunk_space += len;
1030 spin_unlock(&root->fs_info->free_chunk_lock);
1031 }
1018 ret = btrfs_del_item(trans, root, path); 1032 ret = btrfs_del_item(trans, root, path);
1019 1033
1020out: 1034out:
@@ -1356,6 +1370,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1356 if (ret) 1370 if (ret)
1357 goto error_undo; 1371 goto error_undo;
1358 1372
1373 spin_lock(&root->fs_info->free_chunk_lock);
1374 root->fs_info->free_chunk_space = device->total_bytes -
1375 device->bytes_used;
1376 spin_unlock(&root->fs_info->free_chunk_lock);
1377
1359 device->in_fs_metadata = 0; 1378 device->in_fs_metadata = 0;
1360 btrfs_scrub_cancel_dev(root, device); 1379 btrfs_scrub_cancel_dev(root, device);
1361 1380
@@ -1387,8 +1406,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1387 call_rcu(&device->rcu, free_device); 1406 call_rcu(&device->rcu, free_device);
1388 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1407 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1389 1408
1390 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1409 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1391 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1410 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1392 1411
1393 if (cur_devices->open_devices == 0) { 1412 if (cur_devices->open_devices == 0) {
1394 struct btrfs_fs_devices *fs_devices; 1413 struct btrfs_fs_devices *fs_devices;
@@ -1450,7 +1469,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1450 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1469 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1451 struct btrfs_fs_devices *old_devices; 1470 struct btrfs_fs_devices *old_devices;
1452 struct btrfs_fs_devices *seed_devices; 1471 struct btrfs_fs_devices *seed_devices;
1453 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1472 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1454 struct btrfs_device *device; 1473 struct btrfs_device *device;
1455 u64 super_flags; 1474 u64 super_flags;
1456 1475
@@ -1592,7 +1611,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1592 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1611 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1593 return -EINVAL; 1612 return -EINVAL;
1594 1613
1595 bdev = blkdev_get_by_path(device_path, FMODE_EXCL, 1614 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1596 root->fs_info->bdev_holder); 1615 root->fs_info->bdev_holder);
1597 if (IS_ERR(bdev)) 1616 if (IS_ERR(bdev))
1598 return PTR_ERR(bdev); 1617 return PTR_ERR(bdev);
@@ -1691,15 +1710,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1691 root->fs_info->fs_devices->num_can_discard++; 1710 root->fs_info->fs_devices->num_can_discard++;
1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1711 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 1712
1713 spin_lock(&root->fs_info->free_chunk_lock);
1714 root->fs_info->free_chunk_space += device->total_bytes;
1715 spin_unlock(&root->fs_info->free_chunk_lock);
1716
1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1717 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 root->fs_info->fs_devices->rotating = 1; 1718 root->fs_info->fs_devices->rotating = 1;
1696 1719
1697 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1720 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1698 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1721 btrfs_set_super_total_bytes(root->fs_info->super_copy,
1699 total_bytes + device->total_bytes); 1722 total_bytes + device->total_bytes);
1700 1723
1701 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1724 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1702 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1725 btrfs_set_super_num_devices(root->fs_info->super_copy,
1703 total_bytes + 1); 1726 total_bytes + 1);
1704 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1727 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1705 1728
@@ -1790,7 +1813,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1790 struct btrfs_device *device, u64 new_size) 1813 struct btrfs_device *device, u64 new_size)
1791{ 1814{
1792 struct btrfs_super_block *super_copy = 1815 struct btrfs_super_block *super_copy =
1793 &device->dev_root->fs_info->super_copy; 1816 device->dev_root->fs_info->super_copy;
1794 u64 old_total = btrfs_super_total_bytes(super_copy); 1817 u64 old_total = btrfs_super_total_bytes(super_copy);
1795 u64 diff = new_size - device->total_bytes; 1818 u64 diff = new_size - device->total_bytes;
1796 1819
@@ -1849,7 +1872,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1872static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1850 chunk_offset) 1873 chunk_offset)
1851{ 1874{
1852 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1875 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1853 struct btrfs_disk_key *disk_key; 1876 struct btrfs_disk_key *disk_key;
1854 struct btrfs_chunk *chunk; 1877 struct btrfs_chunk *chunk;
1855 u8 *ptr; 1878 u8 *ptr;
@@ -2175,7 +2198,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2175 bool retried = false; 2198 bool retried = false;
2176 struct extent_buffer *l; 2199 struct extent_buffer *l;
2177 struct btrfs_key key; 2200 struct btrfs_key key;
2178 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2201 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2179 u64 old_total = btrfs_super_total_bytes(super_copy); 2202 u64 old_total = btrfs_super_total_bytes(super_copy);
2180 u64 old_size = device->total_bytes; 2203 u64 old_size = device->total_bytes;
2181 u64 diff = device->total_bytes - new_size; 2204 u64 diff = device->total_bytes - new_size;
@@ -2192,8 +2215,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2192 lock_chunks(root); 2215 lock_chunks(root);
2193 2216
2194 device->total_bytes = new_size; 2217 device->total_bytes = new_size;
2195 if (device->writeable) 2218 if (device->writeable) {
2196 device->fs_devices->total_rw_bytes -= diff; 2219 device->fs_devices->total_rw_bytes -= diff;
2220 spin_lock(&root->fs_info->free_chunk_lock);
2221 root->fs_info->free_chunk_space -= diff;
2222 spin_unlock(&root->fs_info->free_chunk_lock);
2223 }
2197 unlock_chunks(root); 2224 unlock_chunks(root);
2198 2225
2199again: 2226again:
@@ -2257,6 +2284,9 @@ again:
2257 device->total_bytes = old_size; 2284 device->total_bytes = old_size;
2258 if (device->writeable) 2285 if (device->writeable)
2259 device->fs_devices->total_rw_bytes += diff; 2286 device->fs_devices->total_rw_bytes += diff;
2287 spin_lock(&root->fs_info->free_chunk_lock);
2288 root->fs_info->free_chunk_space += diff;
2289 spin_unlock(&root->fs_info->free_chunk_lock);
2260 unlock_chunks(root); 2290 unlock_chunks(root);
2261 goto done; 2291 goto done;
2262 } 2292 }
@@ -2292,7 +2322,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2292 struct btrfs_key *key, 2322 struct btrfs_key *key,
2293 struct btrfs_chunk *chunk, int item_size) 2323 struct btrfs_chunk *chunk, int item_size)
2294{ 2324{
2295 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2325 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2296 struct btrfs_disk_key disk_key; 2326 struct btrfs_disk_key disk_key;
2297 u32 array_size; 2327 u32 array_size;
2298 u8 *ptr; 2328 u8 *ptr;
@@ -2615,6 +2645,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2615 index++; 2645 index++;
2616 } 2646 }
2617 2647
2648 spin_lock(&extent_root->fs_info->free_chunk_lock);
2649 extent_root->fs_info->free_chunk_space -= (stripe_size *
2650 map->num_stripes);
2651 spin_unlock(&extent_root->fs_info->free_chunk_lock);
2652
2618 index = 0; 2653 index = 0;
2619 stripe = &chunk->stripe; 2654 stripe = &chunk->stripe;
2620 while (index < map->num_stripes) { 2655 while (index < map->num_stripes) {
@@ -2848,7 +2883,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2848 2883
2849static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2884static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2850 u64 logical, u64 *length, 2885 u64 logical, u64 *length,
2851 struct btrfs_multi_bio **multi_ret, 2886 struct btrfs_bio **bbio_ret,
2852 int mirror_num) 2887 int mirror_num)
2853{ 2888{
2854 struct extent_map *em; 2889 struct extent_map *em;
@@ -2866,18 +2901,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2866 int i; 2901 int i;
2867 int num_stripes; 2902 int num_stripes;
2868 int max_errors = 0; 2903 int max_errors = 0;
2869 struct btrfs_multi_bio *multi = NULL; 2904 struct btrfs_bio *bbio = NULL;
2870 2905
2871 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2906 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2872 stripes_allocated = 1; 2907 stripes_allocated = 1;
2873again: 2908again:
2874 if (multi_ret) { 2909 if (bbio_ret) {
2875 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2910 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2876 GFP_NOFS); 2911 GFP_NOFS);
2877 if (!multi) 2912 if (!bbio)
2878 return -ENOMEM; 2913 return -ENOMEM;
2879 2914
2880 atomic_set(&multi->error, 0); 2915 atomic_set(&bbio->error, 0);
2881 } 2916 }
2882 2917
2883 read_lock(&em_tree->lock); 2918 read_lock(&em_tree->lock);
@@ -2898,7 +2933,7 @@ again:
2898 if (mirror_num > map->num_stripes) 2933 if (mirror_num > map->num_stripes)
2899 mirror_num = 0; 2934 mirror_num = 0;
2900 2935
2901 /* if our multi bio struct is too small, back off and try again */ 2936 /* if our btrfs_bio struct is too small, back off and try again */
2902 if (rw & REQ_WRITE) { 2937 if (rw & REQ_WRITE) {
2903 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2938 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2904 BTRFS_BLOCK_GROUP_DUP)) { 2939 BTRFS_BLOCK_GROUP_DUP)) {
@@ -2917,11 +2952,11 @@ again:
2917 stripes_required = map->num_stripes; 2952 stripes_required = map->num_stripes;
2918 } 2953 }
2919 } 2954 }
2920 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2955 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2921 stripes_allocated < stripes_required) { 2956 stripes_allocated < stripes_required) {
2922 stripes_allocated = map->num_stripes; 2957 stripes_allocated = map->num_stripes;
2923 free_extent_map(em); 2958 free_extent_map(em);
2924 kfree(multi); 2959 kfree(bbio);
2925 goto again; 2960 goto again;
2926 } 2961 }
2927 stripe_nr = offset; 2962 stripe_nr = offset;
@@ -2950,7 +2985,7 @@ again:
2950 *length = em->len - offset; 2985 *length = em->len - offset;
2951 } 2986 }
2952 2987
2953 if (!multi_ret) 2988 if (!bbio_ret)
2954 goto out; 2989 goto out;
2955 2990
2956 num_stripes = 1; 2991 num_stripes = 1;
@@ -2975,13 +3010,17 @@ again:
2975 stripe_index = find_live_mirror(map, 0, 3010 stripe_index = find_live_mirror(map, 0,
2976 map->num_stripes, 3011 map->num_stripes,
2977 current->pid % map->num_stripes); 3012 current->pid % map->num_stripes);
3013 mirror_num = stripe_index + 1;
2978 } 3014 }
2979 3015
2980 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3016 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2981 if (rw & (REQ_WRITE | REQ_DISCARD)) 3017 if (rw & (REQ_WRITE | REQ_DISCARD)) {
2982 num_stripes = map->num_stripes; 3018 num_stripes = map->num_stripes;
2983 else if (mirror_num) 3019 } else if (mirror_num) {
2984 stripe_index = mirror_num - 1; 3020 stripe_index = mirror_num - 1;
3021 } else {
3022 mirror_num = 1;
3023 }
2985 3024
2986 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3025 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2987 int factor = map->num_stripes / map->sub_stripes; 3026 int factor = map->num_stripes / map->sub_stripes;
@@ -3001,6 +3040,7 @@ again:
3001 stripe_index = find_live_mirror(map, stripe_index, 3040 stripe_index = find_live_mirror(map, stripe_index,
3002 map->sub_stripes, stripe_index + 3041 map->sub_stripes, stripe_index +
3003 current->pid % map->sub_stripes); 3042 current->pid % map->sub_stripes);
3043 mirror_num = stripe_index + 1;
3004 } 3044 }
3005 } else { 3045 } else {
3006 /* 3046 /*
@@ -3009,15 +3049,16 @@ again:
3009 * stripe_index is the number of our device in the stripe array 3049 * stripe_index is the number of our device in the stripe array
3010 */ 3050 */
3011 stripe_index = do_div(stripe_nr, map->num_stripes); 3051 stripe_index = do_div(stripe_nr, map->num_stripes);
3052 mirror_num = stripe_index + 1;
3012 } 3053 }
3013 BUG_ON(stripe_index >= map->num_stripes); 3054 BUG_ON(stripe_index >= map->num_stripes);
3014 3055
3015 if (rw & REQ_DISCARD) { 3056 if (rw & REQ_DISCARD) {
3016 for (i = 0; i < num_stripes; i++) { 3057 for (i = 0; i < num_stripes; i++) {
3017 multi->stripes[i].physical = 3058 bbio->stripes[i].physical =
3018 map->stripes[stripe_index].physical + 3059 map->stripes[stripe_index].physical +
3019 stripe_offset + stripe_nr * map->stripe_len; 3060 stripe_offset + stripe_nr * map->stripe_len;
3020 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3061 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3021 3062
3022 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3063 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3023 u64 stripes; 3064 u64 stripes;
@@ -3038,16 +3079,16 @@ again:
3038 } 3079 }
3039 stripes = stripe_nr_end - 1 - j; 3080 stripes = stripe_nr_end - 1 - j;
3040 do_div(stripes, map->num_stripes); 3081 do_div(stripes, map->num_stripes);
3041 multi->stripes[i].length = map->stripe_len * 3082 bbio->stripes[i].length = map->stripe_len *
3042 (stripes - stripe_nr + 1); 3083 (stripes - stripe_nr + 1);
3043 3084
3044 if (i == 0) { 3085 if (i == 0) {
3045 multi->stripes[i].length -= 3086 bbio->stripes[i].length -=
3046 stripe_offset; 3087 stripe_offset;
3047 stripe_offset = 0; 3088 stripe_offset = 0;
3048 } 3089 }
3049 if (stripe_index == last_stripe) 3090 if (stripe_index == last_stripe)
3050 multi->stripes[i].length -= 3091 bbio->stripes[i].length -=
3051 stripe_end_offset; 3092 stripe_end_offset;
3052 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3093 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3053 u64 stripes; 3094 u64 stripes;
@@ -3072,11 +3113,11 @@ again:
3072 } 3113 }
3073 stripes = stripe_nr_end - 1 - j; 3114 stripes = stripe_nr_end - 1 - j;
3074 do_div(stripes, factor); 3115 do_div(stripes, factor);
3075 multi->stripes[i].length = map->stripe_len * 3116 bbio->stripes[i].length = map->stripe_len *
3076 (stripes - stripe_nr + 1); 3117 (stripes - stripe_nr + 1);
3077 3118
3078 if (i < map->sub_stripes) { 3119 if (i < map->sub_stripes) {
3079 multi->stripes[i].length -= 3120 bbio->stripes[i].length -=
3080 stripe_offset; 3121 stripe_offset;
3081 if (i == map->sub_stripes - 1) 3122 if (i == map->sub_stripes - 1)
3082 stripe_offset = 0; 3123 stripe_offset = 0;
@@ -3084,11 +3125,11 @@ again:
3084 if (stripe_index >= last_stripe && 3125 if (stripe_index >= last_stripe &&
3085 stripe_index <= (last_stripe + 3126 stripe_index <= (last_stripe +
3086 map->sub_stripes - 1)) { 3127 map->sub_stripes - 1)) {
3087 multi->stripes[i].length -= 3128 bbio->stripes[i].length -=
3088 stripe_end_offset; 3129 stripe_end_offset;
3089 } 3130 }
3090 } else 3131 } else
3091 multi->stripes[i].length = *length; 3132 bbio->stripes[i].length = *length;
3092 3133
3093 stripe_index++; 3134 stripe_index++;
3094 if (stripe_index == map->num_stripes) { 3135 if (stripe_index == map->num_stripes) {
@@ -3099,19 +3140,20 @@ again:
3099 } 3140 }
3100 } else { 3141 } else {
3101 for (i = 0; i < num_stripes; i++) { 3142 for (i = 0; i < num_stripes; i++) {
3102 multi->stripes[i].physical = 3143 bbio->stripes[i].physical =
3103 map->stripes[stripe_index].physical + 3144 map->stripes[stripe_index].physical +
3104 stripe_offset + 3145 stripe_offset +
3105 stripe_nr * map->stripe_len; 3146 stripe_nr * map->stripe_len;
3106 multi->stripes[i].dev = 3147 bbio->stripes[i].dev =
3107 map->stripes[stripe_index].dev; 3148 map->stripes[stripe_index].dev;
3108 stripe_index++; 3149 stripe_index++;
3109 } 3150 }
3110 } 3151 }
3111 if (multi_ret) { 3152 if (bbio_ret) {
3112 *multi_ret = multi; 3153 *bbio_ret = bbio;
3113 multi->num_stripes = num_stripes; 3154 bbio->num_stripes = num_stripes;
3114 multi->max_errors = max_errors; 3155 bbio->max_errors = max_errors;
3156 bbio->mirror_num = mirror_num;
3115 } 3157 }
3116out: 3158out:
3117 free_extent_map(em); 3159 free_extent_map(em);
@@ -3120,9 +3162,9 @@ out:
3120 3162
3121int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3163int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3122 u64 logical, u64 *length, 3164 u64 logical, u64 *length,
3123 struct btrfs_multi_bio **multi_ret, int mirror_num) 3165 struct btrfs_bio **bbio_ret, int mirror_num)
3124{ 3166{
3125 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3167 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3126 mirror_num); 3168 mirror_num);
3127} 3169}
3128 3170
@@ -3191,28 +3233,30 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3191 return 0; 3233 return 0;
3192} 3234}
3193 3235
3194static void end_bio_multi_stripe(struct bio *bio, int err) 3236static void btrfs_end_bio(struct bio *bio, int err)
3195{ 3237{
3196 struct btrfs_multi_bio *multi = bio->bi_private; 3238 struct btrfs_bio *bbio = bio->bi_private;
3197 int is_orig_bio = 0; 3239 int is_orig_bio = 0;
3198 3240
3199 if (err) 3241 if (err)
3200 atomic_inc(&multi->error); 3242 atomic_inc(&bbio->error);
3201 3243
3202 if (bio == multi->orig_bio) 3244 if (bio == bbio->orig_bio)
3203 is_orig_bio = 1; 3245 is_orig_bio = 1;
3204 3246
3205 if (atomic_dec_and_test(&multi->stripes_pending)) { 3247 if (atomic_dec_and_test(&bbio->stripes_pending)) {
3206 if (!is_orig_bio) { 3248 if (!is_orig_bio) {
3207 bio_put(bio); 3249 bio_put(bio);
3208 bio = multi->orig_bio; 3250 bio = bbio->orig_bio;
3209 } 3251 }
3210 bio->bi_private = multi->private; 3252 bio->bi_private = bbio->private;
3211 bio->bi_end_io = multi->end_io; 3253 bio->bi_end_io = bbio->end_io;
3254 bio->bi_bdev = (struct block_device *)
3255 (unsigned long)bbio->mirror_num;
3212 /* only send an error to the higher layers if it is 3256 /* only send an error to the higher layers if it is
3213 * beyond the tolerance of the multi-bio 3257 * beyond the tolerance of the multi-bio
3214 */ 3258 */
3215 if (atomic_read(&multi->error) > multi->max_errors) { 3259 if (atomic_read(&bbio->error) > bbio->max_errors) {
3216 err = -EIO; 3260 err = -EIO;
3217 } else if (err) { 3261 } else if (err) {
3218 /* 3262 /*
@@ -3222,7 +3266,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
3222 set_bit(BIO_UPTODATE, &bio->bi_flags); 3266 set_bit(BIO_UPTODATE, &bio->bi_flags);
3223 err = 0; 3267 err = 0;
3224 } 3268 }
3225 kfree(multi); 3269 kfree(bbio);
3226 3270
3227 bio_endio(bio, err); 3271 bio_endio(bio, err);
3228 } else if (!is_orig_bio) { 3272 } else if (!is_orig_bio) {
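The hunk above also starts reporting which mirror serviced the request: before btrfs_end_bio hands the bio back to the original completion callback, it casts bbio->mirror_num into the now-unused bi_bdev pointer field so the reader can recover it with the reverse cast. A minimal user-space sketch of that cast trick (assumption: the struct below is a made-up stand-in, not the kernel's struct bio):

#include <stdio.h>

struct fake_bio {
	void *bi_bdev;		/* normally a struct block_device pointer */
};

static void stash_mirror(struct fake_bio *bio, int mirror_num)
{
	/* same double cast the patch uses: int -> unsigned long -> pointer */
	bio->bi_bdev = (void *)(unsigned long)mirror_num;
}

static int read_mirror(const struct fake_bio *bio)
{
	return (int)(unsigned long)bio->bi_bdev;
}

int main(void)
{
	struct fake_bio bio;

	stash_mirror(&bio, 2);
	printf("completed on mirror %d\n", read_mirror(&bio));
	return 0;
}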
@@ -3302,20 +3346,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3302 u64 logical = (u64)bio->bi_sector << 9; 3346 u64 logical = (u64)bio->bi_sector << 9;
3303 u64 length = 0; 3347 u64 length = 0;
3304 u64 map_length; 3348 u64 map_length;
3305 struct btrfs_multi_bio *multi = NULL;
3306 int ret; 3349 int ret;
3307 int dev_nr = 0; 3350 int dev_nr = 0;
3308 int total_devs = 1; 3351 int total_devs = 1;
3352 struct btrfs_bio *bbio = NULL;
3309 3353
3310 length = bio->bi_size; 3354 length = bio->bi_size;
3311 map_tree = &root->fs_info->mapping_tree; 3355 map_tree = &root->fs_info->mapping_tree;
3312 map_length = length; 3356 map_length = length;
3313 3357
3314 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 3358 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
3315 mirror_num); 3359 mirror_num);
3316 BUG_ON(ret); 3360 BUG_ON(ret);
3317 3361
3318 total_devs = multi->num_stripes; 3362 total_devs = bbio->num_stripes;
3319 if (map_length < length) { 3363 if (map_length < length) {
3320 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3364 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3321 "len %llu\n", (unsigned long long)logical, 3365 "len %llu\n", (unsigned long long)logical,
@@ -3323,25 +3367,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3323 (unsigned long long)map_length); 3367 (unsigned long long)map_length);
3324 BUG(); 3368 BUG();
3325 } 3369 }
3326 multi->end_io = first_bio->bi_end_io; 3370
3327 multi->private = first_bio->bi_private; 3371 bbio->orig_bio = first_bio;
3328 multi->orig_bio = first_bio; 3372 bbio->private = first_bio->bi_private;
3329 atomic_set(&multi->stripes_pending, multi->num_stripes); 3373 bbio->end_io = first_bio->bi_end_io;
3374 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
3330 3375
3331 while (dev_nr < total_devs) { 3376 while (dev_nr < total_devs) {
3332 if (total_devs > 1) { 3377 if (dev_nr < total_devs - 1) {
3333 if (dev_nr < total_devs - 1) { 3378 bio = bio_clone(first_bio, GFP_NOFS);
3334 bio = bio_clone(first_bio, GFP_NOFS); 3379 BUG_ON(!bio);
3335 BUG_ON(!bio); 3380 } else {
3336 } else { 3381 bio = first_bio;
3337 bio = first_bio;
3338 }
3339 bio->bi_private = multi;
3340 bio->bi_end_io = end_bio_multi_stripe;
3341 } 3382 }
3342 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3383 bio->bi_private = bbio;
3343 dev = multi->stripes[dev_nr].dev; 3384 bio->bi_end_io = btrfs_end_bio;
3385 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
3386 dev = bbio->stripes[dev_nr].dev;
3344 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3387 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3388 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
3389 "(%s id %llu), size=%u\n", rw,
3390 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
3391 dev->name, dev->devid, bio->bi_size);
3345 bio->bi_bdev = dev->bdev; 3392 bio->bi_bdev = dev->bdev;
3346 if (async_submit) 3393 if (async_submit)
3347 schedule_bio(root, dev, rw, bio); 3394 schedule_bio(root, dev, rw, bio);
@@ -3354,8 +3401,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3354 } 3401 }
3355 dev_nr++; 3402 dev_nr++;
3356 } 3403 }
3357 if (total_devs == 1)
3358 kfree(multi);
3359 return 0; 3404 return 0;
3360} 3405}
3361 3406
@@ -3616,15 +3661,20 @@ static int read_one_dev(struct btrfs_root *root,
3616 fill_device_from_item(leaf, dev_item, device); 3661 fill_device_from_item(leaf, dev_item, device);
3617 device->dev_root = root->fs_info->dev_root; 3662 device->dev_root = root->fs_info->dev_root;
3618 device->in_fs_metadata = 1; 3663 device->in_fs_metadata = 1;
3619 if (device->writeable) 3664 if (device->writeable) {
3620 device->fs_devices->total_rw_bytes += device->total_bytes; 3665 device->fs_devices->total_rw_bytes += device->total_bytes;
3666 spin_lock(&root->fs_info->free_chunk_lock);
3667 root->fs_info->free_chunk_space += device->total_bytes -
3668 device->bytes_used;
3669 spin_unlock(&root->fs_info->free_chunk_lock);
3670 }
3621 ret = 0; 3671 ret = 0;
3622 return ret; 3672 return ret;
3623} 3673}
3624 3674
3625int btrfs_read_sys_array(struct btrfs_root *root) 3675int btrfs_read_sys_array(struct btrfs_root *root)
3626{ 3676{
3627 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3677 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3628 struct extent_buffer *sb; 3678 struct extent_buffer *sb;
3629 struct btrfs_disk_key *disk_key; 3679 struct btrfs_disk_key *disk_key;
3630 struct btrfs_chunk *chunk; 3680 struct btrfs_chunk *chunk;
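A recurring change through this file is the new free_chunk_space counter: every path that adds a writeable device, frees a device extent, grows, shrinks or allocates a chunk adjusts it under free_chunk_lock. The following user-space model summarizes that bookkeeping; it is an illustration only, with a pthread mutex standing in for the kernel spinlock and made-up function names.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t free_chunk_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t free_chunk_space;

static void add_device(uint64_t total_bytes)
{
	pthread_mutex_lock(&free_chunk_lock);
	free_chunk_space += total_bytes;	/* btrfs_init_new_device / read_one_dev */
	pthread_mutex_unlock(&free_chunk_lock);
}

static void alloc_chunk(uint64_t stripe_size, int num_stripes)
{
	pthread_mutex_lock(&free_chunk_lock);
	free_chunk_space -= stripe_size * num_stripes;	/* __finish_chunk_alloc */
	pthread_mutex_unlock(&free_chunk_lock);
}

static void free_dev_extent(uint64_t len)
{
	pthread_mutex_lock(&free_chunk_lock);
	free_chunk_space += len;		/* btrfs_free_dev_extent */
	pthread_mutex_unlock(&free_chunk_lock);
}

int main(void)
{
	add_device(10ULL << 30);		/* 10 GiB device */
	alloc_chunk(1ULL << 30, 2);		/* 1 GiB chunk, two stripes */
	free_dev_extent(1ULL << 30);
	printf("free chunk space: %llu bytes\n",
	       (unsigned long long)free_chunk_space);
	return 0;
}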
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6d866db4e17..78f2d4d4f37 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,20 @@ struct btrfs_device {
92 struct btrfs_work work; 92 struct btrfs_work work;
93 struct rcu_head rcu; 93 struct rcu_head rcu;
94 struct work_struct rcu_work; 94 struct work_struct rcu_work;
95
96 /* readahead state */
97 spinlock_t reada_lock;
98 atomic_t reada_in_flight;
99 u64 reada_next;
100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents;
103
104 /* for sending down flush barriers */
105 struct bio *flush_bio;
106 struct completion flush_wait;
107 int nobarriers;
108
95}; 109};
96 110
97struct btrfs_fs_devices { 111struct btrfs_fs_devices {
@@ -136,7 +150,10 @@ struct btrfs_bio_stripe {
136 u64 length; /* only used for discard mappings */ 150 u64 length; /* only used for discard mappings */
137}; 151};
138 152
139struct btrfs_multi_bio { 153struct btrfs_bio;
154typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
155
156struct btrfs_bio {
140 atomic_t stripes_pending; 157 atomic_t stripes_pending;
141 bio_end_io_t *end_io; 158 bio_end_io_t *end_io;
142 struct bio *orig_bio; 159 struct bio *orig_bio;
@@ -144,6 +161,7 @@ struct btrfs_multi_bio {
144 atomic_t error; 161 atomic_t error;
145 int max_errors; 162 int max_errors;
146 int num_stripes; 163 int num_stripes;
164 int mirror_num;
147 struct btrfs_bio_stripe stripes[]; 165 struct btrfs_bio_stripe stripes[];
148}; 166};
149 167
@@ -171,7 +189,7 @@ struct map_lookup {
171int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 189int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
172 u64 end, u64 *length); 190 u64 end, u64 *length);
173 191
174#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 192#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
175 (sizeof(struct btrfs_bio_stripe) * (n))) 193 (sizeof(struct btrfs_bio_stripe) * (n)))
176 194
177int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 195int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -180,7 +198,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
180 u64 chunk_offset, u64 start, u64 num_bytes); 198 u64 chunk_offset, u64 start, u64 num_bytes);
181int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 199int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
182 u64 logical, u64 *length, 200 u64 logical, u64 *length,
183 struct btrfs_multi_bio **multi_ret, int mirror_num); 201 struct btrfs_bio **bbio_ret, int mirror_num);
184int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 202int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
185 u64 chunk_start, u64 physical, u64 devid, 203 u64 chunk_start, u64 physical, u64 devid,
186 u64 **logical, int *naddrs, int *stripe_len); 204 u64 **logical, int *naddrs, int *stripe_len);
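The renamed btrfs_bio keeps its stripes as a C flexible array member, so the btrfs_bio_size(n) macro above sizes one allocation for the header plus n stripe slots. A stand-alone sketch of that allocation pattern (plain malloc instead of kzalloc, simplified stripe type, names invented):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct stripe {
	unsigned long long physical;
	unsigned long long length;
};

struct example_bio {			/* simplified stand-in for struct btrfs_bio */
	int num_stripes;
	int mirror_num;
	struct stripe stripes[];	/* flexible array member */
};

/* same idea as btrfs_bio_size(n): header plus n trailing stripe entries */
#define example_bio_size(n) (sizeof(struct example_bio) + \
			     (sizeof(struct stripe) * (n)))

int main(void)
{
	int n = 3;
	struct example_bio *bbio = malloc(example_bio_size(n));

	if (!bbio)
		return 1;
	memset(bbio, 0, example_bio_size(n));	/* kzalloc equivalent */
	bbio->num_stripes = n;
	bbio->stripes[2].physical = 4096;	/* last slot is valid memory */
	printf("allocated %zu bytes for %d stripes\n",
	       example_bio_size(n), bbio->num_stripes);
	free(bbio);
	return 0;
}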
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 426aa464f1a..3848b04e310 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
127again: 127again:
128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), 128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
129 name, name_len, value, size); 129 name, name_len, value, size);
130 /*
131 * If we're setting an xattr to a new value but the new value is say
132 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
133 * back from split_leaf. This is because it thinks we'll be extending
134 * the existing item size, but we're asking for enough space to add the
135 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
136 * the rest of the function figure it out.
137 */
138 if (ret == -EOVERFLOW)
139 ret = -EEXIST;
140
130 if (ret == -EEXIST) { 141 if (ret == -EEXIST) {
131 if (flags & XATTR_CREATE) 142 if (flags & XATTR_CREATE)
132 goto out; 143 goto out;
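The comment in the hunk above describes the whole trick: an -EOVERFLOW from the insert is downgraded to -EEXIST so the existing replace path handles it. A compressed stand-alone model of that fallback follows; insert_item() and replace_item() are invented stand-ins, not btrfs functions.

#include <errno.h>
#include <stdio.h>

/* Invented stand-ins for btrfs_insert_xattr_item() and the replace path. */
static int insert_item(int leaf_too_tight) { return leaf_too_tight ? -EOVERFLOW : 0; }
static int replace_item(void) { return 0; }

static int set_item(int leaf_too_tight, int create_only)
{
	int ret = insert_item(leaf_too_tight);

	/* as in the hunk above: treat "leaf would overflow" like "item
	 * already exists" and let the replace path sort it out */
	if (ret == -EOVERFLOW)
		ret = -EEXIST;

	if (ret == -EEXIST) {
		if (create_only)	/* XATTR_CREATE-style caller */
			return ret;
		ret = replace_item();	/* fall through to the replace path */
	}
	return ret;
}

int main(void)
{
	printf("replace existing xattr -> %d\n", set_item(1, 0));
	printf("XATTR_CREATE on existing -> %d\n", set_item(1, 1));
	return 0;
}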
diff --git a/fs/buffer.c b/fs/buffer.c
index 70a19745cb6..19d8eb7fdc8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -288,7 +288,7 @@ static void free_more_memory(void)
288 struct zone *zone; 288 struct zone *zone;
289 int nid; 289 int nid;
290 290
291 wakeup_flusher_threads(1024); 291 wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
292 yield(); 292 yield();
293 293
294 for_each_online_node(nid) { 294 for_each_online_node(nid) {
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 2cfb695d1f8..5d9b9acc5fc 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -204,7 +204,7 @@ int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
204} 204}
205 205
206/* first calculate 24 bytes ntlm response and then 16 byte session key */ 206/* first calculate 24 bytes ntlm response and then 16 byte session key */
207int setup_ntlm_response(struct cifs_ses *ses) 207int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp)
208{ 208{
209 int rc = 0; 209 int rc = 0;
210 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; 210 unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
@@ -221,14 +221,14 @@ int setup_ntlm_response(struct cifs_ses *ses)
221 ses->auth_key.len = temp_len; 221 ses->auth_key.len = temp_len;
222 222
223 rc = SMBNTencrypt(ses->password, ses->server->cryptkey, 223 rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
224 ses->auth_key.response + CIFS_SESS_KEY_SIZE); 224 ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp);
225 if (rc) { 225 if (rc) {
226 cFYI(1, "%s Can't generate NTLM response, error: %d", 226 cFYI(1, "%s Can't generate NTLM response, error: %d",
227 __func__, rc); 227 __func__, rc);
228 return rc; 228 return rc;
229 } 229 }
230 230
231 rc = E_md4hash(ses->password, temp_key); 231 rc = E_md4hash(ses->password, temp_key, nls_cp);
232 if (rc) { 232 if (rc) {
233 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); 233 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
234 return rc; 234 return rc;
@@ -404,7 +404,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
404 } 404 }
405 405
406 /* calculate md4 hash of password */ 406 /* calculate md4 hash of password */
407 E_md4hash(ses->password, nt_hash); 407 E_md4hash(ses->password, nt_hash, nls_cp);
408 408
409 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, 409 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
410 CIFS_NTHASH_SIZE); 410 CIFS_NTHASH_SIZE);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index d9dbaf869cd..30ff56005d8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -125,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
125extern const struct export_operations cifs_export_ops; 125extern const struct export_operations cifs_export_ops;
126#endif /* CONFIG_CIFS_NFSD_EXPORT */ 126#endif /* CONFIG_CIFS_NFSD_EXPORT */
127 127
128#define CIFS_VERSION "1.75" 128#define CIFS_VERSION "1.76"
129#endif /* _CIFSFS_H */ 129#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ef4f631e4c0..6f4e243e0f6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -395,8 +395,9 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
395extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, 395extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov,
396 struct TCP_Server_Info *server, 396 struct TCP_Server_Info *server,
397 __u32 expected_sequence_number); 397 __u32 expected_sequence_number);
398extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); 398extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
399extern int setup_ntlm_response(struct cifs_ses *); 399 const struct nls_table *);
400extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
400extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); 401extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
401extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); 402extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
402extern void cifs_crypto_shash_release(struct TCP_Server_Info *); 403extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
@@ -448,7 +449,8 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
448 const unsigned char *path, 449 const unsigned char *path,
449 struct cifs_sb_info *cifs_sb, int xid); 450 struct cifs_sb_info *cifs_sb, int xid);
450extern int mdfour(unsigned char *, unsigned char *, int); 451extern int mdfour(unsigned char *, unsigned char *, int);
451extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); 452extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
453 const struct nls_table *codepage);
452extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, 454extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
453 unsigned char *p24); 455 unsigned char *p24);
454 456
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d545a95c30e..8cd4b52d421 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -37,6 +37,7 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38#include <asm/processor.h> 38#include <asm/processor.h>
39#include <linux/inet.h> 39#include <linux/inet.h>
40#include <linux/module.h>
40#include <net/ipv6.h> 41#include <net/ipv6.h>
41#include "cifspdu.h" 42#include "cifspdu.h"
42#include "cifsglob.h" 43#include "cifsglob.h"
@@ -440,6 +441,8 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
440 smb_msg.msg_controllen = 0; 441 smb_msg.msg_controllen = 0;
441 442
442 for (total_read = 0; to_read; total_read += length, to_read -= length) { 443 for (total_read = 0; to_read; total_read += length, to_read -= length) {
444 try_to_freeze();
445
443 if (server_unresponsive(server)) { 446 if (server_unresponsive(server)) {
444 total_read = -EAGAIN; 447 total_read = -EAGAIN;
445 break; 448 break;
@@ -3452,7 +3455,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
3452 else 3455 else
3453#endif /* CIFS_WEAK_PW_HASH */ 3456#endif /* CIFS_WEAK_PW_HASH */
3454 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, 3457 rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
3455 bcc_ptr); 3458 bcc_ptr, nls_codepage);
3456 3459
3457 bcc_ptr += CIFS_AUTH_RESP_SIZE; 3460 bcc_ptr += CIFS_AUTH_RESP_SIZE;
3458 if (ses->capabilities & CAP_UNICODE) { 3461 if (ses->capabilities & CAP_UNICODE) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ea096ce5d4f..4dd9283885e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -645,20 +645,20 @@ int cifs_closedir(struct inode *inode, struct file *file)
645} 645}
646 646
647static struct cifsLockInfo * 647static struct cifsLockInfo *
648cifs_lock_init(__u64 len, __u64 offset, __u8 type, __u16 netfid) 648cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid)
649{ 649{
650 struct cifsLockInfo *li = 650 struct cifsLockInfo *lock =
651 kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); 651 kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
652 if (!li) 652 if (!lock)
653 return li; 653 return lock;
654 li->netfid = netfid; 654 lock->offset = offset;
655 li->offset = offset; 655 lock->length = length;
656 li->length = len; 656 lock->type = type;
657 li->type = type; 657 lock->netfid = netfid;
658 li->pid = current->tgid; 658 lock->pid = current->tgid;
659 INIT_LIST_HEAD(&li->blist); 659 INIT_LIST_HEAD(&lock->blist);
660 init_waitqueue_head(&li->block_q); 660 init_waitqueue_head(&lock->block_q);
661 return li; 661 return lock;
662} 662}
663 663
664static void 664static void
@@ -672,7 +672,7 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock)
672} 672}
673 673
674static bool 674static bool
675cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, 675__cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
676 __u64 length, __u8 type, __u16 netfid, 676 __u64 length, __u8 type, __u16 netfid,
677 struct cifsLockInfo **conf_lock) 677 struct cifsLockInfo **conf_lock)
678{ 678{
@@ -694,6 +694,21 @@ cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset,
694 return false; 694 return false;
695} 695}
696 696
697static bool
698cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
699 struct cifsLockInfo **conf_lock)
700{
701 return __cifs_find_lock_conflict(cinode, lock->offset, lock->length,
702 lock->type, lock->netfid, conf_lock);
703}
704
705/*
706 * Check if there is another lock that prevents us from setting the lock (mandatory
707 * style). If such a lock exists, update the flock structure with its
708 * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks
709 * or leave it the same if we can't. Returns 0 if we don't need to request to
710 * the server or 1 otherwise.
711 */
697static int 712static int
698cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, 713cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
699 __u8 type, __u16 netfid, struct file_lock *flock) 714 __u8 type, __u16 netfid, struct file_lock *flock)
@@ -704,8 +719,8 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
704 719
705 mutex_lock(&cinode->lock_mutex); 720 mutex_lock(&cinode->lock_mutex);
706 721
707 exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, 722 exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid,
708 &conf_lock); 723 &conf_lock);
709 if (exist) { 724 if (exist) {
710 flock->fl_start = conf_lock->offset; 725 flock->fl_start = conf_lock->offset;
711 flock->fl_end = conf_lock->offset + conf_lock->length - 1; 726 flock->fl_end = conf_lock->offset + conf_lock->length - 1;
@@ -723,40 +738,33 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length,
723 return rc; 738 return rc;
724} 739}
725 740
726static int 741static void
727cifs_lock_add(struct cifsInodeInfo *cinode, __u64 len, __u64 offset, 742cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock)
728 __u8 type, __u16 netfid)
729{ 743{
730 struct cifsLockInfo *li;
731
732 li = cifs_lock_init(len, offset, type, netfid);
733 if (!li)
734 return -ENOMEM;
735
736 mutex_lock(&cinode->lock_mutex); 744 mutex_lock(&cinode->lock_mutex);
737 list_add_tail(&li->llist, &cinode->llist); 745 list_add_tail(&lock->llist, &cinode->llist);
738 mutex_unlock(&cinode->lock_mutex); 746 mutex_unlock(&cinode->lock_mutex);
739 return 0;
740} 747}
741 748
749/*
750 * Set the byte-range lock (mandatory style). Returns:
751 * 1) 0, if we set the lock and don't need to request to the server;
752 * 2) 1, if no locks prevent us but we need to request to the server;
753 * 3) -EACCES, if there is a lock that prevents us and wait is false.
754 */
742static int 755static int
743cifs_lock_add_if(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, 756cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock,
744 __u8 type, __u16 netfid, bool wait) 757 bool wait)
745{ 758{
746 struct cifsLockInfo *lock, *conf_lock; 759 struct cifsLockInfo *conf_lock;
747 bool exist; 760 bool exist;
748 int rc = 0; 761 int rc = 0;
749 762
750 lock = cifs_lock_init(length, offset, type, netfid);
751 if (!lock)
752 return -ENOMEM;
753
754try_again: 763try_again:
755 exist = false; 764 exist = false;
756 mutex_lock(&cinode->lock_mutex); 765 mutex_lock(&cinode->lock_mutex);
757 766
758 exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, 767 exist = cifs_find_lock_conflict(cinode, lock, &conf_lock);
759 &conf_lock);
760 if (!exist && cinode->can_cache_brlcks) { 768 if (!exist && cinode->can_cache_brlcks) {
761 list_add_tail(&lock->llist, &cinode->llist); 769 list_add_tail(&lock->llist, &cinode->llist);
762 mutex_unlock(&cinode->lock_mutex); 770 mutex_unlock(&cinode->lock_mutex);
@@ -775,18 +783,21 @@ try_again:
775 (lock->blist.next == &lock->blist)); 783 (lock->blist.next == &lock->blist));
776 if (!rc) 784 if (!rc)
777 goto try_again; 785 goto try_again;
778 else { 786 mutex_lock(&cinode->lock_mutex);
779 mutex_lock(&cinode->lock_mutex); 787 list_del_init(&lock->blist);
780 list_del_init(&lock->blist);
781 mutex_unlock(&cinode->lock_mutex);
782 }
783 } 788 }
784 789
785 kfree(lock);
786 mutex_unlock(&cinode->lock_mutex); 790 mutex_unlock(&cinode->lock_mutex);
787 return rc; 791 return rc;
788} 792}
789 793
794/*
795 * Check if there is another lock that prevents us from setting the lock (posix
796 * style). If such a lock exists, update the flock structure with its
797 * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks
798 * or leave it the same if we can't. Returns 0 if we don't need to request to
799 * the server or 1 otherwise.
800 */
790static int 801static int
791cifs_posix_lock_test(struct file *file, struct file_lock *flock) 802cifs_posix_lock_test(struct file *file, struct file_lock *flock)
792{ 803{
@@ -794,6 +805,9 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
794 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); 805 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
795 unsigned char saved_type = flock->fl_type; 806 unsigned char saved_type = flock->fl_type;
796 807
808 if ((flock->fl_flags & FL_POSIX) == 0)
809 return 1;
810
797 mutex_lock(&cinode->lock_mutex); 811 mutex_lock(&cinode->lock_mutex);
798 posix_test_lock(file, flock); 812 posix_test_lock(file, flock);
799 813
@@ -806,16 +820,25 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
806 return rc; 820 return rc;
807} 821}
808 822
823/*
824 * Set the byte-range lock (posix style). Returns:
825 * 1) 0, if we set the lock and don't need to request to the server;
826 * 2) 1, if we need to request to the server;
827 * 3) <0, if an error occurs while setting the lock.
828 */
809static int 829static int
810cifs_posix_lock_set(struct file *file, struct file_lock *flock) 830cifs_posix_lock_set(struct file *file, struct file_lock *flock)
811{ 831{
812 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); 832 struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
813 int rc; 833 int rc = 1;
834
835 if ((flock->fl_flags & FL_POSIX) == 0)
836 return rc;
814 837
815 mutex_lock(&cinode->lock_mutex); 838 mutex_lock(&cinode->lock_mutex);
816 if (!cinode->can_cache_brlcks) { 839 if (!cinode->can_cache_brlcks) {
817 mutex_unlock(&cinode->lock_mutex); 840 mutex_unlock(&cinode->lock_mutex);
818 return 1; 841 return rc;
819 } 842 }
820 rc = posix_lock_file_wait(file, flock); 843 rc = posix_lock_file_wait(file, flock);
821 mutex_unlock(&cinode->lock_mutex); 844 mutex_unlock(&cinode->lock_mutex);
@@ -928,7 +951,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
928 else 951 else
929 type = CIFS_WRLCK; 952 type = CIFS_WRLCK;
930 953
931 lck = cifs_lock_init(length, flock->fl_start, type, 954 lck = cifs_lock_init(flock->fl_start, length, type,
932 cfile->netfid); 955 cfile->netfid);
933 if (!lck) { 956 if (!lck) {
934 rc = -ENOMEM; 957 rc = -ENOMEM;
@@ -1065,14 +1088,12 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
1065 if (rc != 0) 1088 if (rc != 0)
1066 cERROR(1, "Error unlocking previously locked " 1089 cERROR(1, "Error unlocking previously locked "
1067 "range %d during test of lock", rc); 1090 "range %d during test of lock", rc);
1068 rc = 0; 1091 return 0;
1069 return rc;
1070 } 1092 }
1071 1093
1072 if (type & LOCKING_ANDX_SHARED_LOCK) { 1094 if (type & LOCKING_ANDX_SHARED_LOCK) {
1073 flock->fl_type = F_WRLCK; 1095 flock->fl_type = F_WRLCK;
1074 rc = 0; 1096 return 0;
1075 return rc;
1076 } 1097 }
1077 1098
1078 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, 1099 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
@@ -1090,8 +1111,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type,
1090 } else 1111 } else
1091 flock->fl_type = F_WRLCK; 1112 flock->fl_type = F_WRLCK;
1092 1113
1093 rc = 0; 1114 return 0;
1094 return rc;
1095} 1115}
1096 1116
1097static void 1117static void
@@ -1249,20 +1269,26 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type,
1249 } 1269 }
1250 1270
1251 if (lock) { 1271 if (lock) {
1252 rc = cifs_lock_add_if(cinode, flock->fl_start, length, 1272 struct cifsLockInfo *lock;
1253 type, netfid, wait_flag); 1273
1274 lock = cifs_lock_init(flock->fl_start, length, type, netfid);
1275 if (!lock)
1276 return -ENOMEM;
1277
1278 rc = cifs_lock_add_if(cinode, lock, wait_flag);
1254 if (rc < 0) 1279 if (rc < 0)
1255 return rc; 1280 kfree(lock);
1256 else if (!rc) 1281 if (rc <= 0)
1257 goto out; 1282 goto out;
1258 1283
1259 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, 1284 rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
1260 flock->fl_start, 0, 1, type, wait_flag, 0); 1285 flock->fl_start, 0, 1, type, wait_flag, 0);
1261 if (rc == 0) { 1286 if (rc) {
1262 /* For Windows locks we must store them. */ 1287 kfree(lock);
1263 rc = cifs_lock_add(cinode, length, flock->fl_start, 1288 goto out;
1264 type, netfid);
1265 } 1289 }
1290
1291 cifs_lock_add(cinode, lock);
1266 } else if (unlock) 1292 } else if (unlock)
1267 rc = cifs_unlock_range(cfile, flock, xid); 1293 rc = cifs_unlock_range(cfile, flock, xid);
1268 1294
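The reworked cifs_setlk() path above changes who owns the cifsLockInfo allocation: it is created up front with cifs_lock_init(), cifs_lock_add_if() only reports whether the server must be asked, and the structure is freed on any failure or finally queued with cifs_lock_add() once the server granted it. A user-space model of that ownership flow (an illustration, not the CIFS code; all helpers below are stand-ins):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct lock_info { unsigned long long offset, length; };

/* 0 = cached locally, 1 = server must be asked, <0 = conflicting lock */
static int lock_add_if(struct lock_info *lock, int conflict, int can_cache)
{
	(void)lock;
	if (conflict)
		return -EACCES;
	return can_cache ? 0 : 1;
}

static int server_lock(const struct lock_info *lock)
{
	(void)lock;
	return 0;	/* pretend the server granted the lock */
}

static void lock_add(struct lock_info *lock)
{
	/* models list_add_tail(&lock->llist, &cinode->llist) */
	printf("stored lock %llu+%llu\n", lock->offset, lock->length);
	free(lock);	/* the model has no real list to keep it on */
}

static int setlk(unsigned long long off, unsigned long long len,
		 int conflict, int can_cache)
{
	struct lock_info *lock = malloc(sizeof(*lock));
	int rc;

	if (!lock)
		return -ENOMEM;
	lock->offset = off;
	lock->length = len;

	rc = lock_add_if(lock, conflict, can_cache);
	if (rc < 0)
		free(lock);		/* conflict: nothing was queued */
	if (rc <= 0)
		return rc;		/* 0 = cached, <0 = error */

	rc = server_lock(lock);
	if (rc) {
		free(lock);		/* server refused: drop the allocation */
		return rc;
	}
	lock_add(lock);			/* only a granted lock is stored */
	return 0;
}

int main(void)
{
	printf("rc = %d\n", setlk(0, 4096, 0, 0));
	return 0;
}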
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 5de03ec2014..a090bbe6ee2 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -554,7 +554,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
554 rc); 554 rc);
555 return rc; 555 return rc;
556 } 556 }
557 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 557 /* FindFirst/Next set last_entry to NULL on malformed reply */
558 if (cifsFile->srch_inf.last_entry)
559 cifs_save_resume_key(cifsFile->srch_inf.last_entry,
560 cifsFile);
558 } 561 }
559 562
560 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && 563 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
@@ -562,7 +565,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
562 cFYI(1, "calling findnext2"); 565 cFYI(1, "calling findnext2");
563 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, 566 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
564 &cifsFile->srch_inf); 567 &cifsFile->srch_inf);
565 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 568 /* FindFirst/Next set last_entry to NULL on malformed reply */
569 if (cifsFile->srch_inf.last_entry)
570 cifs_save_resume_key(cifsFile->srch_inf.last_entry,
571 cifsFile);
566 if (rc) 572 if (rc)
567 return -ENOENT; 573 return -ENOENT;
568 } 574 }
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index c7d80e24f24..4ec3ee9d72c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -683,7 +683,7 @@ ssetup_ntlmssp_authenticate:
683 cpu_to_le16(CIFS_AUTH_RESP_SIZE); 683 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
684 684
685 /* calculate ntlm response and session key */ 685 /* calculate ntlm response and session key */
686 rc = setup_ntlm_response(ses); 686 rc = setup_ntlm_response(ses, nls_cp);
687 if (rc) { 687 if (rc) {
688 cERROR(1, "Error %d during NTLM authentication", rc); 688 cERROR(1, "Error %d during NTLM authentication", rc);
689 goto ssetup_exit; 689 goto ssetup_exit;
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index ac1221d969d..80d85088193 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -199,75 +199,36 @@ SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24)
199 return rc; 199 return rc;
200} 200}
201 201
202/* Routines for Windows NT MD4 Hash functions. */
203static int
204_my_wcslen(__u16 *str)
205{
206 int len = 0;
207 while (*str++ != 0)
208 len++;
209 return len;
210}
211
212/*
213 * Convert a string into an NT UNICODE string.
214 * Note that regardless of processor type
215 * this must be in intel (little-endian)
216 * format.
217 */
218
219static int
220_my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
221{ /* BB not a very good conversion routine - change/fix */
222 int i;
223 __u16 val;
224
225 for (i = 0; i < len; i++) {
226 val = *src;
227 SSVAL(dst, 0, val);
228 dst++;
229 src++;
230 if (val == 0)
231 break;
232 }
233 return i;
234}
235
236/* 202/*
237 * Creates the MD4 Hash of the user's password in NT UNICODE. 203 * Creates the MD4 Hash of the user's password in NT UNICODE.
238 */ 204 */
239 205
240int 206int
241E_md4hash(const unsigned char *passwd, unsigned char *p16) 207E_md4hash(const unsigned char *passwd, unsigned char *p16,
208 const struct nls_table *codepage)
242{ 209{
243 int rc; 210 int rc;
244 int len; 211 int len;
245 __u16 wpwd[129]; 212 __le16 wpwd[129];
246 213
247 /* Password cannot be longer than 128 characters */ 214 /* Password cannot be longer than 128 characters */
248 if (passwd) { 215 if (passwd) /* Password must be converted to NT unicode */
249 len = strlen((char *) passwd); 216 len = cifs_strtoUCS(wpwd, passwd, 128, codepage);
250 if (len > 128) 217 else {
251 len = 128;
252
253 /* Password must be converted to NT unicode */
254 _my_mbstowcs(wpwd, passwd, len);
255 } else
256 len = 0; 218 len = 0;
219 *wpwd = 0; /* Ensure string is null terminated */
220 }
257 221
258 wpwd[len] = 0; /* Ensure string is null terminated */ 222 rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16));
259 /* Calculate length in bytes */ 223 memset(wpwd, 0, 129 * sizeof(__le16));
260 len = _my_wcslen(wpwd) * sizeof(__u16);
261
262 rc = mdfour(p16, (unsigned char *) wpwd, len);
263 memset(wpwd, 0, 129 * 2);
264 224
265 return rc; 225 return rc;
266} 226}
267 227
268/* Does the NT MD4 hash then des encryption. */ 228/* Does the NT MD4 hash then des encryption. */
269int 229int
270SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) 230SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24,
231 const struct nls_table *codepage)
271{ 232{
272 int rc; 233 int rc;
273 unsigned char p16[16], p21[21]; 234 unsigned char p16[16], p21[21];
@@ -275,7 +236,7 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
275 memset(p16, '\0', 16); 236 memset(p16, '\0', 16);
276 memset(p21, '\0', 21); 237 memset(p21, '\0', 21);
277 238
278 rc = E_md4hash(passwd, p16); 239 rc = E_md4hash(passwd, p16, codepage);
279 if (rc) { 240 if (rc) {
280 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); 241 cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
281 return rc; 242 return rc;
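The point of threading the nls_table through E_md4hash()/SMBNTencrypt() above is that the removed helper simply widened each password byte to 16 bits, which is only correct for plain ASCII. A tiny stand-alone illustration (simplified and assumed, not the CIFS converter) of how the two conversions diverge for a non-ASCII byte, and therefore produce different NT hashes:

#include <stdio.h>

int main(void)
{
	unsigned char byte = 0x82;		/* 'e acute' in CP850 */

	unsigned short old_widen = byte;	/* old _my_mbstowcs behaviour: U+0082 */
	unsigned short codepage  = 0x00E9;	/* what a CP850 NLS table maps it to */

	printf("old conversion: U+%04X, codepage-aware: U+%04X\n",
	       old_widen, codepage);
	/* different UTF-16 input means a different MD4, hence a failed login */
	return 0;
}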
diff --git a/fs/dcache.c b/fs/dcache.c
index 274f13e2f09..89509b5a090 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -36,6 +36,7 @@
36#include <linux/bit_spinlock.h> 36#include <linux/bit_spinlock.h>
37#include <linux/rculist_bl.h> 37#include <linux/rculist_bl.h>
38#include <linux/prefetch.h> 38#include <linux/prefetch.h>
39#include <linux/ratelimit.h>
39#include "internal.h" 40#include "internal.h"
40 41
41/* 42/*
@@ -546,9 +547,11 @@ int d_invalidate(struct dentry * dentry)
546 * would make it unreachable from the root, 547 * would make it unreachable from the root,
547 * we might still populate it if it was a 548 * we might still populate it if it was a
548 * working directory or similar). 549 * working directory or similar).
550 * We also need to leave mountpoints alone,
551 * directory or not.
549 */ 552 */
550 if (dentry->d_count > 1) { 553 if (dentry->d_count > 1 && dentry->d_inode) {
551 if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { 554 if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
552 spin_unlock(&dentry->d_lock); 555 spin_unlock(&dentry->d_lock);
553 return -EBUSY; 556 return -EBUSY;
554 } 557 }
@@ -2381,8 +2384,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2381 actual = __d_unalias(inode, dentry, alias); 2384 actual = __d_unalias(inode, dentry, alias);
2382 } 2385 }
2383 write_sequnlock(&rename_lock); 2386 write_sequnlock(&rename_lock);
2384 if (IS_ERR(actual)) 2387 if (IS_ERR(actual)) {
2388 if (PTR_ERR(actual) == -ELOOP)
2389 pr_warn_ratelimited(
2390 "VFS: Lookup of '%s' in %s %s"
2391 " would have caused loop\n",
2392 dentry->d_name.name,
2393 inode->i_sb->s_type->name,
2394 inode->i_sb->s_id);
2385 dput(alias); 2395 dput(alias);
2396 }
2386 goto out_nolock; 2397 goto out_nolock;
2387 } 2398 }
2388 } 2399 }
@@ -2428,16 +2439,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
2428/** 2439/**
2429 * prepend_path - Prepend path string to a buffer 2440 * prepend_path - Prepend path string to a buffer
2430 * @path: the dentry/vfsmount to report 2441 * @path: the dentry/vfsmount to report
2431 * @root: root vfsmnt/dentry (may be modified by this function) 2442 * @root: root vfsmnt/dentry
2432 * @buffer: pointer to the end of the buffer 2443 * @buffer: pointer to the end of the buffer
2433 * @buflen: pointer to buffer length 2444 * @buflen: pointer to buffer length
2434 * 2445 *
2435 * Caller holds the rename_lock. 2446 * Caller holds the rename_lock.
2436 *
2437 * If path is not reachable from the supplied root, then the value of
2438 * root is changed (without modifying refcounts).
2439 */ 2447 */
2440static int prepend_path(const struct path *path, struct path *root, 2448static int prepend_path(const struct path *path,
2449 const struct path *root,
2441 char **buffer, int *buflen) 2450 char **buffer, int *buflen)
2442{ 2451{
2443 struct dentry *dentry = path->dentry; 2452 struct dentry *dentry = path->dentry;
@@ -2472,10 +2481,10 @@ static int prepend_path(const struct path *path, struct path *root,
2472 dentry = parent; 2481 dentry = parent;
2473 } 2482 }
2474 2483
2475out:
2476 if (!error && !slash) 2484 if (!error && !slash)
2477 error = prepend(buffer, buflen, "/", 1); 2485 error = prepend(buffer, buflen, "/", 1);
2478 2486
2487out:
2479 br_read_unlock(vfsmount_lock); 2488 br_read_unlock(vfsmount_lock);
2480 return error; 2489 return error;
2481 2490
@@ -2489,15 +2498,17 @@ global_root:
2489 WARN(1, "Root dentry has weird name <%.*s>\n", 2498 WARN(1, "Root dentry has weird name <%.*s>\n",
2490 (int) dentry->d_name.len, dentry->d_name.name); 2499 (int) dentry->d_name.len, dentry->d_name.name);
2491 } 2500 }
2492 root->mnt = vfsmnt; 2501 if (!slash)
2493 root->dentry = dentry; 2502 error = prepend(buffer, buflen, "/", 1);
2503 if (!error)
2504 error = vfsmnt->mnt_ns ? 1 : 2;
2494 goto out; 2505 goto out;
2495} 2506}
2496 2507
2497/** 2508/**
2498 * __d_path - return the path of a dentry 2509 * __d_path - return the path of a dentry
2499 * @path: the dentry/vfsmount to report 2510 * @path: the dentry/vfsmount to report
2500 * @root: root vfsmnt/dentry (may be modified by this function) 2511 * @root: root vfsmnt/dentry
2501 * @buf: buffer to return value in 2512 * @buf: buffer to return value in
2502 * @buflen: buffer length 2513 * @buflen: buffer length
2503 * 2514 *
@@ -2508,10 +2519,10 @@ global_root:
2508 * 2519 *
2509 * "buflen" should be positive. 2520 * "buflen" should be positive.
2510 * 2521 *
2511 * If path is not reachable from the supplied root, then the value of 2522 * If the path is not reachable from the supplied root, return %NULL.
2512 * root is changed (without modifying refcounts).
2513 */ 2523 */
2514char *__d_path(const struct path *path, struct path *root, 2524char *__d_path(const struct path *path,
2525 const struct path *root,
2515 char *buf, int buflen) 2526 char *buf, int buflen)
2516{ 2527{
2517 char *res = buf + buflen; 2528 char *res = buf + buflen;
@@ -2522,7 +2533,28 @@ char *__d_path(const struct path *path, struct path *root,
2522 error = prepend_path(path, root, &res, &buflen); 2533 error = prepend_path(path, root, &res, &buflen);
2523 write_sequnlock(&rename_lock); 2534 write_sequnlock(&rename_lock);
2524 2535
2525 if (error) 2536 if (error < 0)
2537 return ERR_PTR(error);
2538 if (error > 0)
2539 return NULL;
2540 return res;
2541}
2542
2543char *d_absolute_path(const struct path *path,
2544 char *buf, int buflen)
2545{
2546 struct path root = {};
2547 char *res = buf + buflen;
2548 int error;
2549
2550 prepend(&res, &buflen, "\0", 1);
2551 write_seqlock(&rename_lock);
2552 error = prepend_path(path, &root, &res, &buflen);
2553 write_sequnlock(&rename_lock);
2554
2555 if (error > 1)
2556 error = -EINVAL;
2557 if (error < 0)
2526 return ERR_PTR(error); 2558 return ERR_PTR(error);
2527 return res; 2559 return res;
2528} 2560}
@@ -2530,8 +2562,9 @@ char *__d_path(const struct path *path, struct path *root,
2530/* 2562/*
2531 * same as __d_path but appends "(deleted)" for unlinked files. 2563 * same as __d_path but appends "(deleted)" for unlinked files.
2532 */ 2564 */
2533static int path_with_deleted(const struct path *path, struct path *root, 2565static int path_with_deleted(const struct path *path,
2534 char **buf, int *buflen) 2566 const struct path *root,
2567 char **buf, int *buflen)
2535{ 2568{
2536 prepend(buf, buflen, "\0", 1); 2569 prepend(buf, buflen, "\0", 1);
2537 if (d_unlinked(path->dentry)) { 2570 if (d_unlinked(path->dentry)) {
@@ -2568,7 +2601,6 @@ char *d_path(const struct path *path, char *buf, int buflen)
2568{ 2601{
2569 char *res = buf + buflen; 2602 char *res = buf + buflen;
2570 struct path root; 2603 struct path root;
2571 struct path tmp;
2572 int error; 2604 int error;
2573 2605
2574 /* 2606 /*
@@ -2583,9 +2615,8 @@ char *d_path(const struct path *path, char *buf, int buflen)
2583 2615
2584 get_fs_root(current->fs, &root); 2616 get_fs_root(current->fs, &root);
2585 write_seqlock(&rename_lock); 2617 write_seqlock(&rename_lock);
2586 tmp = root; 2618 error = path_with_deleted(path, &root, &res, &buflen);
2587 error = path_with_deleted(path, &tmp, &res, &buflen); 2619 if (error < 0)
2588 if (error)
2589 res = ERR_PTR(error); 2620 res = ERR_PTR(error);
2590 write_sequnlock(&rename_lock); 2621 write_sequnlock(&rename_lock);
2591 path_put(&root); 2622 path_put(&root);
@@ -2606,7 +2637,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2606{ 2637{
2607 char *res = buf + buflen; 2638 char *res = buf + buflen;
2608 struct path root; 2639 struct path root;
2609 struct path tmp;
2610 int error; 2640 int error;
2611 2641
2612 if (path->dentry->d_op && path->dentry->d_op->d_dname) 2642 if (path->dentry->d_op && path->dentry->d_op->d_dname)
@@ -2614,9 +2644,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2614 2644
2615 get_fs_root(current->fs, &root); 2645 get_fs_root(current->fs, &root);
2616 write_seqlock(&rename_lock); 2646 write_seqlock(&rename_lock);
2617 tmp = root; 2647 error = path_with_deleted(path, &root, &res, &buflen);
2618 error = path_with_deleted(path, &tmp, &res, &buflen); 2648 if (error > 0)
2619 if (!error && !path_equal(&tmp, &root))
2620 error = prepend_unreachable(&res, &buflen); 2649 error = prepend_unreachable(&res, &buflen);
2621 write_sequnlock(&rename_lock); 2650 write_sequnlock(&rename_lock);
2622 path_put(&root); 2651 path_put(&root);
@@ -2747,19 +2776,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2747 write_seqlock(&rename_lock); 2776 write_seqlock(&rename_lock);
2748 if (!d_unlinked(pwd.dentry)) { 2777 if (!d_unlinked(pwd.dentry)) {
2749 unsigned long len; 2778 unsigned long len;
2750 struct path tmp = root;
2751 char *cwd = page + PAGE_SIZE; 2779 char *cwd = page + PAGE_SIZE;
2752 int buflen = PAGE_SIZE; 2780 int buflen = PAGE_SIZE;
2753 2781
2754 prepend(&cwd, &buflen, "\0", 1); 2782 prepend(&cwd, &buflen, "\0", 1);
2755 error = prepend_path(&pwd, &tmp, &cwd, &buflen); 2783 error = prepend_path(&pwd, &root, &cwd, &buflen);
2756 write_sequnlock(&rename_lock); 2784 write_sequnlock(&rename_lock);
2757 2785
2758 if (error) 2786 if (error < 0)
2759 goto out; 2787 goto out;
2760 2788
2761 /* Unreachable from current root */ 2789 /* Unreachable from current root */
2762 if (!path_equal(&tmp, &root)) { 2790 if (error > 0) {
2763 error = prepend_unreachable(&cwd, &buflen); 2791 error = prepend_unreachable(&cwd, &buflen);
2764 if (error) 2792 if (error)
2765 goto out; 2793 goto out;
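
The dcache.c hunks above move prepend_path() and its callers to a three-way return convention: a negative value is a hard error, zero means the path was fully built under the given root, and a positive value means the dentry is not reachable from that root (the caller then returns NULL, or prepends "(unreachable)"). A minimal userspace sketch of how a caller is expected to handle all three outcomes; build_path() and its arguments are simplified stand-ins, not the kernel functions:

#include <stdio.h>

/* Stand-in for prepend_path(): <0 error, 0 ok, >0 not reachable from root. */
static int build_path(int reachable, int fail, char **res)
{
	if (fail)
		return -12;			/* e.g. -ENOMEM */
	*res = reachable ? "/a/b/c" : "/detached/a/b/c";
	return reachable ? 0 : 1;
}

int main(void)
{
	char *res = NULL;
	int error = build_path(0, 0, &res);

	if (error < 0)
		return 1;				/* hard failure: would be ERR_PTR(error) */
	if (error > 0)
		printf("(unreachable)%s\n", res);	/* like prepend_unreachable() */
	else
		printf("%s\n", res);
	return 0;
}
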
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 58609bde3b9..2a834255c75 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -967,7 +967,7 @@ static void ecryptfs_set_default_crypt_stat_vals(
967 967
968/** 968/**
969 * ecryptfs_new_file_context 969 * ecryptfs_new_file_context
970 * @ecryptfs_dentry: The eCryptfs dentry 970 * @ecryptfs_inode: The eCryptfs inode
971 * 971 *
972 * If the crypto context for the file has not yet been established, 972 * If the crypto context for the file has not yet been established,
973 * this is where we do that. Establishing a new crypto context 973 * this is where we do that. Establishing a new crypto context
@@ -984,13 +984,13 @@ static void ecryptfs_set_default_crypt_stat_vals(
984 * 984 *
985 * Returns zero on success; non-zero otherwise 985 * Returns zero on success; non-zero otherwise
986 */ 986 */
987int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) 987int ecryptfs_new_file_context(struct inode *ecryptfs_inode)
988{ 988{
989 struct ecryptfs_crypt_stat *crypt_stat = 989 struct ecryptfs_crypt_stat *crypt_stat =
990 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; 990 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
991 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 991 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
992 &ecryptfs_superblock_to_private( 992 &ecryptfs_superblock_to_private(
993 ecryptfs_dentry->d_sb)->mount_crypt_stat; 993 ecryptfs_inode->i_sb)->mount_crypt_stat;
994 int cipher_name_len; 994 int cipher_name_len;
995 int rc = 0; 995 int rc = 0;
996 996
@@ -1299,12 +1299,12 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
1299} 1299}
1300 1300
1301static int 1301static int
1302ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, 1302ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode,
1303 char *virt, size_t virt_len) 1303 char *virt, size_t virt_len)
1304{ 1304{
1305 int rc; 1305 int rc;
1306 1306
1307 rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, 1307 rc = ecryptfs_write_lower(ecryptfs_inode, virt,
1308 0, virt_len); 1308 0, virt_len);
1309 if (rc < 0) 1309 if (rc < 0)
1310 printk(KERN_ERR "%s: Error attempting to write header " 1310 printk(KERN_ERR "%s: Error attempting to write header "
@@ -1338,7 +1338,8 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
1338 1338
1339/** 1339/**
1340 * ecryptfs_write_metadata 1340 * ecryptfs_write_metadata
1341 * @ecryptfs_dentry: The eCryptfs dentry 1341 * @ecryptfs_dentry: The eCryptfs dentry, which should be negative
1342 * @ecryptfs_inode: The newly created eCryptfs inode
1342 * 1343 *
1343 * Write the file headers out. This will likely involve a userspace 1344 * Write the file headers out. This will likely involve a userspace
1344 * callout, in which the session key is encrypted with one or more 1345 * callout, in which the session key is encrypted with one or more
@@ -1348,10 +1349,11 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
1348 * 1349 *
1349 * Returns zero on success; non-zero on error 1350 * Returns zero on success; non-zero on error
1350 */ 1351 */
1351int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) 1352int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
1353 struct inode *ecryptfs_inode)
1352{ 1354{
1353 struct ecryptfs_crypt_stat *crypt_stat = 1355 struct ecryptfs_crypt_stat *crypt_stat =
1354 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; 1356 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
1355 unsigned int order; 1357 unsigned int order;
1356 char *virt; 1358 char *virt;
1357 size_t virt_len; 1359 size_t virt_len;
@@ -1391,7 +1393,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1391 rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, 1393 rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
1392 size); 1394 size);
1393 else 1395 else
1394 rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, 1396 rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt,
1395 virt_len); 1397 virt_len);
1396 if (rc) { 1398 if (rc) {
1397 printk(KERN_ERR "%s: Error writing metadata out to lower file; " 1399 printk(KERN_ERR "%s: Error writing metadata out to lower file; "
@@ -1943,7 +1945,7 @@ static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
1943 1945
1944/* We could either offset on every reverse map or just pad some 0x00's 1946/* We could either offset on every reverse map or just pad some 0x00's
1945 * at the front here */ 1947 * at the front here */
1946static const unsigned char filename_rev_map[] = { 1948static const unsigned char filename_rev_map[256] = {
1947 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ 1949 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
1948 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ 1950 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
1949 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ 1951 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
@@ -1959,7 +1961,7 @@ static const unsigned char filename_rev_map[] = {
1959 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ 1961 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
1960 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ 1962 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
1961 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ 1963 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
1962 0x3D, 0x3E, 0x3F 1964 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */
1963}; 1965};
1964 1966
1965/** 1967/**
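
Sizing filename_rev_map as [256] matters because the decoder indexes it with an arbitrary byte of the encoded name; with only 123 explicit initializers, the remaining entries are zero-filled, so every possible byte value is now an in-bounds index that maps to 0x00. A small standalone sketch of the same sizing pattern; the table contents here are illustrative, not eCryptfs's mapping:

#include <stdio.h>

/* 256 entries: indexing with any unsigned char can never run past the end.
 * Entries without an initializer are implicitly zero ("not a valid code"). */
static const unsigned char rev_map[256] = {
	['A'] = 1, ['B'] = 2, ['C'] = 3,	/* made-up values */
};

int main(void)
{
	unsigned char c = 0xFB;			/* byte outside the mapped range */

	printf("%u\n", rev_map[c]);		/* prints 0, no out-of-bounds read */
	return 0;
}
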
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 54481a3b2c7..a9f29b12fbf 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -584,9 +584,10 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat);
584int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); 584int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode);
585int ecryptfs_encrypt_page(struct page *page); 585int ecryptfs_encrypt_page(struct page *page);
586int ecryptfs_decrypt_page(struct page *page); 586int ecryptfs_decrypt_page(struct page *page);
587int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); 587int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
588 struct inode *ecryptfs_inode);
588int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); 589int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
589int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); 590int ecryptfs_new_file_context(struct inode *ecryptfs_inode);
590void ecryptfs_write_crypt_stat_flags(char *page_virt, 591void ecryptfs_write_crypt_stat_flags(char *page_virt,
591 struct ecryptfs_crypt_stat *crypt_stat, 592 struct ecryptfs_crypt_stat *crypt_stat,
592 size_t *written); 593 size_t *written);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index c6ac98cf9ba..d3f95f941c4 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -139,6 +139,27 @@ out:
139 return rc; 139 return rc;
140} 140}
141 141
142static void ecryptfs_vma_close(struct vm_area_struct *vma)
143{
144 filemap_write_and_wait(vma->vm_file->f_mapping);
145}
146
147static const struct vm_operations_struct ecryptfs_file_vm_ops = {
148 .close = ecryptfs_vma_close,
149 .fault = filemap_fault,
150};
151
152static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma)
153{
154 int rc;
155
156 rc = generic_file_mmap(file, vma);
157 if (!rc)
158 vma->vm_ops = &ecryptfs_file_vm_ops;
159
160 return rc;
161}
162
142struct kmem_cache *ecryptfs_file_info_cache; 163struct kmem_cache *ecryptfs_file_info_cache;
143 164
144/** 165/**
@@ -349,7 +370,7 @@ const struct file_operations ecryptfs_main_fops = {
349#ifdef CONFIG_COMPAT 370#ifdef CONFIG_COMPAT
350 .compat_ioctl = ecryptfs_compat_ioctl, 371 .compat_ioctl = ecryptfs_compat_ioctl,
351#endif 372#endif
352 .mmap = generic_file_mmap, 373 .mmap = ecryptfs_file_mmap,
353 .open = ecryptfs_open, 374 .open = ecryptfs_open,
354 .flush = ecryptfs_flush, 375 .flush = ecryptfs_flush,
355 .release = ecryptfs_release, 376 .release = ecryptfs_release,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index a36d327f152..32f90a3ae63 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -172,22 +172,23 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
172 * it. It will also update the eCryptfs directory inode to mimic the 172 * it. It will also update the eCryptfs directory inode to mimic the
173 * stat of the lower directory inode. 173 * stat of the lower directory inode.
174 * 174 *
175 * Returns zero on success; non-zero on error condition 175 * Returns the new eCryptfs inode on success; an ERR_PTR on error condition
176 */ 176 */
177static int 177static struct inode *
178ecryptfs_do_create(struct inode *directory_inode, 178ecryptfs_do_create(struct inode *directory_inode,
179 struct dentry *ecryptfs_dentry, int mode) 179 struct dentry *ecryptfs_dentry, int mode)
180{ 180{
181 int rc; 181 int rc;
182 struct dentry *lower_dentry; 182 struct dentry *lower_dentry;
183 struct dentry *lower_dir_dentry; 183 struct dentry *lower_dir_dentry;
184 struct inode *inode;
184 185
185 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); 186 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
186 lower_dir_dentry = lock_parent(lower_dentry); 187 lower_dir_dentry = lock_parent(lower_dentry);
187 if (IS_ERR(lower_dir_dentry)) { 188 if (IS_ERR(lower_dir_dentry)) {
188 ecryptfs_printk(KERN_ERR, "Error locking directory of " 189 ecryptfs_printk(KERN_ERR, "Error locking directory of "
189 "dentry\n"); 190 "dentry\n");
190 rc = PTR_ERR(lower_dir_dentry); 191 inode = ERR_CAST(lower_dir_dentry);
191 goto out; 192 goto out;
192 } 193 }
193 rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, 194 rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode,
@@ -195,20 +196,19 @@ ecryptfs_do_create(struct inode *directory_inode,
195 if (rc) { 196 if (rc) {
196 printk(KERN_ERR "%s: Failure to create dentry in lower fs; " 197 printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
197 "rc = [%d]\n", __func__, rc); 198 "rc = [%d]\n", __func__, rc);
199 inode = ERR_PTR(rc);
198 goto out_lock; 200 goto out_lock;
199 } 201 }
200 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, 202 inode = __ecryptfs_get_inode(lower_dentry->d_inode,
201 directory_inode->i_sb); 203 directory_inode->i_sb);
202 if (rc) { 204 if (IS_ERR(inode))
203 ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
204 goto out_lock; 205 goto out_lock;
205 }
206 fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); 206 fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode);
207 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); 207 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode);
208out_lock: 208out_lock:
209 unlock_dir(lower_dir_dentry); 209 unlock_dir(lower_dir_dentry);
210out: 210out:
211 return rc; 211 return inode;
212} 212}
213 213
214/** 214/**
@@ -219,26 +219,26 @@ out:
219 * 219 *
220 * Returns zero on success 220 * Returns zero on success
221 */ 221 */
222static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) 222static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
223 struct inode *ecryptfs_inode)
223{ 224{
224 struct ecryptfs_crypt_stat *crypt_stat = 225 struct ecryptfs_crypt_stat *crypt_stat =
225 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; 226 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
226 int rc = 0; 227 int rc = 0;
227 228
228 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { 229 if (S_ISDIR(ecryptfs_inode->i_mode)) {
229 ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); 230 ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
230 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); 231 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
231 goto out; 232 goto out;
232 } 233 }
233 ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); 234 ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n");
234 rc = ecryptfs_new_file_context(ecryptfs_dentry); 235 rc = ecryptfs_new_file_context(ecryptfs_inode);
235 if (rc) { 236 if (rc) {
236 ecryptfs_printk(KERN_ERR, "Error creating new file " 237 ecryptfs_printk(KERN_ERR, "Error creating new file "
237 "context; rc = [%d]\n", rc); 238 "context; rc = [%d]\n", rc);
238 goto out; 239 goto out;
239 } 240 }
240 rc = ecryptfs_get_lower_file(ecryptfs_dentry, 241 rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode);
241 ecryptfs_dentry->d_inode);
242 if (rc) { 242 if (rc) {
243 printk(KERN_ERR "%s: Error attempting to initialize " 243 printk(KERN_ERR "%s: Error attempting to initialize "
244 "the lower file for the dentry with name " 244 "the lower file for the dentry with name "
@@ -246,10 +246,10 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
246 ecryptfs_dentry->d_name.name, rc); 246 ecryptfs_dentry->d_name.name, rc);
247 goto out; 247 goto out;
248 } 248 }
249 rc = ecryptfs_write_metadata(ecryptfs_dentry); 249 rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode);
250 if (rc) 250 if (rc)
251 printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); 251 printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc);
252 ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); 252 ecryptfs_put_lower_file(ecryptfs_inode);
253out: 253out:
254 return rc; 254 return rc;
255} 255}
@@ -269,18 +269,28 @@ static int
269ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, 269ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
270 int mode, struct nameidata *nd) 270 int mode, struct nameidata *nd)
271{ 271{
272 struct inode *ecryptfs_inode;
272 int rc; 273 int rc;
273 274
274 /* ecryptfs_do_create() calls ecryptfs_interpose() */ 275 ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry,
275 rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode); 276 mode);
276 if (unlikely(rc)) { 277 if (unlikely(IS_ERR(ecryptfs_inode))) {
277 ecryptfs_printk(KERN_WARNING, "Failed to create file in" 278 ecryptfs_printk(KERN_WARNING, "Failed to create file in"
278 "lower filesystem\n"); 279 "lower filesystem\n");
280 rc = PTR_ERR(ecryptfs_inode);
279 goto out; 281 goto out;
280 } 282 }
281 /* At this point, a file exists on "disk"; we need to make sure 283 /* At this point, a file exists on "disk"; we need to make sure
282 * that this on disk file is prepared to be an ecryptfs file */ 284 * that this on disk file is prepared to be an ecryptfs file */
283 rc = ecryptfs_initialize_file(ecryptfs_dentry); 285 rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode);
286 if (rc) {
287 drop_nlink(ecryptfs_inode);
288 unlock_new_inode(ecryptfs_inode);
289 iput(ecryptfs_inode);
290 goto out;
291 }
292 d_instantiate(ecryptfs_dentry, ecryptfs_inode);
293 unlock_new_inode(ecryptfs_inode);
284out: 294out:
285 return rc; 295 return rc;
286} 296}
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index fa9a286c877..da42f32c49b 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -5,7 +5,7 @@
5# selected by any of the users. 5# selected by any of the users.
6config ORE 6config ORE
7 tristate 7 tristate
8 depends on EXOFS_FS 8 depends on EXOFS_FS || PNFS_OBJLAYOUT
9 select ASYNC_XOR 9 select ASYNC_XOR
10 default SCSI_OSD_ULD 10 default SCSI_OSD_ULD
11 11
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index fcfa86ae6fa..d271ad83720 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/module.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27#include <linux/lcm.h> 28#include <linux/lcm.h>
28 29
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 057b237b8b6..e6085ec192d 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -35,6 +35,7 @@
35#include <linux/parser.h> 35#include <linux/parser.h>
36#include <linux/vfs.h> 36#include <linux/vfs.h>
37#include <linux/random.h> 37#include <linux/random.h>
38#include <linux/module.h>
38#include <linux/exportfs.h> 39#include <linux/exportfs.h>
39#include <linux/slab.h> 40#include <linux/slab.h>
40 41
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f6dba4505f1..12ccacda44e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -565,7 +565,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
565 brelse(bitmap_bh); 565 brelse(bitmap_bh);
566 printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" 566 printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
567 ", computed = %llu, %llu\n", 567 ", computed = %llu, %llu\n",
568 EXT4_B2C(sbi, ext4_free_blocks_count(es)), 568 EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
569 desc_count, bitmap_count); 569 desc_count, bitmap_count);
570 return bitmap_count; 570 return bitmap_count;
571#else 571#else
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cc5a6da030a..848f436df29 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2270,6 +2270,7 @@ retry:
2270 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2270 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2271 "%ld pages, ino %lu; err %d", __func__, 2271 "%ld pages, ino %lu; err %d", __func__,
2272 wbc->nr_to_write, inode->i_ino, ret); 2272 wbc->nr_to_write, inode->i_ino, ret);
2273 blk_finish_plug(&plug);
2273 goto out_writepages; 2274 goto out_writepages;
2274 } 2275 }
2275 2276
@@ -2372,7 +2373,7 @@ static int ext4_nonda_switch(struct super_block *sb)
2372 * start pushing delalloc when 1/2 of free blocks are dirty. 2373 * start pushing delalloc when 1/2 of free blocks are dirty.
2373 */ 2374 */
2374 if (free_blocks < 2 * dirty_blocks) 2375 if (free_blocks < 2 * dirty_blocks)
2375 writeback_inodes_sb_if_idle(sb); 2376 writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
2376 2377
2377 return 0; 2378 return 0;
2378} 2379}
@@ -2806,8 +2807,8 @@ out:
2806 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 2807 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
2807 2808
2808 /* queue the work to convert unwritten extents to written */ 2809 /* queue the work to convert unwritten extents to written */
2809 queue_work(wq, &io_end->work);
2810 iocb->private = NULL; 2810 iocb->private = NULL;
2811 queue_work(wq, &io_end->work);
2811 2812
2812 /* XXX: probably should move into the real I/O completion handler */ 2813 /* XXX: probably should move into the real I/O completion handler */
2813 inode_dio_done(inode); 2814 inode_dio_done(inode);
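
In the end_io hunk above, clearing iocb->private is moved in front of queue_work(): once the work item is queued it may run and complete the iocb on another CPU, after which the submitter must not touch it. A small pthread sketch of the same hand-off rule, with invented names (the worker owns and frees the object, so the producer finishes all of its bookkeeping before the hand-off; build with cc -pthread):

#include <pthread.h>
#include <stdlib.h>

struct io_end { int done; };

static void *worker(void *arg)
{
	struct io_end *io = arg;

	io->done = 1;
	free(io);			/* completion may free the object immediately */
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct io_end *io = calloc(1, sizeof(*io));
	struct io_end *private = io;

	/* Drop our reference BEFORE handing the object to the worker;
	 * touching it after pthread_create() would race with free(). */
	private = NULL;
	pthread_create(&t, NULL, worker, io);

	pthread_join(t, NULL);
	return private ? 1 : 0;
}
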
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9953d80145a..3858767ec67 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1683,7 +1683,9 @@ static int parse_options(char *options, struct super_block *sb,
1683 data_opt = EXT4_MOUNT_WRITEBACK_DATA; 1683 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
1684 datacheck: 1684 datacheck:
1685 if (is_remount) { 1685 if (is_remount) {
1686 if (test_opt(sb, DATA_FLAGS) != data_opt) { 1686 if (!sbi->s_journal)
1687 ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
1688 else if (test_opt(sb, DATA_FLAGS) != data_opt) {
1687 ext4_msg(sb, KERN_ERR, 1689 ext4_msg(sb, KERN_ERR,
1688 "Cannot change data mode on remount"); 1690 "Cannot change data mode on remount");
1689 return 0; 1691 return 0;
@@ -3099,8 +3101,6 @@ static void ext4_destroy_lazyinit_thread(void)
3099} 3101}
3100 3102
3101static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3103static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3102 __releases(kernel_lock)
3103 __acquires(kernel_lock)
3104{ 3104{
3105 char *orig_data = kstrdup(data, GFP_KERNEL); 3105 char *orig_data = kstrdup(data, GFP_KERNEL);
3106 struct buffer_head *bh; 3106 struct buffer_head *bh;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 04cf3b91e50..ac86f8b3e3c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -41,11 +41,23 @@ struct wb_writeback_work {
41 unsigned int for_kupdate:1; 41 unsigned int for_kupdate:1;
42 unsigned int range_cyclic:1; 42 unsigned int range_cyclic:1;
43 unsigned int for_background:1; 43 unsigned int for_background:1;
44 enum wb_reason reason; /* why was writeback initiated? */
44 45
45 struct list_head list; /* pending work list */ 46 struct list_head list; /* pending work list */
46 struct completion *done; /* set if the caller waits */ 47 struct completion *done; /* set if the caller waits */
47}; 48};
48 49
50const char *wb_reason_name[] = {
51 [WB_REASON_BACKGROUND] = "background",
52 [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages",
53 [WB_REASON_SYNC] = "sync",
54 [WB_REASON_PERIODIC] = "periodic",
55 [WB_REASON_LAPTOP_TIMER] = "laptop_timer",
56 [WB_REASON_FREE_MORE_MEM] = "free_more_memory",
57 [WB_REASON_FS_FREE_SPACE] = "fs_free_space",
58 [WB_REASON_FORKER_THREAD] = "forker_thread"
59};
60
49/* 61/*
50 * Include the creation of the trace points after defining the 62 * Include the creation of the trace points after defining the
51 * wb_writeback_work structure so that the definition remains local to this 63 * wb_writeback_work structure so that the definition remains local to this
@@ -115,7 +127,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
115 127
116static void 128static void
117__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, 129__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
118 bool range_cyclic) 130 bool range_cyclic, enum wb_reason reason)
119{ 131{
120 struct wb_writeback_work *work; 132 struct wb_writeback_work *work;
121 133
@@ -135,6 +147,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
135 work->sync_mode = WB_SYNC_NONE; 147 work->sync_mode = WB_SYNC_NONE;
136 work->nr_pages = nr_pages; 148 work->nr_pages = nr_pages;
137 work->range_cyclic = range_cyclic; 149 work->range_cyclic = range_cyclic;
150 work->reason = reason;
138 151
139 bdi_queue_work(bdi, work); 152 bdi_queue_work(bdi, work);
140} 153}
@@ -143,6 +156,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
143 * bdi_start_writeback - start writeback 156 * bdi_start_writeback - start writeback
144 * @bdi: the backing device to write from 157 * @bdi: the backing device to write from
145 * @nr_pages: the number of pages to write 158 * @nr_pages: the number of pages to write
159 * @reason: reason why some writeback work was initiated
146 * 160 *
147 * Description: 161 * Description:
148 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 162 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
@@ -150,9 +164,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
150 * completion. Caller need not hold sb s_umount semaphore. 164 * completion. Caller need not hold sb s_umount semaphore.
151 * 165 *
152 */ 166 */
153void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 167void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
168 enum wb_reason reason)
154{ 169{
155 __bdi_start_writeback(bdi, nr_pages, true); 170 __bdi_start_writeback(bdi, nr_pages, true, reason);
156} 171}
157 172
158/** 173/**
@@ -251,7 +266,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
251 */ 266 */
252static int move_expired_inodes(struct list_head *delaying_queue, 267static int move_expired_inodes(struct list_head *delaying_queue,
253 struct list_head *dispatch_queue, 268 struct list_head *dispatch_queue,
254 unsigned long *older_than_this) 269 struct wb_writeback_work *work)
255{ 270{
256 LIST_HEAD(tmp); 271 LIST_HEAD(tmp);
257 struct list_head *pos, *node; 272 struct list_head *pos, *node;
@@ -262,8 +277,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
262 277
263 while (!list_empty(delaying_queue)) { 278 while (!list_empty(delaying_queue)) {
264 inode = wb_inode(delaying_queue->prev); 279 inode = wb_inode(delaying_queue->prev);
265 if (older_than_this && 280 if (work->older_than_this &&
266 inode_dirtied_after(inode, *older_than_this)) 281 inode_dirtied_after(inode, *work->older_than_this))
267 break; 282 break;
268 if (sb && sb != inode->i_sb) 283 if (sb && sb != inode->i_sb)
269 do_sb_sort = 1; 284 do_sb_sort = 1;
@@ -302,13 +317,13 @@ out:
302 * | 317 * |
303 * +--> dequeue for IO 318 * +--> dequeue for IO
304 */ 319 */
305static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 320static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
306{ 321{
307 int moved; 322 int moved;
308 assert_spin_locked(&wb->list_lock); 323 assert_spin_locked(&wb->list_lock);
309 list_splice_init(&wb->b_more_io, &wb->b_io); 324 list_splice_init(&wb->b_more_io, &wb->b_io);
310 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 325 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
311 trace_writeback_queue_io(wb, older_than_this, moved); 326 trace_writeback_queue_io(wb, work, moved);
312} 327}
313 328
314static int write_inode(struct inode *inode, struct writeback_control *wbc) 329static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -641,31 +656,40 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
641 return wrote; 656 return wrote;
642} 657}
643 658
644long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) 659long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
660 enum wb_reason reason)
645{ 661{
646 struct wb_writeback_work work = { 662 struct wb_writeback_work work = {
647 .nr_pages = nr_pages, 663 .nr_pages = nr_pages,
648 .sync_mode = WB_SYNC_NONE, 664 .sync_mode = WB_SYNC_NONE,
649 .range_cyclic = 1, 665 .range_cyclic = 1,
666 .reason = reason,
650 }; 667 };
651 668
652 spin_lock(&wb->list_lock); 669 spin_lock(&wb->list_lock);
653 if (list_empty(&wb->b_io)) 670 if (list_empty(&wb->b_io))
654 queue_io(wb, NULL); 671 queue_io(wb, &work);
655 __writeback_inodes_wb(wb, &work); 672 __writeback_inodes_wb(wb, &work);
656 spin_unlock(&wb->list_lock); 673 spin_unlock(&wb->list_lock);
657 674
658 return nr_pages - work.nr_pages; 675 return nr_pages - work.nr_pages;
659} 676}
660 677
661static inline bool over_bground_thresh(void) 678static bool over_bground_thresh(struct backing_dev_info *bdi)
662{ 679{
663 unsigned long background_thresh, dirty_thresh; 680 unsigned long background_thresh, dirty_thresh;
664 681
665 global_dirty_limits(&background_thresh, &dirty_thresh); 682 global_dirty_limits(&background_thresh, &dirty_thresh);
666 683
667 return (global_page_state(NR_FILE_DIRTY) + 684 if (global_page_state(NR_FILE_DIRTY) +
668 global_page_state(NR_UNSTABLE_NFS) > background_thresh); 685 global_page_state(NR_UNSTABLE_NFS) > background_thresh)
686 return true;
687
688 if (bdi_stat(bdi, BDI_RECLAIMABLE) >
689 bdi_dirty_limit(bdi, background_thresh))
690 return true;
691
692 return false;
669} 693}
670 694
671/* 695/*
@@ -675,7 +699,7 @@ static inline bool over_bground_thresh(void)
675static void wb_update_bandwidth(struct bdi_writeback *wb, 699static void wb_update_bandwidth(struct bdi_writeback *wb,
676 unsigned long start_time) 700 unsigned long start_time)
677{ 701{
678 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); 702 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
679} 703}
680 704
681/* 705/*
@@ -727,7 +751,7 @@ static long wb_writeback(struct bdi_writeback *wb,
727 * For background writeout, stop when we are below the 751 * For background writeout, stop when we are below the
728 * background dirty threshold 752 * background dirty threshold
729 */ 753 */
730 if (work->for_background && !over_bground_thresh()) 754 if (work->for_background && !over_bground_thresh(wb->bdi))
731 break; 755 break;
732 756
733 if (work->for_kupdate) { 757 if (work->for_kupdate) {
@@ -738,7 +762,7 @@ static long wb_writeback(struct bdi_writeback *wb,
738 762
739 trace_writeback_start(wb->bdi, work); 763 trace_writeback_start(wb->bdi, work);
740 if (list_empty(&wb->b_io)) 764 if (list_empty(&wb->b_io))
741 queue_io(wb, work->older_than_this); 765 queue_io(wb, work);
742 if (work->sb) 766 if (work->sb)
743 progress = writeback_sb_inodes(work->sb, wb, work); 767 progress = writeback_sb_inodes(work->sb, wb, work);
744 else 768 else
@@ -811,13 +835,14 @@ static unsigned long get_nr_dirty_pages(void)
811 835
812static long wb_check_background_flush(struct bdi_writeback *wb) 836static long wb_check_background_flush(struct bdi_writeback *wb)
813{ 837{
814 if (over_bground_thresh()) { 838 if (over_bground_thresh(wb->bdi)) {
815 839
816 struct wb_writeback_work work = { 840 struct wb_writeback_work work = {
817 .nr_pages = LONG_MAX, 841 .nr_pages = LONG_MAX,
818 .sync_mode = WB_SYNC_NONE, 842 .sync_mode = WB_SYNC_NONE,
819 .for_background = 1, 843 .for_background = 1,
820 .range_cyclic = 1, 844 .range_cyclic = 1,
845 .reason = WB_REASON_BACKGROUND,
821 }; 846 };
822 847
823 return wb_writeback(wb, &work); 848 return wb_writeback(wb, &work);
@@ -851,6 +876,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
851 .sync_mode = WB_SYNC_NONE, 876 .sync_mode = WB_SYNC_NONE,
852 .for_kupdate = 1, 877 .for_kupdate = 1,
853 .range_cyclic = 1, 878 .range_cyclic = 1,
879 .reason = WB_REASON_PERIODIC,
854 }; 880 };
855 881
856 return wb_writeback(wb, &work); 882 return wb_writeback(wb, &work);
@@ -969,7 +995,7 @@ int bdi_writeback_thread(void *data)
969 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back 995 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
970 * the whole world. 996 * the whole world.
971 */ 997 */
972void wakeup_flusher_threads(long nr_pages) 998void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
973{ 999{
974 struct backing_dev_info *bdi; 1000 struct backing_dev_info *bdi;
975 1001
@@ -982,7 +1008,7 @@ void wakeup_flusher_threads(long nr_pages)
982 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { 1008 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
983 if (!bdi_has_dirty_io(bdi)) 1009 if (!bdi_has_dirty_io(bdi))
984 continue; 1010 continue;
985 __bdi_start_writeback(bdi, nr_pages, false); 1011 __bdi_start_writeback(bdi, nr_pages, false, reason);
986 } 1012 }
987 rcu_read_unlock(); 1013 rcu_read_unlock();
988} 1014}
@@ -1198,12 +1224,15 @@ static void wait_sb_inodes(struct super_block *sb)
1198 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block 1224 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
1199 * @sb: the superblock 1225 * @sb: the superblock
1200 * @nr: the number of pages to write 1226 * @nr: the number of pages to write
 1227 * @reason: reason why some writeback work was initiated
1201 * 1228 *
1202 * Start writeback on some inodes on this super_block. No guarantees are made 1229 * Start writeback on some inodes on this super_block. No guarantees are made
1203 * on how many (if any) will be written, and this function does not wait 1230 * on how many (if any) will be written, and this function does not wait
1204 * for IO completion of submitted IO. 1231 * for IO completion of submitted IO.
1205 */ 1232 */
1206void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) 1233void writeback_inodes_sb_nr(struct super_block *sb,
1234 unsigned long nr,
1235 enum wb_reason reason)
1207{ 1236{
1208 DECLARE_COMPLETION_ONSTACK(done); 1237 DECLARE_COMPLETION_ONSTACK(done);
1209 struct wb_writeback_work work = { 1238 struct wb_writeback_work work = {
@@ -1212,6 +1241,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
1212 .tagged_writepages = 1, 1241 .tagged_writepages = 1,
1213 .done = &done, 1242 .done = &done,
1214 .nr_pages = nr, 1243 .nr_pages = nr,
1244 .reason = reason,
1215 }; 1245 };
1216 1246
1217 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1247 WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@ -1223,29 +1253,31 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr);
1223/** 1253/**
1224 * writeback_inodes_sb - writeback dirty inodes from given super_block 1254 * writeback_inodes_sb - writeback dirty inodes from given super_block
1225 * @sb: the superblock 1255 * @sb: the superblock
1256 * @reason: reason why some writeback work was initiated
1226 * 1257 *
1227 * Start writeback on some inodes on this super_block. No guarantees are made 1258 * Start writeback on some inodes on this super_block. No guarantees are made
1228 * on how many (if any) will be written, and this function does not wait 1259 * on how many (if any) will be written, and this function does not wait
1229 * for IO completion of submitted IO. 1260 * for IO completion of submitted IO.
1230 */ 1261 */
1231void writeback_inodes_sb(struct super_block *sb) 1262void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
1232{ 1263{
1233 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); 1264 return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
1234} 1265}
1235EXPORT_SYMBOL(writeback_inodes_sb); 1266EXPORT_SYMBOL(writeback_inodes_sb);
1236 1267
1237/** 1268/**
1238 * writeback_inodes_sb_if_idle - start writeback if none underway 1269 * writeback_inodes_sb_if_idle - start writeback if none underway
1239 * @sb: the superblock 1270 * @sb: the superblock
1271 * @reason: reason why some writeback work was initiated
1240 * 1272 *
1241 * Invoke writeback_inodes_sb if no writeback is currently underway. 1273 * Invoke writeback_inodes_sb if no writeback is currently underway.
1242 * Returns 1 if writeback was started, 0 if not. 1274 * Returns 1 if writeback was started, 0 if not.
1243 */ 1275 */
1244int writeback_inodes_sb_if_idle(struct super_block *sb) 1276int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
1245{ 1277{
1246 if (!writeback_in_progress(sb->s_bdi)) { 1278 if (!writeback_in_progress(sb->s_bdi)) {
1247 down_read(&sb->s_umount); 1279 down_read(&sb->s_umount);
1248 writeback_inodes_sb(sb); 1280 writeback_inodes_sb(sb, reason);
1249 up_read(&sb->s_umount); 1281 up_read(&sb->s_umount);
1250 return 1; 1282 return 1;
1251 } else 1283 } else
@@ -1257,16 +1289,18 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1257 * writeback_inodes_sb_if_idle - start writeback if none underway 1289 * writeback_inodes_sb_if_idle - start writeback if none underway
1258 * @sb: the superblock 1290 * @sb: the superblock
1259 * @nr: the number of pages to write 1291 * @nr: the number of pages to write
1292 * @reason: reason why some writeback work was initiated
1260 * 1293 *
1261 * Invoke writeback_inodes_sb if no writeback is currently underway. 1294 * Invoke writeback_inodes_sb if no writeback is currently underway.
1262 * Returns 1 if writeback was started, 0 if not. 1295 * Returns 1 if writeback was started, 0 if not.
1263 */ 1296 */
1264int writeback_inodes_sb_nr_if_idle(struct super_block *sb, 1297int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
1265 unsigned long nr) 1298 unsigned long nr,
1299 enum wb_reason reason)
1266{ 1300{
1267 if (!writeback_in_progress(sb->s_bdi)) { 1301 if (!writeback_in_progress(sb->s_bdi)) {
1268 down_read(&sb->s_umount); 1302 down_read(&sb->s_umount);
1269 writeback_inodes_sb_nr(sb, nr); 1303 writeback_inodes_sb_nr(sb, nr, reason);
1270 up_read(&sb->s_umount); 1304 up_read(&sb->s_umount);
1271 return 1; 1305 return 1;
1272 } else 1306 } else
@@ -1290,6 +1324,7 @@ void sync_inodes_sb(struct super_block *sb)
1290 .nr_pages = LONG_MAX, 1324 .nr_pages = LONG_MAX,
1291 .range_cyclic = 0, 1325 .range_cyclic = 0,
1292 .done = &done, 1326 .done = &done,
1327 .reason = WB_REASON_SYNC,
1293 }; 1328 };
1294 1329
1295 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1330 WARN_ON(!rwsem_is_locked(&sb->s_umount));
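
The fs-writeback.c changes thread an enum wb_reason through every writeback work item so tracepoints can report why writeback was started, with wb_reason_name[] as the parallel string table. A compact userspace sketch of that enum-plus-name-table pattern; the values and the queue_writeback() helper are illustrative, not the kernel API:

#include <stdio.h>

enum wb_reason { WB_REASON_BACKGROUND, WB_REASON_SYNC, WB_REASON_PERIODIC, WB_REASON_MAX };

static const char *wb_reason_name[WB_REASON_MAX] = {
	[WB_REASON_BACKGROUND]	= "background",
	[WB_REASON_SYNC]	= "sync",
	[WB_REASON_PERIODIC]	= "periodic",
};

struct work { long nr_pages; enum wb_reason reason; };

static void queue_writeback(long nr_pages, enum wb_reason reason)
{
	struct work w = { .nr_pages = nr_pages, .reason = reason };

	/* A tracepoint would report the symbolic reason here. */
	printf("writeback %ld pages, reason=%s\n", w.nr_pages, wb_reason_name[w.reason]);
}

int main(void)
{
	queue_writeback(1024, WB_REASON_SYNC);
	return 0;
}
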
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b6cca47f7b0..3426521f320 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -47,6 +47,7 @@
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/spinlock.h> 48#include <linux/spinlock.h>
49#include <linux/stat.h> 49#include <linux/stat.h>
50#include <linux/module.h>
50 51
51#include "fuse_i.h" 52#include "fuse_i.h"
52 53
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7e823bbd245..cb23c2be731 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -14,6 +14,7 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/kthread.h> 16#include <linux/kthread.h>
17#include <linux/export.h>
17#include <linux/namei.h> 18#include <linux/namei.h>
18#include <linux/mount.h> 19#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c
index e673a88b8ae..b1ce4c7ad3f 100644
--- a/fs/hfs/trans.c
+++ b/fs/hfs/trans.c
@@ -40,6 +40,8 @@ int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in)
40 40
41 src = in->name; 41 src = in->name;
42 srclen = in->len; 42 srclen = in->len;
43 if (srclen > HFS_NAMELEN)
44 srclen = HFS_NAMELEN;
43 dst = out; 45 dst = out;
44 dstlen = HFS_MAX_NAMELEN; 46 dstlen = HFS_MAX_NAMELEN;
45 if (nls_io) { 47 if (nls_io) {
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 7da2a06508e..f79dab83e17 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -21,6 +21,7 @@
21 */ 21 */
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/export.h>
24#include <linux/ioprio.h> 25#include <linux/ioprio.h>
25#include <linux/blkdev.h> 26#include <linux/blkdev.h>
26#include <linux/capability.h> 27#include <linux/capability.h>
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index de4247021d2..5b6c9d1a2fb 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -53,6 +53,78 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this,
53 return 0; 53 return 0;
54} 54}
55 55
56/*
57 * jffs2_selected_compress:
58 * @compr: Explicit compression type to use (ie, JFFS2_COMPR_ZLIB).
59 * If 0, just take the first available compression mode.
60 * @data_in: Pointer to uncompressed data
61 * @cpage_out: Pointer to returned pointer to buffer for compressed data
62 * @datalen: On entry, holds the amount of data available for compression.
63 * On exit, expected to hold the amount of data actually compressed.
64 * @cdatalen: On entry, holds the amount of space available for compressed
65 * data. On exit, expected to hold the actual size of the compressed
66 * data.
67 *
68 * Returns: the compression type used. Zero is used to show that the data
69 * could not be compressed; probably because we couldn't find the requested
70 * compression mode.
71 */
72static int jffs2_selected_compress(u8 compr, unsigned char *data_in,
73 unsigned char **cpage_out, u32 *datalen, u32 *cdatalen)
74{
75 struct jffs2_compressor *this;
76 int err, ret = JFFS2_COMPR_NONE;
77 uint32_t orig_slen, orig_dlen;
78 char *output_buf;
79
80 output_buf = kmalloc(*cdatalen, GFP_KERNEL);
81 if (!output_buf) {
82 printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n");
83 return ret;
84 }
85 orig_slen = *datalen;
86 orig_dlen = *cdatalen;
87 spin_lock(&jffs2_compressor_list_lock);
88 list_for_each_entry(this, &jffs2_compressor_list, list) {
89 /* Skip decompress-only and disabled modules */
90 if (!this->compress || this->disabled)
91 continue;
92
93 /* Skip if not the desired compression type */
94 if (compr && (compr != this->compr))
95 continue;
96
97 /*
98 * Either compression type was unspecified, or we found our
99 * compressor; either way, we're good to go.
100 */
101 this->usecount++;
102 spin_unlock(&jffs2_compressor_list_lock);
103
104 *datalen = orig_slen;
105 *cdatalen = orig_dlen;
106 err = this->compress(data_in, output_buf, datalen, cdatalen);
107
108 spin_lock(&jffs2_compressor_list_lock);
109 this->usecount--;
110 if (!err) {
111 /* Success */
112 ret = this->compr;
113 this->stat_compr_blocks++;
114 this->stat_compr_orig_size += *datalen;
115 this->stat_compr_new_size += *cdatalen;
116 break;
117 }
118 }
119 spin_unlock(&jffs2_compressor_list_lock);
120 if (ret == JFFS2_COMPR_NONE)
121 kfree(output_buf);
122 else
123 *cpage_out = output_buf;
124
125 return ret;
126}
127
56/* jffs2_compress: 128/* jffs2_compress:
57 * @data_in: Pointer to uncompressed data 129 * @data_in: Pointer to uncompressed data
58 * @cpage_out: Pointer to returned pointer to buffer for compressed data 130 * @cpage_out: Pointer to returned pointer to buffer for compressed data
@@ -76,47 +148,23 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
76 uint32_t *datalen, uint32_t *cdatalen) 148 uint32_t *datalen, uint32_t *cdatalen)
77{ 149{
78 int ret = JFFS2_COMPR_NONE; 150 int ret = JFFS2_COMPR_NONE;
79 int compr_ret; 151 int mode, compr_ret;
80 struct jffs2_compressor *this, *best=NULL; 152 struct jffs2_compressor *this, *best=NULL;
81 unsigned char *output_buf = NULL, *tmp_buf; 153 unsigned char *output_buf = NULL, *tmp_buf;
82 uint32_t orig_slen, orig_dlen; 154 uint32_t orig_slen, orig_dlen;
83 uint32_t best_slen=0, best_dlen=0; 155 uint32_t best_slen=0, best_dlen=0;
84 156
85 switch (jffs2_compression_mode) { 157 if (c->mount_opts.override_compr)
158 mode = c->mount_opts.compr;
159 else
160 mode = jffs2_compression_mode;
161
162 switch (mode) {
86 case JFFS2_COMPR_MODE_NONE: 163 case JFFS2_COMPR_MODE_NONE:
87 break; 164 break;
88 case JFFS2_COMPR_MODE_PRIORITY: 165 case JFFS2_COMPR_MODE_PRIORITY:
89 output_buf = kmalloc(*cdatalen,GFP_KERNEL); 166 ret = jffs2_selected_compress(0, data_in, cpage_out, datalen,
90 if (!output_buf) { 167 cdatalen);
91 printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n");
92 goto out;
93 }
94 orig_slen = *datalen;
95 orig_dlen = *cdatalen;
96 spin_lock(&jffs2_compressor_list_lock);
97 list_for_each_entry(this, &jffs2_compressor_list, list) {
98 /* Skip decompress-only backwards-compatibility and disabled modules */
99 if ((!this->compress)||(this->disabled))
100 continue;
101
102 this->usecount++;
103 spin_unlock(&jffs2_compressor_list_lock);
104 *datalen = orig_slen;
105 *cdatalen = orig_dlen;
106 compr_ret = this->compress(data_in, output_buf, datalen, cdatalen);
107 spin_lock(&jffs2_compressor_list_lock);
108 this->usecount--;
109 if (!compr_ret) {
110 ret = this->compr;
111 this->stat_compr_blocks++;
112 this->stat_compr_orig_size += *datalen;
113 this->stat_compr_new_size += *cdatalen;
114 break;
115 }
116 }
117 spin_unlock(&jffs2_compressor_list_lock);
118 if (ret == JFFS2_COMPR_NONE)
119 kfree(output_buf);
120 break; 168 break;
121 case JFFS2_COMPR_MODE_SIZE: 169 case JFFS2_COMPR_MODE_SIZE:
122 case JFFS2_COMPR_MODE_FAVOURLZO: 170 case JFFS2_COMPR_MODE_FAVOURLZO:
@@ -174,22 +222,28 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
174 best->stat_compr_orig_size += best_slen; 222 best->stat_compr_orig_size += best_slen;
175 best->stat_compr_new_size += best_dlen; 223 best->stat_compr_new_size += best_dlen;
176 ret = best->compr; 224 ret = best->compr;
225 *cpage_out = output_buf;
177 } 226 }
178 spin_unlock(&jffs2_compressor_list_lock); 227 spin_unlock(&jffs2_compressor_list_lock);
179 break; 228 break;
229 case JFFS2_COMPR_MODE_FORCELZO:
230 ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in,
231 cpage_out, datalen, cdatalen);
232 break;
233 case JFFS2_COMPR_MODE_FORCEZLIB:
234 ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in,
235 cpage_out, datalen, cdatalen);
236 break;
180 default: 237 default:
181 printk(KERN_ERR "JFFS2: unknown compression mode.\n"); 238 printk(KERN_ERR "JFFS2: unknown compression mode.\n");
182 } 239 }
183 out: 240
184 if (ret == JFFS2_COMPR_NONE) { 241 if (ret == JFFS2_COMPR_NONE) {
185 *cpage_out = data_in; 242 *cpage_out = data_in;
186 *datalen = *cdatalen; 243 *datalen = *cdatalen;
187 none_stat_compr_blocks++; 244 none_stat_compr_blocks++;
188 none_stat_compr_size += *datalen; 245 none_stat_compr_size += *datalen;
189 } 246 }
190 else {
191 *cpage_out = output_buf;
192 }
193 return ret; 247 return ret;
194} 248}
195 249
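
jffs2_selected_compress() folds the old PRIORITY-mode loop into a helper that either takes the first usable compressor (compr == 0) or accepts only the one explicitly requested, which is what the new FORCELZO/FORCEZLIB modes rely on. A stripped-down userspace sketch of that selection rule, with invented compressor ids and no actual compression:

#include <stdio.h>

enum { COMPR_NONE = 0, COMPR_ZLIB = 1, COMPR_LZO = 2 };

struct compressor { int id; int disabled; };

static struct compressor compressors[] = {
	{ COMPR_ZLIB, 0 },
	{ COMPR_LZO,  0 },
};

/* wanted == 0: take the first enabled compressor; otherwise require a match. */
static int select_compressor(int wanted)
{
	unsigned int i;

	for (i = 0; i < sizeof(compressors) / sizeof(compressors[0]); i++) {
		if (compressors[i].disabled)
			continue;
		if (wanted && wanted != compressors[i].id)
			continue;
		return compressors[i].id;
	}
	return COMPR_NONE;	/* nothing suitable: store the data uncompressed */
}

int main(void)
{
	printf("priority mode -> %d\n", select_compressor(0));
	printf("forced lzo    -> %d\n", select_compressor(COMPR_LZO));
	return 0;
}
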
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 13bb7597ab3..5e91d578f4e 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -40,6 +40,8 @@
40#define JFFS2_COMPR_MODE_PRIORITY 1 40#define JFFS2_COMPR_MODE_PRIORITY 1
41#define JFFS2_COMPR_MODE_SIZE 2 41#define JFFS2_COMPR_MODE_SIZE 2
42#define JFFS2_COMPR_MODE_FAVOURLZO 3 42#define JFFS2_COMPR_MODE_FAVOURLZO 3
43#define JFFS2_COMPR_MODE_FORCELZO 4
44#define JFFS2_COMPR_MODE_FORCEZLIB 5
43 45
44#define FAVOUR_LZO_PERCENT 80 46#define FAVOUR_LZO_PERCENT 80
45 47
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 7286e44ac66..4b8afe39a87 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -379,7 +379,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
379 jffs2_do_setattr(inode, &iattr); 379 jffs2_do_setattr(inode, &iattr);
380} 380}
381 381
382int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) 382int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data)
383{ 383{
384 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 384 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
385 385
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 0bc6a6c80a5..55a0c1dcead 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -29,6 +29,11 @@
29 29
30struct jffs2_inodirty; 30struct jffs2_inodirty;
31 31
32struct jffs2_mount_opts {
33 bool override_compr;
34 unsigned int compr;
35};
36
32/* A struct for the overall file system control. Pointers to 37/* A struct for the overall file system control. Pointers to
33 jffs2_sb_info structs are named `c' in the source code. 38 jffs2_sb_info structs are named `c' in the source code.
34 Nee jffs_control 39 Nee jffs_control
@@ -126,6 +131,7 @@ struct jffs2_sb_info {
126#endif 131#endif
127 132
128 struct jffs2_summary *summary; /* Summary information */ 133 struct jffs2_summary *summary; /* Summary information */
134 struct jffs2_mount_opts mount_opts;
129 135
130#ifdef CONFIG_JFFS2_FS_XATTR 136#ifdef CONFIG_JFFS2_FS_XATTR
131#define XATTRINDEX_HASHSIZE (57) 137#define XATTRINDEX_HASHSIZE (57)
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 6c1755c59c0..ab65ee3ec85 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -176,7 +176,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags);
176struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, 176struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode,
177 struct jffs2_raw_inode *ri); 177 struct jffs2_raw_inode *ri);
178int jffs2_statfs (struct dentry *, struct kstatfs *); 178int jffs2_statfs (struct dentry *, struct kstatfs *);
179int jffs2_remount_fs (struct super_block *, int *, char *); 179int jffs2_do_remount_fs(struct super_block *, int *, char *);
180int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); 180int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
181void jffs2_gc_release_inode(struct jffs2_sb_info *c, 181void jffs2_gc_release_inode(struct jffs2_sb_info *c,
182 struct jffs2_inode_info *f); 182 struct jffs2_inode_info *f);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 8d8cd3419d0..28107ca136e 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -275,9 +275,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
275 else 275 else
276 c->mtd->unpoint(c->mtd, 0, c->mtd->size); 276 c->mtd->unpoint(c->mtd, 0, c->mtd->size);
277#endif 277#endif
278 if (s) 278 kfree(s);
279 kfree(s);
280
281 return ret; 279 return ret;
282} 280}
283 281
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 853b8e30008..e7e97445411 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -17,11 +17,13 @@
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/err.h> 18#include <linux/err.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/parser.h>
20#include <linux/jffs2.h> 21#include <linux/jffs2.h>
21#include <linux/pagemap.h> 22#include <linux/pagemap.h>
22#include <linux/mtd/super.h> 23#include <linux/mtd/super.h>
23#include <linux/ctype.h> 24#include <linux/ctype.h>
24#include <linux/namei.h> 25#include <linux/namei.h>
26#include <linux/seq_file.h>
25#include <linux/exportfs.h> 27#include <linux/exportfs.h>
26#include "compr.h" 28#include "compr.h"
27#include "nodelist.h" 29#include "nodelist.h"
@@ -75,6 +77,37 @@ static void jffs2_write_super(struct super_block *sb)
75 unlock_super(sb); 77 unlock_super(sb);
76} 78}
77 79
80static const char *jffs2_compr_name(unsigned int compr)
81{
82 switch (compr) {
83 case JFFS2_COMPR_MODE_NONE:
84 return "none";
85#ifdef CONFIG_JFFS2_LZO
86 case JFFS2_COMPR_MODE_FORCELZO:
87 return "lzo";
88#endif
89#ifdef CONFIG_JFFS2_ZLIB
90 case JFFS2_COMPR_MODE_FORCEZLIB:
91 return "zlib";
92#endif
93 default:
94 /* should never happen; programmer error */
95 WARN_ON(1);
96 return "";
97 }
98}
99
100static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt)
101{
102 struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb);
103 struct jffs2_mount_opts *opts = &c->mount_opts;
104
105 if (opts->override_compr)
106 seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr));
107
108 return 0;
109}
110
78static int jffs2_sync_fs(struct super_block *sb, int wait) 111static int jffs2_sync_fs(struct super_block *sb, int wait)
79{ 112{
80 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 113 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
@@ -133,6 +166,85 @@ static const struct export_operations jffs2_export_ops = {
133 .fh_to_parent = jffs2_fh_to_parent, 166 .fh_to_parent = jffs2_fh_to_parent,
134}; 167};
135 168
169/*
170 * JFFS2 mount options.
171 *
172 * Opt_override_compr: override default compressor
173 * Opt_err: just end of array marker
174 */
175enum {
176 Opt_override_compr,
177 Opt_err,
178};
179
180static const match_table_t tokens = {
181 {Opt_override_compr, "compr=%s"},
182 {Opt_err, NULL},
183};
184
185static int jffs2_parse_options(struct jffs2_sb_info *c, char *data)
186{
187 substring_t args[MAX_OPT_ARGS];
188 char *p, *name;
189
190 if (!data)
191 return 0;
192
193 while ((p = strsep(&data, ","))) {
194 int token;
195
196 if (!*p)
197 continue;
198
199 token = match_token(p, tokens, args);
200 switch (token) {
201 case Opt_override_compr:
202 name = match_strdup(&args[0]);
203
204 if (!name)
205 return -ENOMEM;
206 if (!strcmp(name, "none"))
207 c->mount_opts.compr = JFFS2_COMPR_MODE_NONE;
208#ifdef CONFIG_JFFS2_LZO
209 else if (!strcmp(name, "lzo"))
210 c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO;
211#endif
212#ifdef CONFIG_JFFS2_ZLIB
213 else if (!strcmp(name, "zlib"))
214 c->mount_opts.compr =
215 JFFS2_COMPR_MODE_FORCEZLIB;
216#endif
217 else {
218 printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"",
219 name);
220 kfree(name);
221 return -EINVAL;
222 }
223 kfree(name);
224 c->mount_opts.override_compr = true;
225 break;
226 default:
227 printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n",
228 p);
229 return -EINVAL;
230 }
231 }
232
233 return 0;
234}
235
236static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
237{
238 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
239 int err;
240
241 err = jffs2_parse_options(c, data);
242 if (err)
243 return -EINVAL;
244
245 return jffs2_do_remount_fs(sb, flags, data);
246}
247
136static const struct super_operations jffs2_super_operations = 248static const struct super_operations jffs2_super_operations =
137{ 249{
138 .alloc_inode = jffs2_alloc_inode, 250 .alloc_inode = jffs2_alloc_inode,
@@ -143,6 +255,7 @@ static const struct super_operations jffs2_super_operations =
143 .remount_fs = jffs2_remount_fs, 255 .remount_fs = jffs2_remount_fs,
144 .evict_inode = jffs2_evict_inode, 256 .evict_inode = jffs2_evict_inode,
145 .dirty_inode = jffs2_dirty_inode, 257 .dirty_inode = jffs2_dirty_inode,
258 .show_options = jffs2_show_options,
146 .sync_fs = jffs2_sync_fs, 259 .sync_fs = jffs2_sync_fs,
147}; 260};
148 261
@@ -166,6 +279,12 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
166 c->os_priv = sb; 279 c->os_priv = sb;
167 sb->s_fs_info = c; 280 sb->s_fs_info = c;
168 281
282 ret = jffs2_parse_options(c, data);
283 if (ret) {
284 kfree(c);
285 return -EINVAL;
286 }
287
169 /* Initialize JFFS2 superblock locks, the further initialization will 288 /* Initialize JFFS2 superblock locks, the further initialization will
170 * be done later */ 289 * be done later */
171 mutex_init(&c->alloc_sem); 290 mutex_init(&c->alloc_sem);
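
The new jffs2_parse_options() walks the option string with strsep() and match_token(), so mounting with "-o compr=lzo" sets c->mount_opts.override_compr and the forced compression mode, and jffs2_show_options() echoes it back in /proc/mounts. A self-contained sketch of the same strsep()-based loop without the kernel's match_table machinery; the helper and buffer handling are simplified (strsep() is the glibc/BSD function the kernel code also uses):

#include <stdio.h>
#include <string.h>

static int parse_options(char *data, char *compr_out, size_t len)
{
	char *p;

	while ((p = strsep(&data, ","))) {
		if (!*p)
			continue;
		if (!strncmp(p, "compr=", 6)) {
			snprintf(compr_out, len, "%s", p + 6);
			continue;
		}
		fprintf(stderr, "unrecognized mount option '%s'\n", p);
		return -1;
	}
	return 0;
}

int main(void)
{
	char opts[] = "compr=lzo";	/* as passed by: mount -o compr=lzo */
	char compr[16] = "default";

	if (parse_options(opts, compr, sizeof(compr)))
		return 1;
	printf("compressor override: %s\n", compr);
	return 0;
}
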
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4515bea0268..b09e51d2f81 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -578,8 +578,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
578 if (!jffs2_is_writebuffered(c)) 578 if (!jffs2_is_writebuffered(c))
579 return 0; 579 return 0;
580 580
581 if (mutex_trylock(&c->alloc_sem)) { 581 if (!mutex_is_locked(&c->alloc_sem)) {
582 mutex_unlock(&c->alloc_sem);
583 printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); 582 printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n");
584 BUG(); 583 BUG();
585 } 584 }
@@ -1026,7 +1025,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
1026 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1025 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1027 struct mtd_oob_ops ops; 1026 struct mtd_oob_ops ops;
1028 1027
1029 ops.mode = MTD_OOB_AUTO; 1028 ops.mode = MTD_OPS_AUTO_OOB;
1030 ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; 1029 ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
1031 ops.oobbuf = c->oobbuf; 1030 ops.oobbuf = c->oobbuf;
1032 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1031 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
@@ -1069,7 +1068,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
1069 struct mtd_oob_ops ops; 1068 struct mtd_oob_ops ops;
1070 int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1069 int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1071 1070
1072 ops.mode = MTD_OOB_AUTO; 1071 ops.mode = MTD_OPS_AUTO_OOB;
1073 ops.ooblen = cmlen; 1072 ops.ooblen = cmlen;
1074 ops.oobbuf = c->oobbuf; 1073 ops.oobbuf = c->oobbuf;
1075 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1074 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
@@ -1095,7 +1094,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
1095 struct mtd_oob_ops ops; 1094 struct mtd_oob_ops ops;
1096 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); 1095 int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
1097 1096
1098 ops.mode = MTD_OOB_AUTO; 1097 ops.mode = MTD_OPS_AUTO_OOB;
1099 ops.ooblen = cmlen; 1098 ops.ooblen = cmlen;
1100 ops.oobbuf = (uint8_t *)&oob_cleanmarker; 1099 ops.oobbuf = (uint8_t *)&oob_cleanmarker;
1101 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1100 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 583636f745e..cc5f811ed38 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -67,6 +67,7 @@
67#include <linux/buffer_head.h> /* for sync_blockdev() */ 67#include <linux/buffer_head.h> /* for sync_blockdev() */
68#include <linux/bio.h> 68#include <linux/bio.h>
69#include <linux/freezer.h> 69#include <linux/freezer.h>
70#include <linux/export.h>
70#include <linux/delay.h> 71#include <linux/delay.h>
71#include <linux/mutex.h> 72#include <linux/mutex.h>
72#include <linux/seq_file.h> 73#include <linux/seq_file.h>
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index f2697e4df10..e795c234ea3 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -13,6 +13,7 @@
13#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/module.h>
16#include <linux/mtd/mtd.h> 17#include <linux/mtd/mtd.h>
17#include <linux/statfs.h> 18#include <linux/statfs.h>
18#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 3f32bcb0d9b..ef175cb8cfd 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -16,38 +16,26 @@
16#include <linux/bitops.h> 16#include <linux/bitops.h>
17#include <linux/sched.h> 17#include <linux/sched.h>
18 18
19static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 };
20
21static DEFINE_SPINLOCK(bitmap_lock); 19static DEFINE_SPINLOCK(bitmap_lock);
22 20
23static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) 21/*
22 * bitmap consists of blocks filled with 16bit words
23 * bit set == busy, bit clear == free
24 * endianness is a mess, but for counting zero bits it really doesn't matter...
25 */
26static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits)
24{ 27{
25 unsigned i, j, sum = 0; 28 __u32 sum = 0;
26 struct buffer_head *bh; 29 unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8);
27
28 for (i=0; i<numblocks-1; i++) {
29 if (!(bh=map[i]))
30 return(0);
31 for (j=0; j<bh->b_size; j++)
32 sum += nibblemap[bh->b_data[j] & 0xf]
33 + nibblemap[(bh->b_data[j]>>4) & 0xf];
34 }
35 30
36 if (numblocks==0 || !(bh=map[numblocks-1])) 31 while (blocks--) {
37 return(0); 32 unsigned words = blocksize / 2;
38 i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2; 33 __u16 *p = (__u16 *)(*map++)->b_data;
39 for (j=0; j<i; j++) { 34 while (words--)
40 sum += nibblemap[bh->b_data[j] & 0xf] 35 sum += 16 - hweight16(*p++);
41 + nibblemap[(bh->b_data[j]>>4) & 0xf];
42 } 36 }
43 37
44 i = numbits%16; 38 return sum;
45 if (i!=0) {
46 i = *(__u16 *)(&bh->b_data[j]) | ~((1<<i) - 1);
47 sum += nibblemap[i & 0xf] + nibblemap[(i>>4) & 0xf];
48 sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf];
49 }
50 return(sum);
51} 39}
52 40
53void minix_free_block(struct inode *inode, unsigned long block) 41void minix_free_block(struct inode *inode, unsigned long block)
@@ -105,10 +93,12 @@ int minix_new_block(struct inode * inode)
105 return 0; 93 return 0;
106} 94}
107 95
108unsigned long minix_count_free_blocks(struct minix_sb_info *sbi) 96unsigned long minix_count_free_blocks(struct super_block *sb)
109{ 97{
110 return (count_free(sbi->s_zmap, sbi->s_zmap_blocks, 98 struct minix_sb_info *sbi = minix_sb(sb);
111 sbi->s_nzones - sbi->s_firstdatazone + 1) 99 u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1);
100
101 return (count_free(sbi->s_zmap, sb->s_blocksize, bits)
112 << sbi->s_log_zone_size); 102 << sbi->s_log_zone_size);
113} 103}
114 104
@@ -273,7 +263,10 @@ struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
273 return inode; 263 return inode;
274} 264}
275 265
276unsigned long minix_count_free_inodes(struct minix_sb_info *sbi) 266unsigned long minix_count_free_inodes(struct super_block *sb)
277{ 267{
278 return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1); 268 struct minix_sb_info *sbi = minix_sb(sb);
269 u32 bits = sbi->s_ninodes + 1;
270
271 return count_free(sbi->s_imap, sb->s_blocksize, bits);
279} 272}
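
The rewritten count_free() above treats each bitmap block as an array of 16-bit words and counts clear bits with hweight16(), with a set bit meaning busy and a clear bit meaning free. A minimal user-space sketch of that counting approach, using __builtin_popcount() in place of hweight16() and a flat buffer in place of buffer_head blocks (names and values here are illustrative, not taken from the kernel):

    #include <stdint.h>
    #include <stdio.h>

    /* bit set == busy, bit clear == free; whole blocks of 16-bit words */
    static uint32_t count_free_bits(const uint16_t *map, unsigned blocksize,
                                    uint32_t numbits)
    {
        uint32_t sum = 0;
        unsigned blocks = (numbits + blocksize * 8 - 1) / (blocksize * 8);

        while (blocks--) {
            unsigned words = blocksize / 2;

            while (words--)
                sum += 16 - __builtin_popcount(*map++);
        }
        return sum;
    }

    int main(void)
    {
        /* one 4-byte "block" == two words == 32 bits */
        uint16_t map[2] = { 0x0003, 0xffff };   /* 2 busy bits, then all busy */

        printf("%u free\n", count_free_bits(map, 4, 32));   /* prints 14 */
        return 0;
    }

Like the rewritten kernel function, the sketch rounds the block count up and counts every bit in those blocks rather than masking off the tail.
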
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 64cdcd662ff..1d9e33966db 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -279,6 +279,27 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
279 else if (sbi->s_mount_state & MINIX_ERROR_FS) 279 else if (sbi->s_mount_state & MINIX_ERROR_FS)
280 printk("MINIX-fs: mounting file system with errors, " 280 printk("MINIX-fs: mounting file system with errors, "
281 "running fsck is recommended\n"); 281 "running fsck is recommended\n");
282
283 /* Apparently minix can create filesystems that allocate more blocks for
284 * the bitmaps than needed. We simply ignore that, but verify it didn't
285 * create one with not enough blocks and bail out if so.
286 */
287 block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize);
288 if (sbi->s_imap_blocks < block) {
289 printk("MINIX-fs: file system does not have enough "
290 "imap blocks allocated. Refusing to mount\n");
291 goto out_iput;
292 }
293
294 block = minix_blocks_needed(
295 (sbi->s_nzones - (sbi->s_firstdatazone + 1)),
296 s->s_blocksize);
297 if (sbi->s_zmap_blocks < block) {
298 printk("MINIX-fs: file system does not have enough "
299 "zmap blocks allocated. Refusing to mount.\n");
300 goto out_iput;
301 }
302
282 return 0; 303 return 0;
283 304
284out_iput: 305out_iput:
@@ -339,10 +360,10 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
339 buf->f_type = sb->s_magic; 360 buf->f_type = sb->s_magic;
340 buf->f_bsize = sb->s_blocksize; 361 buf->f_bsize = sb->s_blocksize;
341 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; 362 buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
342 buf->f_bfree = minix_count_free_blocks(sbi); 363 buf->f_bfree = minix_count_free_blocks(sb);
343 buf->f_bavail = buf->f_bfree; 364 buf->f_bavail = buf->f_bfree;
344 buf->f_files = sbi->s_ninodes; 365 buf->f_files = sbi->s_ninodes;
345 buf->f_ffree = minix_count_free_inodes(sbi); 366 buf->f_ffree = minix_count_free_inodes(sb);
346 buf->f_namelen = sbi->s_namelen; 367 buf->f_namelen = sbi->s_namelen;
347 buf->f_fsid.val[0] = (u32)id; 368 buf->f_fsid.val[0] = (u32)id;
348 buf->f_fsid.val[1] = (u32)(id >> 32); 369 buf->f_fsid.val[1] = (u32)(id >> 32);
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 341e2122879..26bbd55e82e 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -48,10 +48,10 @@ extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, stru
48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); 48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
49extern struct inode * minix_new_inode(const struct inode *, int, int *); 49extern struct inode * minix_new_inode(const struct inode *, int, int *);
50extern void minix_free_inode(struct inode * inode); 50extern void minix_free_inode(struct inode * inode);
51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); 51extern unsigned long minix_count_free_inodes(struct super_block *sb);
52extern int minix_new_block(struct inode * inode); 52extern int minix_new_block(struct inode * inode);
53extern void minix_free_block(struct inode *inode, unsigned long block); 53extern void minix_free_block(struct inode *inode, unsigned long block);
54extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); 54extern unsigned long minix_count_free_blocks(struct super_block *sb);
55extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); 55extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *);
56extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); 56extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
57 57
@@ -88,6 +88,11 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
88 return list_entry(inode, struct minix_inode_info, vfs_inode); 88 return list_entry(inode, struct minix_inode_info, vfs_inode);
89} 89}
90 90
91static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize)
92{
93 return DIV_ROUND_UP(bits, blocksize * 8);
94}
95
91#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ 96#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \
92 defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) 97 defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
93 98
@@ -125,7 +130,7 @@ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size)
125 if (!size) 130 if (!size)
126 return 0; 131 return 0;
127 132
128 size = (size >> 4) + ((size & 15) > 0); 133 size >>= 4;
129 while (*p++ == 0xffff) { 134 while (*p++ == 0xffff) {
130 if (--size == 0) 135 if (--size == 0)
131 return (p - addr) << 4; 136 return (p - addr) << 4;
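
The new mount-time checks in fs/minix/inode.c above use minix_blocks_needed(), which is simply a ceiling division of a bit count by the number of bits per block. A small user-space sketch of that arithmetic and of the "refuse to mount when the on-disk bitmap is too small" check; the superblock numbers below are made up for illustration:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    static unsigned blocks_needed(unsigned bits, unsigned blocksize)
    {
        return DIV_ROUND_UP(bits, blocksize * 8);   /* 8 bits per byte */
    }

    int main(void)
    {
        unsigned blocksize   = 1024;    /* bytes per block */
        unsigned ninodes     = 65535;   /* made-up superblock value */
        unsigned imap_blocks = 7;       /* made-up on-disk bitmap size */
        unsigned need = blocks_needed(ninodes, blocksize);

        if (imap_blocks < need)
            printf("refusing to mount: have %u imap blocks, need %u\n",
                   imap_blocks, need);
        else
            printf("imap ok: have %u, need %u\n", imap_blocks, need);
        return 0;
    }
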
diff --git a/fs/namei.c b/fs/namei.c
index ac6d214da82..5008f01787f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -852,7 +852,7 @@ static int follow_managed(struct path *path, unsigned flags)
852 mntput(path->mnt); 852 mntput(path->mnt);
853 if (ret == -EISDIR) 853 if (ret == -EISDIR)
854 ret = 0; 854 ret = 0;
855 return ret; 855 return ret < 0 ? ret : need_mntput;
856} 856}
857 857
858int follow_down_one(struct path *path) 858int follow_down_one(struct path *path)
@@ -900,6 +900,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
900 break; 900 break;
901 path->mnt = mounted; 901 path->mnt = mounted;
902 path->dentry = mounted->mnt_root; 902 path->dentry = mounted->mnt_root;
903 nd->flags |= LOOKUP_JUMPED;
903 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 904 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
904 /* 905 /*
905 * Update the inode too. We don't need to re-check the 906 * Update the inode too. We don't need to re-check the
@@ -1213,6 +1214,8 @@ retry:
1213 path_put_conditional(path, nd); 1214 path_put_conditional(path, nd);
1214 return err; 1215 return err;
1215 } 1216 }
1217 if (err)
1218 nd->flags |= LOOKUP_JUMPED;
1216 *inode = path->dentry->d_inode; 1219 *inode = path->dentry->d_inode;
1217 return 0; 1220 return 0;
1218} 1221}
@@ -2146,6 +2149,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2146 } 2149 }
2147 2150
2148 /* create side of things */ 2151 /* create side of things */
2152 /*
2153 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been
2154 * cleared when we got to the last component we are about to look up
2155 */
2149 error = complete_walk(nd); 2156 error = complete_walk(nd);
2150 if (error) 2157 if (error)
2151 return ERR_PTR(error); 2158 return ERR_PTR(error);
@@ -2214,6 +2221,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2214 if (error < 0) 2221 if (error < 0)
2215 goto exit_dput; 2222 goto exit_dput;
2216 2223
2224 if (error)
2225 nd->flags |= LOOKUP_JUMPED;
2226
2217 error = -ENOENT; 2227 error = -ENOENT;
2218 if (!path->dentry->d_inode) 2228 if (!path->dentry->d_inode)
2219 goto exit_dput; 2229 goto exit_dput;
@@ -2223,6 +2233,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2223 2233
2224 path_to_nameidata(path, nd); 2234 path_to_nameidata(path, nd);
2225 nd->inode = path->dentry->d_inode; 2235 nd->inode = path->dentry->d_inode;
2236 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
2237 error = complete_walk(nd);
2238 if (error)
2239 goto exit;
2226 error = -EISDIR; 2240 error = -EISDIR;
2227 if (S_ISDIR(nd->inode->i_mode)) 2241 if (S_ISDIR(nd->inode->i_mode))
2228 goto exit; 2242 goto exit;
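
The follow_managed() change above packs two results into one return value: a negative errno on failure, otherwise zero or a positive flag telling the caller whether it now holds an extra vfsmount reference to drop. A toy user-space sketch of that calling convention (names invented for illustration only):

    #include <errno.h>
    #include <stdio.h>

    /* <0: error, 0: nothing to drop, >0: caller must release one extra ref */
    static int traverse(int fail, int took_ref)
    {
        if (fail)
            return -ENOENT;
        return took_ref;
    }

    int main(void)
    {
        int ret = traverse(0, 1);

        if (ret < 0) {
            printf("error %d\n", ret);
            return 1;
        }
        if (ret > 0)
            printf("dropping the extra reference\n");
        printf("lookup continues\n");
        return 0;
    }

The `ret < 0 ? ret : need_mntput` form keeps the error path and the success flag from shadowing each other.
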
diff --git a/fs/namespace.c b/fs/namespace.c
index e5e1c7d1839..cfc6d4448aa 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1048,15 +1048,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
1048 if (err) 1048 if (err)
1049 goto out; 1049 goto out;
1050 seq_putc(m, ' '); 1050 seq_putc(m, ' ');
1051 seq_path_root(m, &mnt_path, &root, " \t\n\\"); 1051
1052 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { 1052 /* mountpoints outside of chroot jail will give SEQ_SKIP on this */
1053 /* 1053 err = seq_path_root(m, &mnt_path, &root, " \t\n\\");
1054 * Mountpoint is outside root, discard that one. Ugly, 1054 if (err)
1055 * but less so than trying to do that in iterator in a 1055 goto out;
1056 * race-free way (due to renames). 1056
1057 */
1058 return SEQ_SKIP;
1059 }
1060 seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); 1057 seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
1061 show_mnt_opts(m, mnt); 1058 show_mnt_opts(m, mnt);
1062 1059
@@ -2483,11 +2480,43 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
2483 __mnt_make_longterm(mnt); 2480 __mnt_make_longterm(mnt);
2484 new_ns->root = mnt; 2481 new_ns->root = mnt;
2485 list_add(&new_ns->list, &new_ns->root->mnt_list); 2482 list_add(&new_ns->list, &new_ns->root->mnt_list);
2483 } else {
2484 mntput(mnt);
2486 } 2485 }
2487 return new_ns; 2486 return new_ns;
2488} 2487}
2489EXPORT_SYMBOL(create_mnt_ns); 2488EXPORT_SYMBOL(create_mnt_ns);
2490 2489
2490struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
2491{
2492 struct mnt_namespace *ns;
2493 struct super_block *s;
2494 struct path path;
2495 int err;
2496
2497 ns = create_mnt_ns(mnt);
2498 if (IS_ERR(ns))
2499 return ERR_CAST(ns);
2500
2501 err = vfs_path_lookup(mnt->mnt_root, mnt,
2502 name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
2503
2504 put_mnt_ns(ns);
2505
2506 if (err)
2507 return ERR_PTR(err);
2508
2509 /* trade a vfsmount reference for active sb one */
2510 s = path.mnt->mnt_sb;
2511 atomic_inc(&s->s_active);
2512 mntput(path.mnt);
2513 /* lock the sucker */
2514 down_write(&s->s_umount);
2515 /* ... and return the root of (sub)tree on it */
2516 return path.dentry;
2517}
2518EXPORT_SYMBOL(mount_subtree);
2519
2491SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, 2520SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2492 char __user *, type, unsigned long, flags, void __user *, data) 2521 char __user *, type, unsigned long, flags, void __user *, data)
2493{ 2522{
@@ -2744,3 +2773,8 @@ void kern_unmount(struct vfsmount *mnt)
2744 } 2773 }
2745} 2774}
2746EXPORT_SYMBOL(kern_unmount); 2775EXPORT_SYMBOL(kern_unmount);
2776
2777bool our_mnt(struct vfsmount *mnt)
2778{
2779 return check_mnt(mnt);
2780}
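
The new mount_subtree() above pins its result by taking an s_active reference on the superblock before dropping the vfsmount reference it was handed, so the returned dentry stays valid after the temporary namespace is torn down. A user-space analog of that take-the-new-reference-before-dropping-the-old ordering, with plain counters standing in for s_active and the mount refcount (purely illustrative, not kernel code):

    #include <stdio.h>

    struct sb  { int s_active; };
    struct mnt { int mnt_count; struct sb *sb; };

    static void grab_sb(struct sb *sb)   { sb->s_active++; }
    static void put_mnt(struct mnt *mnt) { mnt->mnt_count--; }

    /* Pin the superblock by s_active instead of by the mount reference. */
    static struct sb *trade_ref(struct mnt *mnt)
    {
        struct sb *sb = mnt->sb;

        grab_sb(sb);    /* take the longer-lived reference first ... */
        put_mnt(mnt);   /* ... only then drop the one we were given */
        return sb;
    }

    int main(void)
    {
        struct sb  sb  = { .s_active = 1 };
        struct mnt mnt = { .mnt_count = 1, .sb = &sb };
        struct sb *pinned = trade_ref(&mnt);

        printf("s_active=%d mnt_count=%d\n", pinned->s_active, mnt.mnt_count);
        return 0;
    }
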
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 918ad647afe..726e59a9e50 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
488 struct xdr_stream *xdr, 488 struct xdr_stream *xdr,
489 struct cb_recallanyargs *args) 489 struct cb_recallanyargs *args)
490{ 490{
491 __be32 *p; 491 uint32_t bitmap[2];
492 __be32 *p, status;
492 493
493 args->craa_addr = svc_addr(rqstp); 494 args->craa_addr = svc_addr(rqstp);
494 p = read_buf(xdr, 4); 495 p = read_buf(xdr, 4);
495 if (unlikely(p == NULL)) 496 if (unlikely(p == NULL))
496 return htonl(NFS4ERR_BADXDR); 497 return htonl(NFS4ERR_BADXDR);
497 args->craa_objs_to_keep = ntohl(*p++); 498 args->craa_objs_to_keep = ntohl(*p++);
498 p = read_buf(xdr, 4); 499 status = decode_bitmap(xdr, bitmap);
499 if (unlikely(p == NULL)) 500 if (unlikely(status))
500 return htonl(NFS4ERR_BADXDR); 501 return status;
501 args->craa_type_mask = ntohl(*p); 502 args->craa_type_mask = bitmap[0];
502 503
503 return 0; 504 return 0;
504} 505}
@@ -986,4 +987,5 @@ struct svc_version nfs4_callback_version4 = {
986 .vs_proc = nfs4_callback_procedures1, 987 .vs_proc = nfs4_callback_procedures1,
987 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, 988 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
988 .vs_dispatch = NULL, 989 .vs_dispatch = NULL,
990 .vs_hidden = 1,
989}; 991};
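
decode_recallany_args() above now parses the type mask as a proper NFSv4 bitmap (a 32-bit word count followed by that many words) instead of assuming a single word. A rough user-space sketch of that wire shape, keeping the first two words and skipping any extras; this is only an illustration, not the kernel's decode_bitmap():

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>   /* ntohl/htonl */

    /* bitmap4: a word count followed by that many big-endian 32-bit words */
    static int decode_bitmap2(const uint32_t **pp, const uint32_t *end,
                              uint32_t bitmap[2])
    {
        const uint32_t *p = *pp;
        uint32_t count;

        if (p >= end)
            return -1;
        count = ntohl(*p++);
        if ((size_t)(end - p) < count)
            return -1;

        bitmap[0] = count >= 1 ? ntohl(p[0]) : 0;
        bitmap[1] = count >= 2 ? ntohl(p[1]) : 0;
        *pp = p + count;   /* skip any words we do not understand */
        return 0;
    }

    int main(void)
    {
        uint32_t buf[] = { htonl(2), htonl(0x0010), htonl(0x0200) };
        const uint32_t *p = buf;
        uint32_t bitmap[2];

        if (decode_bitmap2(&p, buf + 3, bitmap) == 0)
            printf("mask0=0x%x mask1=0x%x\n", bitmap[0], bitmap[1]);
        return 0;
    }
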
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b238d95ac48..ac289909814 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1468,12 +1468,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1468 res = NULL; 1468 res = NULL;
1469 goto out; 1469 goto out;
1470 /* This turned out not to be a regular file */ 1470 /* This turned out not to be a regular file */
1471 case -EISDIR:
1471 case -ENOTDIR: 1472 case -ENOTDIR:
1472 goto no_open; 1473 goto no_open;
1473 case -ELOOP: 1474 case -ELOOP:
1474 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1475 if (!(nd->intent.open.flags & O_NOFOLLOW))
1475 goto no_open; 1476 goto no_open;
1476 /* case -EISDIR: */
1477 /* case -EINVAL: */ 1477 /* case -EINVAL: */
1478 default: 1478 default:
1479 res = ERR_CAST(inode); 1479 res = ERR_CAST(inode);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 91c01f0a4c3..eca56d4b39c 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -40,48 +40,8 @@
40 40
41#define NFSDBG_FACILITY NFSDBG_FILE 41#define NFSDBG_FACILITY NFSDBG_FILE
42 42
43static int nfs_file_open(struct inode *, struct file *);
44static int nfs_file_release(struct inode *, struct file *);
45static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin);
46static int nfs_file_mmap(struct file *, struct vm_area_struct *);
47static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos,
48 struct pipe_inode_info *pipe,
49 size_t count, unsigned int flags);
50static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
51 unsigned long nr_segs, loff_t pos);
52static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
53 struct file *filp, loff_t *ppos,
54 size_t count, unsigned int flags);
55static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
56 unsigned long nr_segs, loff_t pos);
57static int nfs_file_flush(struct file *, fl_owner_t id);
58static int nfs_file_fsync(struct file *, loff_t, loff_t, int datasync);
59static int nfs_check_flags(int flags);
60static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
61static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
62static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
63
64static const struct vm_operations_struct nfs_file_vm_ops; 43static const struct vm_operations_struct nfs_file_vm_ops;
65 44
66const struct file_operations nfs_file_operations = {
67 .llseek = nfs_file_llseek,
68 .read = do_sync_read,
69 .write = do_sync_write,
70 .aio_read = nfs_file_read,
71 .aio_write = nfs_file_write,
72 .mmap = nfs_file_mmap,
73 .open = nfs_file_open,
74 .flush = nfs_file_flush,
75 .release = nfs_file_release,
76 .fsync = nfs_file_fsync,
77 .lock = nfs_lock,
78 .flock = nfs_flock,
79 .splice_read = nfs_file_splice_read,
80 .splice_write = nfs_file_splice_write,
81 .check_flags = nfs_check_flags,
82 .setlease = nfs_setlease,
83};
84
85const struct inode_operations nfs_file_inode_operations = { 45const struct inode_operations nfs_file_inode_operations = {
86 .permission = nfs_permission, 46 .permission = nfs_permission,
87 .getattr = nfs_getattr, 47 .getattr = nfs_getattr,
@@ -137,11 +97,9 @@ nfs_file_open(struct inode *inode, struct file *filp)
137static int 97static int
138nfs_file_release(struct inode *inode, struct file *filp) 98nfs_file_release(struct inode *inode, struct file *filp)
139{ 99{
140 struct dentry *dentry = filp->f_path.dentry;
141
142 dprintk("NFS: release(%s/%s)\n", 100 dprintk("NFS: release(%s/%s)\n",
143 dentry->d_parent->d_name.name, 101 filp->f_path.dentry->d_parent->d_name.name,
144 dentry->d_name.name); 102 filp->f_path.dentry->d_name.name);
145 103
146 nfs_inc_stats(inode, NFSIOS_VFSRELEASE); 104 nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
147 return nfs_release(inode, filp); 105 return nfs_release(inode, filp);
@@ -228,14 +186,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
228 struct dentry * dentry = iocb->ki_filp->f_path.dentry; 186 struct dentry * dentry = iocb->ki_filp->f_path.dentry;
229 struct inode * inode = dentry->d_inode; 187 struct inode * inode = dentry->d_inode;
230 ssize_t result; 188 ssize_t result;
231 size_t count = iov_length(iov, nr_segs);
232 189
233 if (iocb->ki_filp->f_flags & O_DIRECT) 190 if (iocb->ki_filp->f_flags & O_DIRECT)
234 return nfs_file_direct_read(iocb, iov, nr_segs, pos); 191 return nfs_file_direct_read(iocb, iov, nr_segs, pos);
235 192
236 dprintk("NFS: read(%s/%s, %lu@%lu)\n", 193 dprintk("NFS: read(%s/%s, %lu@%lu)\n",
237 dentry->d_parent->d_name.name, dentry->d_name.name, 194 dentry->d_parent->d_name.name, dentry->d_name.name,
238 (unsigned long) count, (unsigned long) pos); 195 (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
239 196
240 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 197 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
241 if (!result) { 198 if (!result) {
@@ -889,3 +846,54 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
889 file->f_path.dentry->d_name.name, arg); 846 file->f_path.dentry->d_name.name, arg);
890 return -EINVAL; 847 return -EINVAL;
891} 848}
849
850const struct file_operations nfs_file_operations = {
851 .llseek = nfs_file_llseek,
852 .read = do_sync_read,
853 .write = do_sync_write,
854 .aio_read = nfs_file_read,
855 .aio_write = nfs_file_write,
856 .mmap = nfs_file_mmap,
857 .open = nfs_file_open,
858 .flush = nfs_file_flush,
859 .release = nfs_file_release,
860 .fsync = nfs_file_fsync,
861 .lock = nfs_lock,
862 .flock = nfs_flock,
863 .splice_read = nfs_file_splice_read,
864 .splice_write = nfs_file_splice_write,
865 .check_flags = nfs_check_flags,
866 .setlease = nfs_setlease,
867};
868
869#ifdef CONFIG_NFS_V4
870static int
871nfs4_file_open(struct inode *inode, struct file *filp)
872{
873 /*
874 * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to
875 * this point, then something is very wrong
876 */
877 dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp);
878 return -ENOTDIR;
879}
880
881const struct file_operations nfs4_file_operations = {
882 .llseek = nfs_file_llseek,
883 .read = do_sync_read,
884 .write = do_sync_write,
885 .aio_read = nfs_file_read,
886 .aio_write = nfs_file_write,
887 .mmap = nfs_file_mmap,
888 .open = nfs4_file_open,
889 .flush = nfs_file_flush,
890 .release = nfs_file_release,
891 .fsync = nfs_file_fsync,
892 .lock = nfs_lock,
893 .flock = nfs_flock,
894 .splice_read = nfs_file_splice_read,
895 .splice_write = nfs_file_splice_write,
896 .check_flags = nfs_check_flags,
897 .setlease = nfs_setlease,
898};
899#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c07a55aec83..50a15fa8cf9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
291 */ 291 */
292 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; 292 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;
293 if (S_ISREG(inode->i_mode)) { 293 if (S_ISREG(inode->i_mode)) {
294 inode->i_fop = &nfs_file_operations; 294 inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
295 inode->i_data.a_ops = &nfs_file_aops; 295 inode->i_data.a_ops = &nfs_file_aops;
296 inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; 296 inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
297 } else if (S_ISDIR(inode->i_mode)) { 297 } else if (S_ISDIR(inode->i_mode)) {
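
With the hunk above, nfs_fhget() no longer hard-codes nfs_file_operations; the table comes from rpc_ops->file_ops, which is what lets fs/nfs/file.c (earlier in this diff) define a separate nfs4_file_operations with its own open routine. A tiny user-space sketch of selecting an operations table through a per-version ops structure; the types and names are stand-ins, not the kernel's:

    #include <stdio.h>

    struct file_ops    { int (*open)(void); };
    struct version_ops { const struct file_ops *file_ops; };

    static int v3_open(void) { puts("regular v3 open"); return 0; }
    static int v4_open(void) { puts("v4 open: handled earlier in lookup"); return -1; }

    static const struct file_ops v3_fops = { .open = v3_open };
    static const struct file_ops v4_fops = { .open = v4_open };

    static const struct version_ops v3_clientops = { .file_ops = &v3_fops };
    static const struct version_ops v4_clientops = { .file_ops = &v4_fops };

    int main(void)
    {
        int version = 4;   /* decided once, at "mount" time */
        const struct version_ops *rpc_ops =
                (version == 4) ? &v4_clientops : &v3_clientops;

        /* analogous to: inode->i_fop = ...->rpc_ops->file_ops */
        const struct file_ops *i_fop = rpc_ops->file_ops;

        i_fop->open();
        return 0;
    }
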
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c1a1bd8ddf1..3f4d95751d5 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -299,6 +299,8 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
299extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, 299extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
300 struct list_head *head); 300 struct list_head *head);
301 301
302extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
303 struct inode *inode);
302extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); 304extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
303extern void nfs_readdata_release(struct nfs_read_data *rdata); 305extern void nfs_readdata_release(struct nfs_read_data *rdata);
304 306
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 85f1690ca08..d4bc9ed9174 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -853,6 +853,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
853 .dentry_ops = &nfs_dentry_operations, 853 .dentry_ops = &nfs_dentry_operations,
854 .dir_inode_ops = &nfs3_dir_inode_operations, 854 .dir_inode_ops = &nfs3_dir_inode_operations,
855 .file_inode_ops = &nfs3_file_inode_operations, 855 .file_inode_ops = &nfs3_file_inode_operations,
856 .file_ops = &nfs_file_operations,
856 .getroot = nfs3_proc_get_root, 857 .getroot = nfs3_proc_get_root,
857 .getattr = nfs3_proc_getattr, 858 .getattr = nfs3_proc_getattr,
858 .setattr = nfs3_proc_setattr, 859 .setattr = nfs3_proc_setattr,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 09119418402..a62d36b9a99 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -31,6 +31,7 @@
31 31
32#include <linux/nfs_fs.h> 32#include <linux/nfs_fs.h>
33#include <linux/nfs_page.h> 33#include <linux/nfs_page.h>
34#include <linux/module.h>
34 35
35#include "internal.h" 36#include "internal.h"
36#include "nfs4filelayout.h" 37#include "nfs4filelayout.h"
@@ -449,9 +450,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
449 450
450 fl->dsaddr = dsaddr; 451 fl->dsaddr = dsaddr;
451 452
452 if (fl->first_stripe_index < 0 || 453 if (fl->first_stripe_index >= dsaddr->stripe_count) {
453 fl->first_stripe_index >= dsaddr->stripe_count) { 454 dprintk("%s Bad first_stripe_index %u\n",
454 dprintk("%s Bad first_stripe_index %d\n",
455 __func__, fl->first_stripe_index); 455 __func__, fl->first_stripe_index);
456 goto out_put; 456 goto out_put;
457 } 457 }
@@ -552,7 +552,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
552 552
553 /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. 553 /* Note that a zero value for num_fh is legal for STRIPE_SPARSE.
554 * Further checking is done in filelayout_check_layout */ 554 * Further checking is done in filelayout_check_layout */
555 if (fl->num_fh < 0 || fl->num_fh > 555 if (fl->num_fh >
556 max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) 556 max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT))
557 goto out_err; 557 goto out_err;
558 558
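
Both hunks above drop "< 0" tests on fields that are unsigned (first_stripe_index and num_fh), because such comparisons can never be true and only draw compiler warnings; the dprintk format switches from %d to %u for the same reason. A two-line illustration:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t first_stripe_index = (uint32_t)-5;   /* wraps to a huge value */

        if (first_stripe_index < 0)            /* always false for unsigned */
            puts("never printed");
        printf("%u\n", first_stripe_index);    /* 4294967291 */
        return 0;
    }
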
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d2ae413c986..be2bbac1381 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2464,8 +2464,7 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst
2464 case -NFS4ERR_BADNAME: 2464 case -NFS4ERR_BADNAME:
2465 return -ENOENT; 2465 return -ENOENT;
2466 case -NFS4ERR_MOVED: 2466 case -NFS4ERR_MOVED:
2467 err = nfs4_get_referral(dir, name, fattr, fhandle); 2467 return nfs4_get_referral(dir, name, fattr, fhandle);
2468 break;
2469 case -NFS4ERR_WRONGSEC: 2468 case -NFS4ERR_WRONGSEC:
2470 nfs_fixup_secinfo_attributes(fattr, fhandle); 2469 nfs_fixup_secinfo_attributes(fattr, fhandle);
2471 } 2470 }
@@ -5950,6 +5949,7 @@ static void nfs4_layoutcommit_release(void *calldata)
5950{ 5949{
5951 struct nfs4_layoutcommit_data *data = calldata; 5950 struct nfs4_layoutcommit_data *data = calldata;
5952 struct pnfs_layout_segment *lseg, *tmp; 5951 struct pnfs_layout_segment *lseg, *tmp;
5952 unsigned long *bitlock = &NFS_I(data->args.inode)->flags;
5953 5953
5954 pnfs_cleanup_layoutcommit(data); 5954 pnfs_cleanup_layoutcommit(data);
5955 /* Matched by references in pnfs_set_layoutcommit */ 5955 /* Matched by references in pnfs_set_layoutcommit */
@@ -5959,6 +5959,11 @@ static void nfs4_layoutcommit_release(void *calldata)
5959 &lseg->pls_flags)) 5959 &lseg->pls_flags))
5960 put_lseg(lseg); 5960 put_lseg(lseg);
5961 } 5961 }
5962
5963 clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
5964 smp_mb__after_clear_bit();
5965 wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
5966
5962 put_rpccred(data->cred); 5967 put_rpccred(data->cred);
5963 kfree(data); 5968 kfree(data);
5964} 5969}
@@ -6247,6 +6252,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
6247 .dentry_ops = &nfs4_dentry_operations, 6252 .dentry_ops = &nfs4_dentry_operations,
6248 .dir_inode_ops = &nfs4_dir_inode_operations, 6253 .dir_inode_ops = &nfs4_dir_inode_operations,
6249 .file_inode_ops = &nfs4_file_inode_operations, 6254 .file_inode_ops = &nfs4_file_inode_operations,
6255 .file_ops = &nfs4_file_operations,
6250 .getroot = nfs4_proc_get_root, 6256 .getroot = nfs4_proc_get_root,
6251 .getattr = nfs4_proc_getattr, 6257 .getattr = nfs4_proc_getattr,
6252 .setattr = nfs4_proc_setattr, 6258 .setattr = nfs4_proc_setattr,
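
nfs4_layoutcommit_release() above clears NFS_INO_LAYOUTCOMMITTING with clear_bit_unlock(), adds the barrier the waker side needs, and then wakes anything sleeping on that bit. A rough user-space analog of the clear-the-flag-then-wake-the-waiters pattern, using a condition variable in place of the kernel's bit-wait API (purely illustrative, not how the kernel implements it; build with -pthread):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool committing = true;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;

    static void *waiter(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        while (atomic_load(&committing))
            pthread_cond_wait(&done, &lock);
        pthread_mutex_unlock(&lock);
        puts("layoutcommit finished");
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, waiter, NULL);

        /* ... the commit work itself would happen here ... */

        pthread_mutex_lock(&lock);
        atomic_store(&committing, false);   /* clear the flag ... */
        pthread_cond_broadcast(&done);      /* ... then wake the waiters */
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        return 0;
    }
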
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1dce12f41a4..e6161b213ed 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6602,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp,
6602 if (status) 6602 if (status)
6603 goto out; 6603 goto out;
6604 status = decode_secinfo(xdr, res); 6604 status = decode_secinfo(xdr, res);
6605 if (status)
6606 goto out;
6607out: 6605out:
6608 return status; 6606 return status;
6609} 6607}
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index d0cda12fddc..c807ab93140 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -38,21 +38,15 @@
38 */ 38 */
39 39
40#include <linux/module.h> 40#include <linux/module.h>
41#include <scsi/osd_initiator.h> 41#include <scsi/osd_ore.h>
42 42
43#include "objlayout.h" 43#include "objlayout.h"
44 44
45#define NFSDBG_FACILITY NFSDBG_PNFS_LD 45#define NFSDBG_FACILITY NFSDBG_PNFS_LD
46 46
47#define _LLU(x) ((unsigned long long)x)
48
49enum { BIO_MAX_PAGES_KMALLOC =
50 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
51};
52
53struct objio_dev_ent { 47struct objio_dev_ent {
54 struct nfs4_deviceid_node id_node; 48 struct nfs4_deviceid_node id_node;
55 struct osd_dev *od; 49 struct ore_dev od;
56}; 50};
57 51
58static void 52static void
@@ -60,8 +54,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60{ 54{
61 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); 55 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
62 56
63 dprintk("%s: free od=%p\n", __func__, de->od); 57 dprintk("%s: free od=%p\n", __func__, de->od.od);
64 osduld_put_device(de->od); 58 osduld_put_device(de->od.od);
65 kfree(de); 59 kfree(de);
66} 60}
67 61
@@ -98,12 +92,12 @@ _dev_list_add(const struct nfs_server *nfss,
98 nfss->pnfs_curr_ld, 92 nfss->pnfs_curr_ld,
99 nfss->nfs_client, 93 nfss->nfs_client,
100 d_id); 94 d_id);
101 de->od = od; 95 de->od.od = od;
102 96
103 d = nfs4_insert_deviceid_node(&de->id_node); 97 d = nfs4_insert_deviceid_node(&de->id_node);
104 n = container_of(d, struct objio_dev_ent, id_node); 98 n = container_of(d, struct objio_dev_ent, id_node);
105 if (n != de) { 99 if (n != de) {
106 dprintk("%s: Race with other n->od=%p\n", __func__, n->od); 100 dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
107 objio_free_deviceid_node(&de->id_node); 101 objio_free_deviceid_node(&de->id_node);
108 de = n; 102 de = n;
109 } 103 }
@@ -111,28 +105,11 @@ _dev_list_add(const struct nfs_server *nfss,
111 return de; 105 return de;
112} 106}
113 107
114struct caps_buffers {
115 u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
116 u8 creds[OSD_CAP_LEN];
117};
118
119struct objio_segment { 108struct objio_segment {
120 struct pnfs_layout_segment lseg; 109 struct pnfs_layout_segment lseg;
121 110
122 struct pnfs_osd_object_cred *comps; 111 struct ore_layout layout;
123 112 struct ore_components oc;
124 unsigned mirrors_p1;
125 unsigned stripe_unit;
126 unsigned group_width; /* Data stripe_units without integrity comps */
127 u64 group_depth;
128 unsigned group_count;
129
130 unsigned max_io_size;
131
132 unsigned comps_index;
133 unsigned num_comps;
134 /* variable length */
135 struct objio_dev_ent *ods[];
136}; 113};
137 114
138static inline struct objio_segment * 115static inline struct objio_segment *
@@ -141,59 +118,44 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg)
141 return container_of(lseg, struct objio_segment, lseg); 118 return container_of(lseg, struct objio_segment, lseg);
142} 119}
143 120
144struct objio_state;
145typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
146
147struct objio_state { 121struct objio_state {
148 /* Generic layer */ 122 /* Generic layer */
149 struct objlayout_io_state ol_state; 123 struct objlayout_io_res oir;
150 124
151 struct objio_segment *layout; 125 bool sync;
152 126 /*FIXME: Support for extra_bytes at ore_get_rw_state() */
153 struct kref kref; 127 struct ore_io_state *ios;
154 objio_done_fn done;
155 void *private;
156
157 unsigned long length;
158 unsigned numdevs; /* Actually used devs in this IO */
159 /* A per-device variable array of size numdevs */
160 struct _objio_per_comp {
161 struct bio *bio;
162 struct osd_request *or;
163 unsigned long length;
164 u64 offset;
165 unsigned dev;
166 } per_dev[];
167}; 128};
168 129
169/* Send and wait for a get_device_info of devices in the layout, 130/* Send and wait for a get_device_info of devices in the layout,
170 then look them up with the osd_initiator library */ 131 then look them up with the osd_initiator library */
171static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, 132static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
172 struct objio_segment *objio_seg, unsigned comp, 133 struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
173 gfp_t gfp_flags) 134 gfp_t gfp_flags)
174{ 135{
175 struct pnfs_osd_deviceaddr *deviceaddr; 136 struct pnfs_osd_deviceaddr *deviceaddr;
176 struct nfs4_deviceid *d_id;
177 struct objio_dev_ent *ode; 137 struct objio_dev_ent *ode;
178 struct osd_dev *od; 138 struct osd_dev *od;
179 struct osd_dev_info odi; 139 struct osd_dev_info odi;
180 int err; 140 int err;
181 141
182 d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
183
184 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); 142 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
185 if (ode) 143 if (ode) {
186 return ode; 144 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
145 return 0;
146 }
187 147
188 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); 148 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
189 if (unlikely(err)) { 149 if (unlikely(err)) {
190 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", 150 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
191 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); 151 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
192 return ERR_PTR(err); 152 return err;
193 } 153 }
194 154
195 odi.systemid_len = deviceaddr->oda_systemid.len; 155 odi.systemid_len = deviceaddr->oda_systemid.len;
196 if (odi.systemid_len > sizeof(odi.systemid)) { 156 if (odi.systemid_len > sizeof(odi.systemid)) {
157 dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n",
158 __func__, sizeof(odi.systemid));
197 err = -EINVAL; 159 err = -EINVAL;
198 goto out; 160 goto out;
199 } else if (odi.systemid_len) 161 } else if (odi.systemid_len)
@@ -218,96 +180,53 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
218 180
219 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, 181 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
220 gfp_flags); 182 gfp_flags);
221 183 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
184 dprintk("Adding new dev_id(%llx:%llx)\n",
185 _DEVID_LO(d_id), _DEVID_HI(d_id));
222out: 186out:
223 dprintk("%s: return=%d\n", __func__, err);
224 objlayout_put_deviceinfo(deviceaddr); 187 objlayout_put_deviceinfo(deviceaddr);
225 return err ? ERR_PTR(err) : ode; 188 return err;
226} 189}
227 190
228static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, 191static void copy_single_comp(struct ore_components *oc, unsigned c,
229 struct objio_segment *objio_seg, 192 struct pnfs_osd_object_cred *src_comp)
230 gfp_t gfp_flags)
231{ 193{
232 unsigned i; 194 struct ore_comp *ocomp = &oc->comps[c];
233 int err;
234 195
235 /* lookup all devices */ 196 WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */
236 for (i = 0; i < objio_seg->num_comps; i++) { 197 WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred));
237 struct objio_dev_ent *ode;
238 198
239 ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); 199 ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id;
240 if (unlikely(IS_ERR(ode))) { 200 ocomp->obj.id = src_comp->oc_object_id.oid_object_id;
241 err = PTR_ERR(ode);
242 goto out;
243 }
244 objio_seg->ods[i] = ode;
245 }
246 err = 0;
247 201
248out: 202 memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred));
249 dprintk("%s: return=%d\n", __func__, err);
250 return err;
251} 203}
252 204
253static int _verify_data_map(struct pnfs_osd_layout *layout) 205int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
206 struct objio_segment **pseg)
254{ 207{
255 struct pnfs_osd_data_map *data_map = &layout->olo_map; 208 struct __alloc_objio_segment {
256 u64 stripe_length; 209 struct objio_segment olseg;
257 u32 group_width; 210 struct ore_dev *ods[numdevs];
258 211 struct ore_comp comps[numdevs];
259/* FIXME: Only raid0 for now. if not go through MDS */ 212 } *aolseg;
260 if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
261 printk(KERN_ERR "Only RAID_0 for now\n");
262 return -ENOTSUPP;
263 }
264 if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
265 printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
266 data_map->odm_num_comps, data_map->odm_mirror_cnt);
267 return -EINVAL;
268 }
269 213
270 if (data_map->odm_group_width) 214 aolseg = kzalloc(sizeof(*aolseg), gfp_flags);
271 group_width = data_map->odm_group_width; 215 if (unlikely(!aolseg)) {
272 else 216 dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__,
273 group_width = data_map->odm_num_comps / 217 numdevs, sizeof(*aolseg));
274 (data_map->odm_mirror_cnt + 1); 218 return -ENOMEM;
275
276 stripe_length = (u64)data_map->odm_stripe_unit * group_width;
277 if (stripe_length >= (1ULL << 32)) {
278 printk(KERN_ERR "Total Stripe length(0x%llx)"
279 " >= 32bit is not supported\n", _LLU(stripe_length));
280 return -ENOTSUPP;
281 } 219 }
282 220
283 if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { 221 aolseg->olseg.oc.numdevs = numdevs;
284 printk(KERN_ERR "Stripe Unit(0x%llx)" 222 aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS;
285 " must be Multples of PAGE_SIZE(0x%lx)\n", 223 aolseg->olseg.oc.comps = aolseg->comps;
286 _LLU(data_map->odm_stripe_unit), PAGE_SIZE); 224 aolseg->olseg.oc.ods = aolseg->ods;
287 return -ENOTSUPP;
288 }
289 225
226 *pseg = &aolseg->olseg;
290 return 0; 227 return 0;
291} 228}
292 229
293static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
294 struct pnfs_osd_object_cred *src_comp,
295 struct caps_buffers *caps_p)
296{
297 WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
298 WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
299
300 *cur_comp = *src_comp;
301
302 memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
303 sizeof(caps_p->caps_key));
304 cur_comp->oc_cap_key.cred = caps_p->caps_key;
305
306 memcpy(caps_p->creds, src_comp->oc_cap.cred,
307 sizeof(caps_p->creds));
308 cur_comp->oc_cap.cred = caps_p->creds;
309}
310
311int objio_alloc_lseg(struct pnfs_layout_segment **outp, 230int objio_alloc_lseg(struct pnfs_layout_segment **outp,
312 struct pnfs_layout_hdr *pnfslay, 231 struct pnfs_layout_hdr *pnfslay,
313 struct pnfs_layout_range *range, 232 struct pnfs_layout_range *range,
@@ -317,59 +236,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
317 struct objio_segment *objio_seg; 236 struct objio_segment *objio_seg;
318 struct pnfs_osd_xdr_decode_layout_iter iter; 237 struct pnfs_osd_xdr_decode_layout_iter iter;
319 struct pnfs_osd_layout layout; 238 struct pnfs_osd_layout layout;
320 struct pnfs_osd_object_cred *cur_comp, src_comp; 239 struct pnfs_osd_object_cred src_comp;
321 struct caps_buffers *caps_p; 240 unsigned cur_comp;
322 int err; 241 int err;
323 242
324 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); 243 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
325 if (unlikely(err)) 244 if (unlikely(err))
326 return err; 245 return err;
327 246
328 err = _verify_data_map(&layout); 247 err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg);
329 if (unlikely(err)) 248 if (unlikely(err))
330 return err; 249 return err;
331 250
332 objio_seg = kzalloc(sizeof(*objio_seg) + 251 objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit;
333 sizeof(objio_seg->ods[0]) * layout.olo_num_comps + 252 objio_seg->layout.group_width = layout.olo_map.odm_group_width;
334 sizeof(*objio_seg->comps) * layout.olo_num_comps + 253 objio_seg->layout.group_depth = layout.olo_map.odm_group_depth;
335 sizeof(struct caps_buffers) * layout.olo_num_comps, 254 objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
336 gfp_flags); 255 objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm;
337 if (!objio_seg)
338 return -ENOMEM;
339 256
340 objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); 257 err = ore_verify_layout(layout.olo_map.odm_num_comps,
341 cur_comp = objio_seg->comps; 258 &objio_seg->layout);
342 caps_p = (void *)(cur_comp + layout.olo_num_comps);
343 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
344 copy_single_comp(cur_comp++, &src_comp, caps_p++);
345 if (unlikely(err)) 259 if (unlikely(err))
346 goto err; 260 goto err;
347 261
348 objio_seg->num_comps = layout.olo_num_comps; 262 objio_seg->oc.first_dev = layout.olo_comps_index;
349 objio_seg->comps_index = layout.olo_comps_index; 263 cur_comp = 0;
350 err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); 264 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
351 if (err) 265 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
352 goto err; 266 err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
353 267 &src_comp.oc_object_id.oid_device_id,
354 objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; 268 gfp_flags);
355 objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; 269 if (err)
356 if (layout.olo_map.odm_group_width) { 270 goto err;
357 objio_seg->group_width = layout.olo_map.odm_group_width; 271 ++cur_comp;
358 objio_seg->group_depth = layout.olo_map.odm_group_depth;
359 objio_seg->group_count = layout.olo_map.odm_num_comps /
360 objio_seg->mirrors_p1 /
361 objio_seg->group_width;
362 } else {
363 objio_seg->group_width = layout.olo_map.odm_num_comps /
364 objio_seg->mirrors_p1;
365 objio_seg->group_depth = -1;
366 objio_seg->group_count = 1;
367 } 272 }
368 273 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
369 /* Cache this calculation it will hit for every page */ 274 if (unlikely(err))
370 objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - 275 goto err;
371 objio_seg->stripe_unit) *
372 objio_seg->group_width;
373 276
374 *outp = &objio_seg->lseg; 277 *outp = &objio_seg->lseg;
375 return 0; 278 return 0;
@@ -386,43 +289,63 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg)
386 int i; 289 int i;
387 struct objio_segment *objio_seg = OBJIO_LSEG(lseg); 290 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
388 291
389 for (i = 0; i < objio_seg->num_comps; i++) { 292 for (i = 0; i < objio_seg->oc.numdevs; i++) {
390 if (!objio_seg->ods[i]) 293 struct ore_dev *od = objio_seg->oc.ods[i];
294 struct objio_dev_ent *ode;
295
296 if (!od)
391 break; 297 break;
392 nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); 298 ode = container_of(od, typeof(*ode), od);
299 nfs4_put_deviceid_node(&ode->id_node);
393 } 300 }
394 kfree(objio_seg); 301 kfree(objio_seg);
395} 302}
396 303
397int objio_alloc_io_state(struct pnfs_layout_segment *lseg, 304static int
398 struct objlayout_io_state **outp, 305objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading,
399 gfp_t gfp_flags) 306 struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase,
307 loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags,
308 struct objio_state **outp)
400{ 309{
401 struct objio_segment *objio_seg = OBJIO_LSEG(lseg); 310 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
402 struct objio_state *ios; 311 struct ore_io_state *ios;
403 const unsigned first_size = sizeof(*ios) + 312 int ret;
404 objio_seg->num_comps * sizeof(ios->per_dev[0]); 313 struct __alloc_objio_state {
405 const unsigned sec_size = objio_seg->num_comps * 314 struct objio_state objios;
406 sizeof(ios->ol_state.ioerrs[0]); 315 struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs];
407 316 } *aos;
408 ios = kzalloc(first_size + sec_size, gfp_flags); 317
409 if (unlikely(!ios)) 318 aos = kzalloc(sizeof(*aos), gfp_flags);
319 if (unlikely(!aos))
410 return -ENOMEM; 320 return -ENOMEM;
411 321
412 ios->layout = objio_seg; 322 objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs,
413 ios->ol_state.ioerrs = ((void *)ios) + first_size; 323 aos->ioerrs, rpcdata, pnfs_layout_type);
414 ios->ol_state.num_comps = objio_seg->num_comps;
415 324
416 *outp = &ios->ol_state; 325 ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading,
326 offset, count, &ios);
327 if (unlikely(ret)) {
328 kfree(aos);
329 return ret;
330 }
331
332 ios->pages = pages;
333 ios->pgbase = pgbase;
334 ios->private = aos;
335 BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT);
336
337 aos->objios.sync = 0;
338 aos->objios.ios = ios;
339 *outp = &aos->objios;
417 return 0; 340 return 0;
418} 341}
419 342
420void objio_free_io_state(struct objlayout_io_state *ol_state) 343void objio_free_result(struct objlayout_io_res *oir)
421{ 344{
422 struct objio_state *ios = container_of(ol_state, struct objio_state, 345 struct objio_state *objios = container_of(oir, struct objio_state, oir);
423 ol_state);
424 346
425 kfree(ios); 347 ore_put_io_state(objios->ios);
348 kfree(objios);
426} 349}
427 350
428enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) 351enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
@@ -455,539 +378,152 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
455 } 378 }
456} 379}
457 380
458static void _clear_bio(struct bio *bio) 381static void __on_dev_error(struct ore_io_state *ios,
382 struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep,
383 u64 dev_offset, u64 dev_len)
459{ 384{
460 struct bio_vec *bv; 385 struct objio_state *objios = ios->private;
461 unsigned i; 386 struct pnfs_osd_objid pooid;
462 387 struct objio_dev_ent *ode = container_of(od, typeof(*ode), od);
463 __bio_for_each_segment(bv, bio, i, 0) { 388 /* FIXME: what to do with more-then-one-group layouts. We need to
464 unsigned this_count = bv->bv_len; 389 * translate from ore_io_state index to oc->comps index
465 390 */
466 if (likely(PAGE_SIZE == this_count)) 391 unsigned comp = dev_index;
467 clear_highpage(bv->bv_page);
468 else
469 zero_user(bv->bv_page, bv->bv_offset, this_count);
470 }
471}
472
473static int _io_check(struct objio_state *ios, bool is_write)
474{
475 enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
476 int lin_ret = 0;
477 int i;
478
479 for (i = 0; i < ios->numdevs; i++) {
480 struct osd_sense_info osi;
481 struct osd_request *or = ios->per_dev[i].or;
482 int ret;
483
484 if (!or)
485 continue;
486 392
487 ret = osd_req_decode_sense(or, &osi); 393 pooid.oid_device_id = ode->id_node.deviceid;
488 if (likely(!ret)) 394 pooid.oid_partition_id = ios->oc->comps[comp].obj.partition;
489 continue; 395 pooid.oid_object_id = ios->oc->comps[comp].obj.id;
490 396
491 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 397 objlayout_io_set_result(&objios->oir, comp,
492 /* start read offset passed endof file */ 398 &pooid, osd_pri_2_pnfs_err(oep),
493 BUG_ON(is_write); 399 dev_offset, dev_len, !ios->reading);
494 _clear_bio(ios->per_dev[i].bio);
495 dprintk("%s: start read offset passed end of file "
496 "offset=0x%llx, length=0x%lx\n", __func__,
497 _LLU(ios->per_dev[i].offset),
498 ios->per_dev[i].length);
499
500 continue; /* we recovered */
501 }
502 objlayout_io_set_result(&ios->ol_state, i,
503 &ios->layout->comps[i].oc_object_id,
504 osd_pri_2_pnfs_err(osi.osd_err_pri),
505 ios->per_dev[i].offset,
506 ios->per_dev[i].length,
507 is_write);
508
509 if (osi.osd_err_pri >= oep) {
510 oep = osi.osd_err_pri;
511 lin_ret = ret;
512 }
513 }
514
515 return lin_ret;
516}
517
518/*
519 * Common IO state helpers.
520 */
521static void _io_free(struct objio_state *ios)
522{
523 unsigned i;
524
525 for (i = 0; i < ios->numdevs; i++) {
526 struct _objio_per_comp *per_dev = &ios->per_dev[i];
527
528 if (per_dev->or) {
529 osd_end_request(per_dev->or);
530 per_dev->or = NULL;
531 }
532
533 if (per_dev->bio) {
534 bio_put(per_dev->bio);
535 per_dev->bio = NULL;
536 }
537 }
538}
539
540struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
541{
542 unsigned min_dev = ios->layout->comps_index;
543 unsigned max_dev = min_dev + ios->layout->num_comps;
544
545 BUG_ON(dev < min_dev || max_dev <= dev);
546 return ios->layout->ods[dev - min_dev]->od;
547}
548
549struct _striping_info {
550 u64 obj_offset;
551 u64 group_length;
552 unsigned dev;
553 unsigned unit_off;
554};
555
556static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
557 struct _striping_info *si)
558{
559 u32 stripe_unit = ios->layout->stripe_unit;
560 u32 group_width = ios->layout->group_width;
561 u64 group_depth = ios->layout->group_depth;
562 u32 U = stripe_unit * group_width;
563
564 u64 T = U * group_depth;
565 u64 S = T * ios->layout->group_count;
566 u64 M = div64_u64(file_offset, S);
567
568 /*
569 G = (L - (M * S)) / T
570 H = (L - (M * S)) % T
571 */
572 u64 LmodU = file_offset - M * S;
573 u32 G = div64_u64(LmodU, T);
574 u64 H = LmodU - G * T;
575
576 u32 N = div_u64(H, U);
577
578 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
579 si->obj_offset = si->unit_off + (N * stripe_unit) +
580 (M * group_depth * stripe_unit);
581
582 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
583 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
584 si->dev *= ios->layout->mirrors_p1;
585
586 si->group_length = T - H;
587}
588
589static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
590 unsigned pgbase, struct _objio_per_comp *per_dev, int len,
591 gfp_t gfp_flags)
592{
593 unsigned pg = *cur_pg;
594 int cur_len = len;
595 struct request_queue *q =
596 osd_request_queue(_io_od(ios, per_dev->dev));
597
598 if (per_dev->bio == NULL) {
599 unsigned pages_in_stripe = ios->layout->group_width *
600 (ios->layout->stripe_unit / PAGE_SIZE);
601 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
602 ios->layout->group_width;
603
604 if (BIO_MAX_PAGES_KMALLOC < bio_size)
605 bio_size = BIO_MAX_PAGES_KMALLOC;
606
607 per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
608 if (unlikely(!per_dev->bio)) {
609 dprintk("Faild to allocate BIO size=%u\n", bio_size);
610 return -ENOMEM;
611 }
612 }
613
614 while (cur_len > 0) {
615 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
616 unsigned added_len;
617
618 BUG_ON(ios->ol_state.nr_pages <= pg);
619 cur_len -= pglen;
620
621 added_len = bio_add_pc_page(q, per_dev->bio,
622 ios->ol_state.pages[pg], pglen, pgbase);
623 if (unlikely(pglen != added_len))
624 return -ENOMEM;
625 pgbase = 0;
626 ++pg;
627 }
628 BUG_ON(cur_len);
629
630 per_dev->length += len;
631 *cur_pg = pg;
632 return 0;
633}
634
635static int _prepare_one_group(struct objio_state *ios, u64 length,
636 struct _striping_info *si, unsigned *last_pg,
637 gfp_t gfp_flags)
638{
639 unsigned stripe_unit = ios->layout->stripe_unit;
640 unsigned mirrors_p1 = ios->layout->mirrors_p1;
641 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
642 unsigned dev = si->dev;
643 unsigned first_dev = dev - (dev % devs_in_group);
644 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
645 unsigned cur_pg = *last_pg;
646 int ret = 0;
647
648 while (length) {
649 struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
650 unsigned cur_len, page_off = 0;
651
652 if (!per_dev->length) {
653 per_dev->dev = dev;
654 if (dev < si->dev) {
655 per_dev->offset = si->obj_offset + stripe_unit -
656 si->unit_off;
657 cur_len = stripe_unit;
658 } else if (dev == si->dev) {
659 per_dev->offset = si->obj_offset;
660 cur_len = stripe_unit - si->unit_off;
661 page_off = si->unit_off & ~PAGE_MASK;
662 BUG_ON(page_off &&
663 (page_off != ios->ol_state.pgbase));
664 } else { /* dev > si->dev */
665 per_dev->offset = si->obj_offset - si->unit_off;
666 cur_len = stripe_unit;
667 }
668
669 if (max_comp < dev - first_dev)
670 max_comp = dev - first_dev;
671 } else {
672 cur_len = stripe_unit;
673 }
674 if (cur_len >= length)
675 cur_len = length;
676
677 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
678 cur_len, gfp_flags);
679 if (unlikely(ret))
680 goto out;
681
682 dev += mirrors_p1;
683 dev = (dev % devs_in_group) + first_dev;
684
685 length -= cur_len;
686 ios->length += cur_len;
687 }
688out:
689 ios->numdevs = max_comp + mirrors_p1;
690 *last_pg = cur_pg;
691 return ret;
692}
693
694static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
695{
696 u64 length = ios->ol_state.count;
697 u64 offset = ios->ol_state.offset;
698 struct _striping_info si;
699 unsigned last_pg = 0;
700 int ret = 0;
701
702 while (length) {
703 _calc_stripe_info(ios, offset, &si);
704
705 if (length < si.group_length)
706 si.group_length = length;
707
708 ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
709 if (unlikely(ret))
710 goto out;
711
712 offset += si.group_length;
713 length -= si.group_length;
714 }
715
716out:
717 if (!ios->length)
718 return ret;
719
720 return 0;
721}
722
723static ssize_t _sync_done(struct objio_state *ios)
724{
725 struct completion *waiting = ios->private;
726
727 complete(waiting);
728 return 0;
729}
730
731static void _last_io(struct kref *kref)
732{
733 struct objio_state *ios = container_of(kref, struct objio_state, kref);
734
735 ios->done(ios);
736}
737
738static void _done_io(struct osd_request *or, void *p)
739{
740 struct objio_state *ios = p;
741
742 kref_put(&ios->kref, _last_io);
743}
744
745static ssize_t _io_exec(struct objio_state *ios)
746{
747 DECLARE_COMPLETION_ONSTACK(wait);
748 ssize_t status = 0; /* sync status */
749 unsigned i;
750 objio_done_fn saved_done_fn = ios->done;
751 bool sync = ios->ol_state.sync;
752
753 if (sync) {
754 ios->done = _sync_done;
755 ios->private = &wait;
756 }
757
758 kref_init(&ios->kref);
759
760 for (i = 0; i < ios->numdevs; i++) {
761 struct osd_request *or = ios->per_dev[i].or;
762
763 if (!or)
764 continue;
765
766 kref_get(&ios->kref);
767 osd_execute_request_async(or, _done_io, ios);
768 }
769
770 kref_put(&ios->kref, _last_io);
771
772 if (sync) {
773 wait_for_completion(&wait);
774 status = saved_done_fn(ios);
775 }
776
777 return status;
778} 400}
779 401
780/* 402/*
781 * read 403 * read
782 */ 404 */
783static ssize_t _read_done(struct objio_state *ios) 405static void _read_done(struct ore_io_state *ios, void *private)
784{ 406{
407 struct objio_state *objios = private;
785 ssize_t status; 408 ssize_t status;
786 int ret = _io_check(ios, false); 409 int ret = ore_check_io(ios, &__on_dev_error);
787 410
788 _io_free(ios); 411 /* FIXME: _io_free(ios) can we deallocate the libosd resources? */
789 412
790 if (likely(!ret)) 413 if (likely(!ret))
791 status = ios->length; 414 status = ios->length;
792 else 415 else
793 status = ret; 416 status = ret;
794 417
795 objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); 418 objlayout_read_done(&objios->oir, status, objios->sync);
796 return status;
797} 419}
798 420
799static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) 421int objio_read_pagelist(struct nfs_read_data *rdata)
800{ 422{
801 struct osd_request *or = NULL; 423 struct objio_state *objios;
802 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
803 unsigned dev = per_dev->dev;
804 struct pnfs_osd_object_cred *cred =
805 &ios->layout->comps[cur_comp];
806 struct osd_obj_id obj = {
807 .partition = cred->oc_object_id.oid_partition_id,
808 .id = cred->oc_object_id.oid_object_id,
809 };
810 int ret; 424 int ret;
811 425
812 or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); 426 ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
813 if (unlikely(!or)) { 427 rdata->lseg, rdata->args.pages, rdata->args.pgbase,
814 ret = -ENOMEM; 428 rdata->args.offset, rdata->args.count, rdata,
815 goto err; 429 GFP_KERNEL, &objios);
816 }
817 per_dev->or = or;
818
819 osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
820
821 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
822 if (ret) {
823 dprintk("%s: Faild to osd_finalize_request() => %d\n",
824 __func__, ret);
825 goto err;
826 }
827
828 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
829 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
830 per_dev->length);
831
832err:
833 return ret;
834}
835
836static ssize_t _read_exec(struct objio_state *ios)
837{
838 unsigned i;
839 int ret;
840
841 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
842 if (!ios->per_dev[i].length)
843 continue;
844 ret = _read_mirrors(ios, i);
845 if (unlikely(ret))
846 goto err;
847 }
848
849 ios->done = _read_done;
850 return _io_exec(ios); /* In sync mode exec returns the io status */
851
852err:
853 _io_free(ios);
854 return ret;
855}
856
857ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
858{
859 struct objio_state *ios = container_of(ol_state, struct objio_state,
860 ol_state);
861 int ret;
862
863 ret = _io_rw_pagelist(ios, GFP_KERNEL);
864 if (unlikely(ret)) 430 if (unlikely(ret))
865 return ret; 431 return ret;
866 432
867 return _read_exec(ios); 433 objios->ios->done = _read_done;
434 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
435 rdata->args.offset, rdata->args.count);
436 return ore_read(objios->ios);
868} 437}
869 438
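
A note on the new read path above: the per-device OSD request loop is gone and a single ORE submission drives all devices. The ore_io_state carries a completion callback plus the objio_state as its private pointer, and ore_read() does the fan-out. A condensed sketch of that callback pattern, using only names that appear in this hunk (error handling abbreviated, so this is illustrative rather than the exact code):

static void my_read_done(struct ore_io_state *ios, void *private)
{
	struct objio_state *objios = private;
	int ret = ore_check_io(ios, &__on_dev_error);
	ssize_t status = likely(!ret) ? ios->length : ret;

	/* hand the result back to the generic objlayout code */
	objlayout_read_done(&objios->oir, status, objios->sync);
}

static int my_read(struct nfs_read_data *rdata)
{
	struct objio_state *objios;
	int ret;

	/* bind an ORE io state to this layout segment and page list */
	ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true,
				   rdata->lseg, rdata->args.pages,
				   rdata->args.pgbase, rdata->args.offset,
				   rdata->args.count, rdata, GFP_KERNEL, &objios);
	if (unlikely(ret))
		return ret;

	objios->ios->done = my_read_done;	/* called when all devices finish */
	return ore_read(objios->ios);		/* submits the per-device requests */
}
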
870/* 439/*
871 * write 440 * write
872 */ 441 */
873static ssize_t _write_done(struct objio_state *ios) 442static void _write_done(struct ore_io_state *ios, void *private)
874{ 443{
444 struct objio_state *objios = private;
875 ssize_t status; 445 ssize_t status;
876 int ret = _io_check(ios, true); 446 int ret = ore_check_io(ios, &__on_dev_error);
877 447
878 _io_free(ios); 448 /* FIXME: _io_free(ios): can we deallocate the libosd resources here? */
879 449
880 if (likely(!ret)) { 450 if (likely(!ret)) {
881 /* FIXME: should be based on the OSD's persistence model 451 /* FIXME: should be based on the OSD's persistence model
882 * See OSD2r05 Section 4.13 Data persistence model */ 452 * See OSD2r05 Section 4.13 Data persistence model */
883 ios->ol_state.committed = NFS_FILE_SYNC; 453 objios->oir.committed = NFS_FILE_SYNC;
884 status = ios->length; 454 status = ios->length;
885 } else { 455 } else {
886 status = ret; 456 status = ret;
887 } 457 }
888 458
889 objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); 459 objlayout_write_done(&objios->oir, status, objios->sync);
890 return status;
891} 460}
892 461
893static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) 462static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
894{ 463{
895 struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; 464 struct objio_state *objios = priv;
896 unsigned dev = ios->per_dev[cur_comp].dev; 465 struct nfs_write_data *wdata = objios->oir.rpcdata;
897 unsigned last_comp = cur_comp + ios->layout->mirrors_p1; 466 pgoff_t index = offset / PAGE_SIZE;
898 int ret; 467 struct page *page = find_get_page(wdata->inode->i_mapping, index);
899
900 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
901 struct osd_request *or = NULL;
902 struct pnfs_osd_object_cred *cred =
903 &ios->layout->comps[cur_comp];
904 struct osd_obj_id obj = {
905 .partition = cred->oc_object_id.oid_partition_id,
906 .id = cred->oc_object_id.oid_object_id,
907 };
908 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
909 struct bio *bio;
910
911 or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
912 if (unlikely(!or)) {
913 ret = -ENOMEM;
914 goto err;
915 }
916 per_dev->or = or;
917
918 if (per_dev != master_dev) {
919 bio = bio_kmalloc(GFP_NOFS,
920 master_dev->bio->bi_max_vecs);
921 if (unlikely(!bio)) {
922 dprintk("Faild to allocate BIO size=%u\n",
923 master_dev->bio->bi_max_vecs);
924 ret = -ENOMEM;
925 goto err;
926 }
927
928 __bio_clone(bio, master_dev->bio);
929 bio->bi_bdev = NULL;
930 bio->bi_next = NULL;
931 per_dev->bio = bio;
932 per_dev->dev = dev;
933 per_dev->length = master_dev->length;
934 per_dev->offset = master_dev->offset;
935 } else {
936 bio = master_dev->bio;
937 bio->bi_rw |= REQ_WRITE;
938 }
939
940 osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
941 468
942 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); 469 if (!page) {
943 if (ret) { 470 page = find_or_create_page(wdata->inode->i_mapping,
944 dprintk("%s: Faild to osd_finalize_request() => %d\n", 471 index, GFP_NOFS);
945 __func__, ret); 472 if (unlikely(!page)) {
946 goto err; 473 dprintk("%s: grab_cache_page Failed index=0x%lx\n",
474 __func__, index);
475 return NULL;
947 } 476 }
948 477 unlock_page(page);
949 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
950 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
951 per_dev->length);
952 } 478 }
479 if (PageDirty(page) || PageWriteback(page))
480 *uptodate = true;
481 else
482 *uptodate = PageUptodate(page);
483 dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
484 return page;
485}
953 486
954err: 487static void __r4w_put_page(void *priv, struct page *page)
955 return ret; 488{
489 dprintk("%s: index=0x%lx\n", __func__, page->index);
490 page_cache_release(page);
491 return;
956} 492}
957 493
958static ssize_t _write_exec(struct objio_state *ios) 494static const struct _ore_r4w_op _r4w_op = {
495 .get_page = &__r4w_get_page,
496 .put_page = &__r4w_put_page,
497};
498
499int objio_write_pagelist(struct nfs_write_data *wdata, int how)
959{ 500{
960 unsigned i; 501 struct objio_state *objios;
961 int ret; 502 int ret;
962 503
963 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 504 ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false,
964 if (!ios->per_dev[i].length) 505 wdata->lseg, wdata->args.pages, wdata->args.pgbase,
965 continue; 506 wdata->args.offset, wdata->args.count, wdata, GFP_NOFS,
966 ret = _write_mirrors(ios, i); 507 &objios);
967 if (unlikely(ret)) 508 if (unlikely(ret))
968 goto err; 509 return ret;
969 }
970
971 ios->done = _write_done;
972 return _io_exec(ios); /* In sync mode exec returns the io->status */
973 510
974err: 511 objios->sync = 0 != (how & FLUSH_SYNC);
975 _io_free(ios); 512 objios->ios->r4w = &_r4w_op;
976 return ret;
977}
978 513
979ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) 514 if (!objios->sync)
980{ 515 objios->ios->done = _write_done;
981 struct objio_state *ios = container_of(ol_state, struct objio_state,
982 ol_state);
983 int ret;
984 516
985 /* TODO: ios->stable = stable; */ 517 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
986 ret = _io_rw_pagelist(ios, GFP_NOFS); 518 wdata->args.offset, wdata->args.count);
519 ret = ore_write(objios->ios);
987 if (unlikely(ret)) 520 if (unlikely(ret))
988 return ret; 521 return ret;
989 522
990 return _write_exec(ios); 523 if (objios->sync)
524 _write_done(objios->ios, objios);
525
526 return 0;
991} 527}
992 528
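
Two things in the write side above are worth calling out. First, the read-for-write hooks: ios->r4w points at _r4w_op, and the get_page/put_page pair lets the raid engine borrow page-cache pages it is not itself writing, presumably so it can read the rest of a stripe; that rationale is inferred, not stated in this hunk. Second, completion splits on FLUSH_SYNC: async writes get the _write_done callback, sync writes call it directly after ore_write() returns. A reduced sketch of the page-hook contract, mirroring __r4w_get_page()/__r4w_put_page() above (illustrative only):

static struct page *example_get_page(void *priv, u64 offset, bool *uptodate)
{
	struct objio_state *objios = priv;
	struct nfs_write_data *wdata = objios->oir.rpcdata;
	pgoff_t index = offset / PAGE_SIZE;
	struct page *page = find_get_page(wdata->inode->i_mapping, index);

	if (!page) {
		/* not cached yet: create it, but hand it back unlocked */
		page = find_or_create_page(wdata->inode->i_mapping, index, GFP_NOFS);
		if (unlikely(!page))
			return NULL;
		unlock_page(page);
	}
	/* dirty or in-flight pages already carry the newest data */
	*uptodate = PageDirty(page) || PageWriteback(page) || PageUptodate(page);
	return page;	/* reference dropped in example_put_page() */
}

static void example_put_page(void *priv, struct page *page)
{
	page_cache_release(page);
}
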
993static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, 529static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
@@ -997,7 +533,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
997 return false; 533 return false;
998 534
999 return pgio->pg_count + req->wb_bytes <= 535 return pgio->pg_count + req->wb_bytes <=
1000 OBJIO_LSEG(pgio->pg_lseg)->max_io_size; 536 OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
1001} 537}
1002 538
1003static const struct nfs_pageio_ops objio_pg_read_ops = { 539static const struct nfs_pageio_ops objio_pg_read_ops = {
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 1d06f8e2ade..72074e3a04f 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -156,77 +156,39 @@ last_byte_offset(u64 start, u64 len)
156 return end > start ? end - 1 : NFS4_MAX_UINT64; 156 return end > start ? end - 1 : NFS4_MAX_UINT64;
157} 157}
158 158
159static struct objlayout_io_state * 159void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
160objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, 160 struct page ***p_pages, unsigned *p_pgbase,
161 struct page **pages, 161 u64 offset, unsigned long count)
162 unsigned pgbase,
163 loff_t offset,
164 size_t count,
165 struct pnfs_layout_segment *lseg,
166 void *rpcdata,
167 gfp_t gfp_flags)
168{ 162{
169 struct objlayout_io_state *state;
170 u64 lseg_end_offset; 163 u64 lseg_end_offset;
171 164
172 dprintk("%s: allocating io_state\n", __func__);
173 if (objio_alloc_io_state(lseg, &state, gfp_flags))
174 return NULL;
175
176 BUG_ON(offset < lseg->pls_range.offset); 165 BUG_ON(offset < lseg->pls_range.offset);
177 lseg_end_offset = end_offset(lseg->pls_range.offset, 166 lseg_end_offset = end_offset(lseg->pls_range.offset,
178 lseg->pls_range.length); 167 lseg->pls_range.length);
179 BUG_ON(offset >= lseg_end_offset); 168 BUG_ON(offset >= lseg_end_offset);
180 if (offset + count > lseg_end_offset) { 169 WARN_ON(offset + count > lseg_end_offset);
181 count = lseg->pls_range.length -
182 (offset - lseg->pls_range.offset);
183 dprintk("%s: truncated count %Zd\n", __func__, count);
184 }
185 170
186 if (pgbase > PAGE_SIZE) { 171 if (*p_pgbase > PAGE_SIZE) {
187 pages += pgbase >> PAGE_SHIFT; 172 dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase);
188 pgbase &= ~PAGE_MASK; 173 *p_pages += *p_pgbase >> PAGE_SHIFT;
174 *p_pgbase &= ~PAGE_MASK;
189 } 175 }
190
191 INIT_LIST_HEAD(&state->err_list);
192 state->lseg = lseg;
193 state->rpcdata = rpcdata;
194 state->pages = pages;
195 state->pgbase = pgbase;
196 state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
197 state->offset = offset;
198 state->count = count;
199 state->sync = 0;
200
201 return state;
202}
203
204static void
205objlayout_free_io_state(struct objlayout_io_state *state)
206{
207 dprintk("%s: freeing io_state\n", __func__);
208 if (unlikely(!state))
209 return;
210
211 objio_free_io_state(state);
212} 176}
213 177
214/* 178/*
215 * I/O done common code 179 * I/O done common code
216 */ 180 */
217static void 181static void
218objlayout_iodone(struct objlayout_io_state *state) 182objlayout_iodone(struct objlayout_io_res *oir)
219{ 183{
220 dprintk("%s: state %p status\n", __func__, state); 184 if (likely(oir->status >= 0)) {
221 185 objio_free_result(oir);
222 if (likely(state->status >= 0)) {
223 objlayout_free_io_state(state);
224 } else { 186 } else {
225 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); 187 struct objlayout *objlay = oir->objlay;
226 188
227 spin_lock(&objlay->lock); 189 spin_lock(&objlay->lock);
228 objlay->delta_space_valid = OBJ_DSU_INVALID; 190 objlay->delta_space_valid = OBJ_DSU_INVALID;
229 list_add(&objlay->err_list, &state->err_list); 191 list_add(&objlay->err_list, &oir->err_list);
230 spin_unlock(&objlay->lock); 192 spin_unlock(&objlay->lock);
231 } 193 }
232} 194}
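
_fix_verify_io_params() above normalizes the (pages, pgbase) pair before it reaches the raid engine: any whole pages hidden in pgbase are folded into the page-array pointer, so pgbase always ends up as an offset inside the first page. For example, pgbase = 2*PAGE_SIZE + 100 becomes pages += 2, pgbase = 100. The core of it, condensed:

	if (*p_pgbase > PAGE_SIZE) {
		*p_pages  += *p_pgbase >> PAGE_SHIFT;	/* advance by whole pages */
		*p_pgbase &= ~PAGE_MASK;		/* keep the in-page offset */
	}
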
@@ -238,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state)
238 * the error for later reporting at layout-return. 200 * the error for later reporting at layout-return.
239 */ 201 */
240void 202void
241objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, 203objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
242 struct pnfs_osd_objid *pooid, int osd_error, 204 struct pnfs_osd_objid *pooid, int osd_error,
243 u64 offset, u64 length, bool is_write) 205 u64 offset, u64 length, bool is_write)
244{ 206{
245 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; 207 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index];
246 208
247 BUG_ON(index >= state->num_comps); 209 BUG_ON(index >= oir->num_comps);
248 if (osd_error) { 210 if (osd_error) {
249 ioerr->oer_component = *pooid; 211 ioerr->oer_component = *pooid;
250 ioerr->oer_comp_offset = offset; 212 ioerr->oer_comp_offset = offset;
@@ -285,21 +247,18 @@ static void _rpc_read_complete(struct work_struct *work)
285} 247}
286 248
287void 249void
288objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) 250objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
289{ 251{
290 int eof = state->eof; 252 struct nfs_read_data *rdata = oir->rpcdata;
291 struct nfs_read_data *rdata;
292 253
293 state->status = status; 254 oir->status = rdata->task.tk_status = status;
294 dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); 255 if (status >= 0)
295 rdata = state->rpcdata;
296 rdata->task.tk_status = status;
297 if (status >= 0) {
298 rdata->res.count = status; 256 rdata->res.count = status;
299 rdata->res.eof = eof; 257 objlayout_iodone(oir);
300 } 258 /* must not use oir after this point */
301 objlayout_iodone(state); 259
302 /* must not use state after this point */ 260 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
261 status, rdata->res.eof, sync);
303 262
304 if (sync) 263 if (sync)
305 pnfs_ld_read_done(rdata); 264 pnfs_ld_read_done(rdata);
@@ -317,40 +276,36 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
317{ 276{
318 loff_t offset = rdata->args.offset; 277 loff_t offset = rdata->args.offset;
319 size_t count = rdata->args.count; 278 size_t count = rdata->args.count;
320 struct objlayout_io_state *state; 279 int err;
321 ssize_t status = 0;
322 loff_t eof; 280 loff_t eof;
323 281
324 dprintk("%s: Begin inode %p offset %llu count %d\n",
325 __func__, rdata->inode, offset, (int)count);
326
327 eof = i_size_read(rdata->inode); 282 eof = i_size_read(rdata->inode);
328 if (unlikely(offset + count > eof)) { 283 if (unlikely(offset + count > eof)) {
329 if (offset >= eof) { 284 if (offset >= eof) {
330 status = 0; 285 err = 0;
331 rdata->res.count = 0; 286 rdata->res.count = 0;
332 rdata->res.eof = 1; 287 rdata->res.eof = 1;
288 /*FIXME: do we need to call pnfs_ld_read_done() */
333 goto out; 289 goto out;
334 } 290 }
335 count = eof - offset; 291 count = eof - offset;
336 } 292 }
337 293
338 state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, 294 rdata->res.eof = (offset + count) >= eof;
339 rdata->args.pages, rdata->args.pgbase, 295 _fix_verify_io_params(rdata->lseg, &rdata->args.pages,
340 offset, count, 296 &rdata->args.pgbase,
341 rdata->lseg, rdata, 297 rdata->args.offset, rdata->args.count);
342 GFP_KERNEL);
343 if (unlikely(!state)) {
344 status = -ENOMEM;
345 goto out;
346 }
347 298
348 state->eof = state->offset + state->count >= eof; 299 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
300 __func__, rdata->inode->i_ino, offset, count, rdata->res.eof);
349 301
350 status = objio_read_pagelist(state); 302 err = objio_read_pagelist(rdata);
351 out: 303 out:
352 dprintk("%s: Return status %Zd\n", __func__, status); 304 if (unlikely(err)) {
353 rdata->pnfs_error = status; 305 rdata->pnfs_error = err;
306 dprintk("%s: Returned Error %d\n", __func__, err);
307 return PNFS_NOT_ATTEMPTED;
308 }
354 return PNFS_ATTEMPTED; 309 return PNFS_ATTEMPTED;
355} 310}
356 311
@@ -371,26 +326,20 @@ static void _rpc_write_complete(struct work_struct *work)
371} 326}
372 327
373void 328void
374objlayout_write_done(struct objlayout_io_state *state, ssize_t status, 329objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
375 bool sync)
376{ 330{
377 struct nfs_write_data *wdata; 331 struct nfs_write_data *wdata = oir->rpcdata;
378 332
379 dprintk("%s: Begin\n", __func__); 333 oir->status = wdata->task.tk_status = status;
380 wdata = state->rpcdata;
381 state->status = status;
382 wdata->task.tk_status = status;
383 if (status >= 0) { 334 if (status >= 0) {
384 wdata->res.count = status; 335 wdata->res.count = status;
385 wdata->verf.committed = state->committed; 336 wdata->verf.committed = oir->committed;
386 dprintk("%s: Return status %d committed %d\n", 337 }
387 __func__, wdata->task.tk_status, 338 objlayout_iodone(oir);
388 wdata->verf.committed); 339 /* must not use oir after this point */
389 } else 340
390 dprintk("%s: Return status %d\n", 341 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
391 __func__, wdata->task.tk_status); 342 status, wdata->verf.committed, sync);
392 objlayout_iodone(state);
393 /* must not use state after this point */
394 343
395 if (sync) 344 if (sync)
396 pnfs_ld_write_done(wdata); 345 pnfs_ld_write_done(wdata);
@@ -407,30 +356,18 @@ enum pnfs_try_status
407objlayout_write_pagelist(struct nfs_write_data *wdata, 356objlayout_write_pagelist(struct nfs_write_data *wdata,
408 int how) 357 int how)
409{ 358{
410 struct objlayout_io_state *state; 359 int err;
411 ssize_t status;
412
413 dprintk("%s: Begin inode %p offset %llu count %u\n",
414 __func__, wdata->inode, wdata->args.offset, wdata->args.count);
415
416 state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
417 wdata->args.pages,
418 wdata->args.pgbase,
419 wdata->args.offset,
420 wdata->args.count,
421 wdata->lseg, wdata,
422 GFP_NOFS);
423 if (unlikely(!state)) {
424 status = -ENOMEM;
425 goto out;
426 }
427 360
428 state->sync = how & FLUSH_SYNC; 361 _fix_verify_io_params(wdata->lseg, &wdata->args.pages,
362 &wdata->args.pgbase,
363 wdata->args.offset, wdata->args.count);
429 364
430 status = objio_write_pagelist(state, how & FLUSH_STABLE); 365 err = objio_write_pagelist(wdata, how);
431 out: 366 if (unlikely(err)) {
432 dprintk("%s: Return status %Zd\n", __func__, status); 367 wdata->pnfs_error = err;
433 wdata->pnfs_error = status; 368 dprintk("%s: Returned Error %d\n", __func__, err);
369 return PNFS_NOT_ATTEMPTED;
370 }
434 return PNFS_ATTEMPTED; 371 return PNFS_ATTEMPTED;
435} 372}
436 373
@@ -537,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err,
537static void 474static void
538encode_accumulated_error(struct objlayout *objlay, __be32 *p) 475encode_accumulated_error(struct objlayout *objlay, __be32 *p)
539{ 476{
540 struct objlayout_io_state *state, *tmp; 477 struct objlayout_io_res *oir, *tmp;
541 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; 478 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
542 479
543 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { 480 list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
544 unsigned i; 481 unsigned i;
545 482
546 for (i = 0; i < state->num_comps; i++) { 483 for (i = 0; i < oir->num_comps; i++) {
547 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; 484 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
548 485
549 if (!ioerr->oer_errno) 486 if (!ioerr->oer_errno)
550 continue; 487 continue;
@@ -563,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
563 500
564 merge_ioerr(&accumulated_err, ioerr); 501 merge_ioerr(&accumulated_err, ioerr);
565 } 502 }
566 list_del(&state->err_list); 503 list_del(&oir->err_list);
567 objlayout_free_io_state(state); 504 objio_free_result(oir);
568 } 505 }
569 506
570 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); 507 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
@@ -576,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
576 const struct nfs4_layoutreturn_args *args) 513 const struct nfs4_layoutreturn_args *args)
577{ 514{
578 struct objlayout *objlay = OBJLAYOUT(pnfslay); 515 struct objlayout *objlay = OBJLAYOUT(pnfslay);
579 struct objlayout_io_state *state, *tmp; 516 struct objlayout_io_res *oir, *tmp;
580 __be32 *start; 517 __be32 *start;
581 518
582 dprintk("%s: Begin\n", __func__); 519 dprintk("%s: Begin\n", __func__);
@@ -585,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
585 522
586 spin_lock(&objlay->lock); 523 spin_lock(&objlay->lock);
587 524
588 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { 525 list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) {
589 __be32 *last_xdr = NULL, *p; 526 __be32 *last_xdr = NULL, *p;
590 unsigned i; 527 unsigned i;
591 int res = 0; 528 int res = 0;
592 529
593 for (i = 0; i < state->num_comps; i++) { 530 for (i = 0; i < oir->num_comps; i++) {
594 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; 531 struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i];
595 532
596 if (!ioerr->oer_errno) 533 if (!ioerr->oer_errno)
597 continue; 534 continue;
@@ -615,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
615 } 552 }
616 553
617 last_xdr = p; 554 last_xdr = p;
618 pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); 555 pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]);
619 } 556 }
620 557
621 /* TODO: use xdr_write_pages */ 558 /* TODO: use xdr_write_pages */
@@ -631,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
631 encode_accumulated_error(objlay, last_xdr); 568 encode_accumulated_error(objlay, last_xdr);
632 goto loop_done; 569 goto loop_done;
633 } 570 }
634 list_del(&state->err_list); 571 list_del(&oir->err_list);
635 objlayout_free_io_state(state); 572 objio_free_result(oir);
636 } 573 }
637loop_done: 574loop_done:
638 spin_unlock(&objlay->lock); 575 spin_unlock(&objlay->lock);
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index a8244c8e042..8ec34727ed2 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -74,19 +74,11 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo)
74 * per-I/O operation state 74 * per-I/O operation state
75 * embedded in objects provider io_state data structure 75 * embedded in objects provider io_state data structure
76 */ 76 */
77struct objlayout_io_state { 77struct objlayout_io_res {
78 struct pnfs_layout_segment *lseg; 78 struct objlayout *objlay;
79
80 struct page **pages;
81 unsigned pgbase;
82 unsigned nr_pages;
83 unsigned long count;
84 loff_t offset;
85 bool sync;
86 79
87 void *rpcdata; 80 void *rpcdata;
88 int status; /* res */ 81 int status; /* res */
89 int eof; /* res */
90 int committed; /* res */ 82 int committed; /* res */
91 83
92 /* Error reporting (layout_return) */ 84 /* Error reporting (layout_return) */
@@ -100,6 +92,18 @@ struct objlayout_io_state {
100 struct pnfs_osd_ioerr *ioerrs; 92 struct pnfs_osd_ioerr *ioerrs;
101}; 93};
102 94
95static inline
96void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps,
97 struct pnfs_osd_ioerr *ioerrs, void *rpcdata,
98 struct pnfs_layout_hdr *pnfs_layout_type)
99{
100 oir->objlay = OBJLAYOUT(pnfs_layout_type);
101 oir->rpcdata = rpcdata;
102 INIT_LIST_HEAD(&oir->err_list);
103 oir->num_comps = num_comps;
104 oir->ioerrs = ioerrs;
105}
106
103/* 107/*
104 * Raid engine I/O API 108 * Raid engine I/O API
105 */ 109 */
@@ -110,28 +114,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
110 gfp_t gfp_flags); 114 gfp_t gfp_flags);
111extern void objio_free_lseg(struct pnfs_layout_segment *lseg); 115extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
112 116
113extern int objio_alloc_io_state( 117/* objio_free_result will free these @oir structs received from
114 struct pnfs_layout_segment *lseg, 118 * objlayout_{read,write}_done
115 struct objlayout_io_state **outp, 119 */
116 gfp_t gfp_flags); 120extern void objio_free_result(struct objlayout_io_res *oir);
117extern void objio_free_io_state(struct objlayout_io_state *state);
118 121
119extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); 122extern int objio_read_pagelist(struct nfs_read_data *rdata);
120extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, 123extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);
121 bool stable);
122 124
123/* 125/*
124 * callback API 126 * callback API
125 */ 127 */
126extern void objlayout_io_set_result(struct objlayout_io_state *state, 128extern void objlayout_io_set_result(struct objlayout_io_res *oir,
127 unsigned index, struct pnfs_osd_objid *pooid, 129 unsigned index, struct pnfs_osd_objid *pooid,
128 int osd_error, u64 offset, u64 length, bool is_write); 130 int osd_error, u64 offset, u64 length, bool is_write);
129 131
130static inline void 132static inline void
131objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) 133objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used)
132{ 134{
133 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
134
135 /* If one of the I/Os errored out and the delta_space_used was 135 /* If one of the I/Os errored out and the delta_space_used was
136 * invalid we render the complete report as invalid. Protocol mandate 136 * invalid we render the complete report as invalid. Protocol mandate
137 * the DSU be accurate or not reported. 137 * the DSU be accurate or not reported.
@@ -144,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
144 spin_unlock(&objlay->lock); 144 spin_unlock(&objlay->lock);
145} 145}
146 146
147extern void objlayout_read_done(struct objlayout_io_state *state, 147extern void objlayout_read_done(struct objlayout_io_res *oir,
148 ssize_t status, bool sync); 148 ssize_t status, bool sync);
149extern void objlayout_write_done(struct objlayout_io_state *state, 149extern void objlayout_write_done(struct objlayout_io_res *oir,
150 ssize_t status, bool sync); 150 ssize_t status, bool sync);
151 151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, 152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index b60970cc7f1..5668f7c54c4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -18,6 +18,7 @@
18#include <linux/nfs_page.h> 18#include <linux/nfs_page.h>
19#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21#include <linux/export.h>
21 22
22#include "internal.h" 23#include "internal.h"
23#include "pnfs.h" 24#include "pnfs.h"
@@ -41,7 +42,7 @@ nfs_page_free(struct nfs_page *p)
41 42
42/** 43/**
43 * nfs_create_request - Create an NFS read/write request. 44 * nfs_create_request - Create an NFS read/write request.
44 * @file: file descriptor to use 45 * @ctx: open context to use
45 * @inode: inode to which the request is attached 46 * @inode: inode to which the request is attached
46 * @page: page to write 47 * @page: page to write
47 * @offset: starting offset within the page for the write 48 * @offset: starting offset within the page for the write
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ee73d9a4f70..8e672a2b2d6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -29,6 +29,7 @@
29 29
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include <linux/nfs_page.h> 31#include <linux/nfs_page.h>
32#include <linux/module.h>
32#include "internal.h" 33#include "internal.h"
33#include "pnfs.h" 34#include "pnfs.h"
34#include "iostat.h" 35#include "iostat.h"
@@ -1259,6 +1260,25 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1259} 1260}
1260EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1261EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
1261 1262
1263static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
1264{
1265 struct nfs_pageio_descriptor pgio;
1266
1267 put_lseg(data->lseg);
1268 data->lseg = NULL;
1269 dprintk("pnfs write error = %d\n", data->pnfs_error);
1270
1271 nfs_pageio_init_read_mds(&pgio, data->inode);
1272
1273 while (!list_empty(&data->pages)) {
1274 struct nfs_page *req = nfs_list_entry(data->pages.next);
1275
1276 nfs_list_remove_request(req);
1277 nfs_pageio_add_request(&pgio, req);
1278 }
1279 nfs_pageio_complete(&pgio);
1280}
1281
1262/* 1282/*
1263 * Called by non rpc-based layout drivers 1283 * Called by non rpc-based layout drivers
1264 */ 1284 */
@@ -1267,11 +1287,8 @@ void pnfs_ld_read_done(struct nfs_read_data *data)
1267 if (likely(!data->pnfs_error)) { 1287 if (likely(!data->pnfs_error)) {
1268 __nfs4_read_done_cb(data); 1288 __nfs4_read_done_cb(data);
1269 data->mds_ops->rpc_call_done(&data->task, data); 1289 data->mds_ops->rpc_call_done(&data->task, data);
1270 } else { 1290 } else
1271 put_lseg(data->lseg); 1291 pnfs_ld_handle_read_error(data);
1272 data->lseg = NULL;
1273 dprintk("pnfs write error = %d\n", data->pnfs_error);
1274 }
1275 data->mds_ops->rpc_release(data); 1292 data->mds_ops->rpc_release(data);
1276} 1293}
1277EXPORT_SYMBOL_GPL(pnfs_ld_read_done); 1294EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
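
pnfs_ld_handle_read_error() above changes the failure policy: instead of merely dropping the layout segment and recording pnfs_error, the pages of the failed request are re-queued through the regular MDS read path (this pairs with the fs/nfs/read.c hunk further down, which removes the old fallback from nfs_readpage_release_full). Condensed, the resend looks like this (illustrative, error paths omitted):

static void resend_reads_through_mds(struct nfs_read_data *data)
{
	struct nfs_pageio_descriptor pgio;

	put_lseg(data->lseg);		/* give up on the failed layout segment */
	data->lseg = NULL;

	nfs_pageio_init_read_mds(&pgio, data->inode);
	while (!list_empty(&data->pages)) {
		struct nfs_page *req = nfs_list_entry(data->pages.next);

		nfs_list_remove_request(req);
		nfs_pageio_add_request(&pgio, req);	/* re-coalesce for the MDS */
	}
	nfs_pageio_complete(&pgio);			/* issue the MDS RPCs */
}
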
@@ -1443,17 +1460,31 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1443 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ 1460 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
1444 data = kzalloc(sizeof(*data), GFP_NOFS); 1461 data = kzalloc(sizeof(*data), GFP_NOFS);
1445 if (!data) { 1462 if (!data) {
1446 mark_inode_dirty_sync(inode);
1447 status = -ENOMEM; 1463 status = -ENOMEM;
1448 goto out; 1464 goto out;
1449 } 1465 }
1450 1466
1467 if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
1468 goto out_free;
1469
1470 if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
1471 if (!sync) {
1472 status = -EAGAIN;
1473 goto out_free;
1474 }
1475 status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
1476 nfs_wait_bit_killable, TASK_KILLABLE);
1477 if (status)
1478 goto out_free;
1479 }
1480
1451 INIT_LIST_HEAD(&data->lseg_list); 1481 INIT_LIST_HEAD(&data->lseg_list);
1452 spin_lock(&inode->i_lock); 1482 spin_lock(&inode->i_lock);
1453 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1483 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1484 clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
1454 spin_unlock(&inode->i_lock); 1485 spin_unlock(&inode->i_lock);
1455 kfree(data); 1486 wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
1456 goto out; 1487 goto out_free;
1457 } 1488 }
1458 1489
1459 pnfs_list_write_lseg(inode, &data->lseg_list); 1490 pnfs_list_write_lseg(inode, &data->lseg_list);
@@ -1475,6 +1506,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1475 1506
1476 status = nfs4_proc_layoutcommit(data, sync); 1507 status = nfs4_proc_layoutcommit(data, sync);
1477out: 1508out:
1509 if (status)
1510 mark_inode_dirty_sync(inode);
1478 dprintk("<-- %s status %d\n", __func__, status); 1511 dprintk("<-- %s status %d\n", __func__, status);
1479 return status; 1512 return status;
1513out_free:
1514 kfree(data);
1515 goto out;
1480} 1516}
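
The pnfs_layoutcommit_inode() changes above serialize concurrent layoutcommits on a per-inode flag bit: the first caller sets NFS_INO_LAYOUTCOMMITTING, a later async caller backs off with -EAGAIN, a sync caller sleeps on the bit, and the bit is cleared and waiters woken when there turns out to be nothing to commit (clearing it after a successful commit presumably happens outside this hunk). A reduced sketch of the bit-lock pattern, with made-up helper names:

static int begin_layoutcommit(struct nfs_inode *nfsi, bool sync)
{
	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags))
		return 0;			/* we own the commit now */
	if (!sync)
		return -EAGAIN;			/* someone else is already committing */
	/* sync callers wait for the current owner, then take the bit */
	return wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
				nfs_wait_bit_killable, TASK_KILLABLE);
}

static void end_layoutcommit(struct nfs_inode *nfsi)
{
	clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
	wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
}
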
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6fda5228ef5..4f359d2a26e 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -28,6 +28,7 @@
28 * such damages. 28 * such damages.
29 */ 29 */
30 30
31#include <linux/export.h>
31#include "pnfs.h" 32#include "pnfs.h"
32 33
33#define NFSDBG_FACILITY NFSDBG_PNFS 34#define NFSDBG_FACILITY NFSDBG_PNFS
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ac40b8535d7..f48125da198 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -710,6 +710,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
710 .dentry_ops = &nfs_dentry_operations, 710 .dentry_ops = &nfs_dentry_operations,
711 .dir_inode_ops = &nfs_dir_inode_operations, 711 .dir_inode_ops = &nfs_dir_inode_operations,
712 .file_inode_ops = &nfs_file_inode_operations, 712 .file_inode_ops = &nfs_file_inode_operations,
713 .file_ops = &nfs_file_operations,
713 .getroot = nfs_proc_get_root, 714 .getroot = nfs_proc_get_root,
714 .getattr = nfs_proc_getattr, 715 .getattr = nfs_proc_getattr,
715 .setattr = nfs_proc_setattr, 716 .setattr = nfs_proc_setattr,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 8b48ec63f72..cfa175c223d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -109,7 +109,7 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
109 } 109 }
110} 110}
111 111
112static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, 112void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
113 struct inode *inode) 113 struct inode *inode)
114{ 114{
115 nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, 115 nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
@@ -534,23 +534,13 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
534static void nfs_readpage_release_full(void *calldata) 534static void nfs_readpage_release_full(void *calldata)
535{ 535{
536 struct nfs_read_data *data = calldata; 536 struct nfs_read_data *data = calldata;
537 struct nfs_pageio_descriptor pgio;
538 537
539 if (data->pnfs_error) {
540 nfs_pageio_init_read_mds(&pgio, data->inode);
541 pgio.pg_recoalesce = 1;
542 }
543 while (!list_empty(&data->pages)) { 538 while (!list_empty(&data->pages)) {
544 struct nfs_page *req = nfs_list_entry(data->pages.next); 539 struct nfs_page *req = nfs_list_entry(data->pages.next);
545 540
546 nfs_list_remove_request(req); 541 nfs_list_remove_request(req);
547 if (!data->pnfs_error) 542 nfs_readpage_release(req);
548 nfs_readpage_release(req);
549 else
550 nfs_pageio_add_request(&pgio, req);
551 } 543 }
552 if (data->pnfs_error)
553 nfs_pageio_complete(&pgio);
554 nfs_readdata_release(calldata); 544 nfs_readdata_release(calldata);
555} 545}
556 546
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 480b3b6bf71..134777406ee 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2787,43 +2787,18 @@ static void nfs_referral_loop_unprotect(void)
2787static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, 2787static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2788 const char *export_path) 2788 const char *export_path)
2789{ 2789{
2790 struct mnt_namespace *ns_private;
2791 struct super_block *s;
2792 struct dentry *dentry; 2790 struct dentry *dentry;
2793 struct path path; 2791 int ret = nfs_referral_loop_protect();
2794 int ret;
2795
2796 ns_private = create_mnt_ns(root_mnt);
2797 ret = PTR_ERR(ns_private);
2798 if (IS_ERR(ns_private))
2799 goto out_mntput;
2800
2801 ret = nfs_referral_loop_protect();
2802 if (ret != 0)
2803 goto out_put_mnt_ns;
2804 2792
2805 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, 2793 if (ret) {
2806 export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); 2794 mntput(root_mnt);
2795 return ERR_PTR(ret);
2796 }
2807 2797
2798 dentry = mount_subtree(root_mnt, export_path);
2808 nfs_referral_loop_unprotect(); 2799 nfs_referral_loop_unprotect();
2809 put_mnt_ns(ns_private);
2810
2811 if (ret != 0)
2812 goto out_err;
2813
2814 s = path.mnt->mnt_sb;
2815 atomic_inc(&s->s_active);
2816 dentry = dget(path.dentry);
2817 2800
2818 path_put(&path);
2819 down_write(&s->s_umount);
2820 return dentry; 2801 return dentry;
2821out_put_mnt_ns:
2822 put_mnt_ns(ns_private);
2823out_mntput:
2824 mntput(root_mnt);
2825out_err:
2826 return ERR_PTR(ret);
2827} 2802}
2828 2803
2829static struct dentry *nfs4_try_mount(int flags, const char *dev_name, 2804static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
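
nfs_follow_remote_path() collapses onto mount_subtree(): the private mount namespace, the vfs_path_lookup() and the manual s_active/s_umount juggling all move into that helper, which walks export_path inside root_mnt and returns the target dentry with its superblock pinned. Judging by the error path above, mount_subtree() also takes over the root_mnt reference, which is why mntput() is only called when the referral protection fails before the helper runs. The resulting shape of the caller, condensed:

static struct dentry *follow_export(struct vfsmount *root_mnt,
				    const char *export_path)
{
	struct dentry *dentry;
	int ret = nfs_referral_loop_protect();

	if (ret) {
		mntput(root_mnt);	/* mount_subtree() never got to consume it */
		return ERR_PTR(ret);
	}
	dentry = mount_subtree(root_mnt, export_path);
	nfs_referral_loop_unprotect();
	return dentry;			/* dentry or ERR_PTR from mount_subtree() */
}
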
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2219c88d96b..1dda78db6a7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -20,6 +20,7 @@
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21#include <linux/nfs_page.h> 21#include <linux/nfs_page.h>
22#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
23#include <linux/export.h>
23 24
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25 26
@@ -1243,7 +1244,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1243{ 1244{
1244 struct nfs_writeargs *argp = &data->args; 1245 struct nfs_writeargs *argp = &data->args;
1245 struct nfs_writeres *resp = &data->res; 1246 struct nfs_writeres *resp = &data->res;
1246 struct nfs_server *server = NFS_SERVER(data->inode);
1247 int status; 1247 int status;
1248 1248
1249 dprintk("NFS: %5u nfs_writeback_done (status %d)\n", 1249 dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
@@ -1277,7 +1277,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1277 if (time_before(complain, jiffies)) { 1277 if (time_before(complain, jiffies)) {
1278 dprintk("NFS: faulty NFS server %s:" 1278 dprintk("NFS: faulty NFS server %s:"
1279 " (committed = %d) != (stable = %d)\n", 1279 " (committed = %d) != (stable = %d)\n",
1280 server->nfs_client->cl_hostname, 1280 NFS_SERVER(data->inode)->nfs_client->cl_hostname,
1281 resp->verf->committed, argp->stable); 1281 resp->verf->committed, argp->stable);
1282 complain = jiffies + 300 * HZ; 1282 complain = jiffies + 300 * HZ;
1283 } 1283 }
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index ad88f1c0a4c..9c51aff02ae 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,6 +36,7 @@
36 36
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/nfs_fs.h> 38#include <linux/nfs_fs.h>
39#include <linux/export.h>
39#include "acl.h" 40#include "acl.h"
40 41
41 42
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index db34a585e11..c45a2ea4a09 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -13,6 +13,7 @@
13#include <linux/sunrpc/clnt.h> 13#include <linux/sunrpc/clnt.h>
14#include <linux/sunrpc/gss_api.h> 14#include <linux/sunrpc/gss_api.h>
15#include <linux/sunrpc/gss_krb5_enctypes.h> 15#include <linux/sunrpc/gss_krb5_enctypes.h>
16#include <linux/module.h>
16 17
17#include "idmap.h" 18#include "idmap.h"
18#include "nfsd.h" 19#include "nfsd.h"
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index dc5a1bf476b..eda7d7e55e0 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/freezer.h> 10#include <linux/freezer.h>
11#include <linux/module.h>
11#include <linux/fs_struct.h> 12#include <linux/fs_struct.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13 14
@@ -256,6 +257,8 @@ static void nfsd_last_thread(struct svc_serv *serv)
256 nfsd_serv = NULL; 257 nfsd_serv = NULL;
257 nfsd_shutdown(); 258 nfsd_shutdown();
258 259
260 svc_rpcb_cleanup(serv);
261
259 printk(KERN_WARNING "nfsd: last server has exited, flushing export " 262 printk(KERN_WARNING "nfsd: last server has exited, flushing export "
260 "cache\n"); 263 "cache\n");
261 nfsd_export_flush(); 264 nfsd_export_flush();
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ed553c60de8..3165aebb43c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5699,7 +5699,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5699 OCFS2_JOURNAL_ACCESS_WRITE); 5699 OCFS2_JOURNAL_ACCESS_WRITE);
5700 if (ret) { 5700 if (ret) {
5701 mlog_errno(ret); 5701 mlog_errno(ret);
5702 goto out; 5702 goto out_commit;
5703 } 5703 }
5704 5704
5705 dquot_free_space_nodirty(inode, 5705 dquot_free_space_nodirty(inode,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c1efe939c77..78b68af3b0e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -290,7 +290,15 @@ static int ocfs2_readpage(struct file *file, struct page *page)
290 } 290 }
291 291
292 if (down_read_trylock(&oi->ip_alloc_sem) == 0) { 292 if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
293 /*
294 * Unlock the page and cycle ip_alloc_sem so that we don't
295 * busyloop waiting for ip_alloc_sem to unlock
296 */
293 ret = AOP_TRUNCATED_PAGE; 297 ret = AOP_TRUNCATED_PAGE;
298 unlock_page(page);
299 unlock = 0;
300 down_read(&oi->ip_alloc_sem);
301 up_read(&oi->ip_alloc_sem);
294 goto out_inode_unlock; 302 goto out_inode_unlock;
295 } 303 }
296 304
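
The ocfs2_readpage() hunk is a small anti-busy-loop trick: when down_read_trylock(&oi->ip_alloc_sem) fails, returning AOP_TRUNCATED_PAGE makes the VM retry ->readpage, and against a long-held write lock that retry can spin. Taking and immediately releasing the semaphore first puts the reader to sleep until the writer is done, so the retry is likely to succeed. A stripped-down sketch of the pattern (names hypothetical, the real function does more):

static int my_readpage(struct file *file, struct page *page)
{
	struct my_inode_info *oi = MY_I(page->mapping->host);	/* hypothetical */

	if (!down_read_trylock(&oi->ip_alloc_sem)) {
		unlock_page(page);
		/* sleep until the current writer drops the lock ... */
		down_read(&oi->ip_alloc_sem);
		up_read(&oi->ip_alloc_sem);
		/* ... then ask the VM to call us again */
		return AOP_TRUNCATED_PAGE;
	}

	/* ... normal read path under ip_alloc_sem (reads the page and
	 * unlocks it on completion) ... */

	up_read(&oi->ip_alloc_sem);
	return 0;
}
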
@@ -563,6 +571,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
563{ 571{
564 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 572 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
565 int level; 573 int level;
574 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
566 575
567 /* this io's submitter should not have unlocked this before we could */ 576 /* this io's submitter should not have unlocked this before we could */
568 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 577 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -570,6 +579,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
570 if (ocfs2_iocb_is_sem_locked(iocb)) 579 if (ocfs2_iocb_is_sem_locked(iocb))
571 ocfs2_iocb_clear_sem_locked(iocb); 580 ocfs2_iocb_clear_sem_locked(iocb);
572 581
582 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
583 ocfs2_iocb_clear_unaligned_aio(iocb);
584
585 if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
586 waitqueue_active(wq)) {
587 wake_up_all(wq);
588 }
589 }
590
573 ocfs2_iocb_clear_rw_locked(iocb); 591 ocfs2_iocb_clear_rw_locked(iocb);
574 592
575 level = ocfs2_iocb_rw_locked_level(iocb); 593 level = ocfs2_iocb_rw_locked_level(iocb);
@@ -863,6 +881,12 @@ struct ocfs2_write_ctxt {
863 struct page *w_target_page; 881 struct page *w_target_page;
864 882
865 /* 883 /*
 884 * w_target_locked is used in the page_mkwrite path to indicate that
 885 * w_target_page is not to be unlocked in ocfs2_write_end_nolock.
886 */
887 unsigned int w_target_locked:1;
888
889 /*
866 * ocfs2_write_end() uses this to know what the real range to 890 * ocfs2_write_end() uses this to know what the real range to
867 * write in the target should be. 891 * write in the target should be.
868 */ 892 */
@@ -895,6 +919,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
895 919
896static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 920static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
897{ 921{
922 int i;
923
924 /*
925 * w_target_locked is only set to true in the page_mkwrite() case.
926 * The intent is to allow us to lock the target page from write_begin()
927 * to write_end(). The caller must hold a ref on w_target_page.
928 */
929 if (wc->w_target_locked) {
930 BUG_ON(!wc->w_target_page);
931 for (i = 0; i < wc->w_num_pages; i++) {
932 if (wc->w_target_page == wc->w_pages[i]) {
933 wc->w_pages[i] = NULL;
934 break;
935 }
936 }
937 mark_page_accessed(wc->w_target_page);
938 page_cache_release(wc->w_target_page);
939 }
898 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); 940 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
899 941
900 brelse(wc->w_di_bh); 942 brelse(wc->w_di_bh);
@@ -1132,20 +1174,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1132 */ 1174 */
1133 lock_page(mmap_page); 1175 lock_page(mmap_page);
1134 1176
1177 /* Exit and let the caller retry */
1135 if (mmap_page->mapping != mapping) { 1178 if (mmap_page->mapping != mapping) {
1179 WARN_ON(mmap_page->mapping);
1136 unlock_page(mmap_page); 1180 unlock_page(mmap_page);
1137 /* 1181 ret = -EAGAIN;
1138 * Sanity check - the locking in
1139 * ocfs2_pagemkwrite() should ensure
1140 * that this code doesn't trigger.
1141 */
1142 ret = -EINVAL;
1143 mlog_errno(ret);
1144 goto out; 1182 goto out;
1145 } 1183 }
1146 1184
1147 page_cache_get(mmap_page); 1185 page_cache_get(mmap_page);
1148 wc->w_pages[i] = mmap_page; 1186 wc->w_pages[i] = mmap_page;
1187 wc->w_target_locked = true;
1149 } else { 1188 } else {
1150 wc->w_pages[i] = find_or_create_page(mapping, index, 1189 wc->w_pages[i] = find_or_create_page(mapping, index,
1151 GFP_NOFS); 1190 GFP_NOFS);
@@ -1160,6 +1199,8 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1160 wc->w_target_page = wc->w_pages[i]; 1199 wc->w_target_page = wc->w_pages[i];
1161 } 1200 }
1162out: 1201out:
1202 if (ret)
1203 wc->w_target_locked = false;
1163 return ret; 1204 return ret;
1164} 1205}
1165 1206
@@ -1817,11 +1858,23 @@ try_again:
1817 */ 1858 */
1818 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, 1859 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
1819 cluster_of_pages, mmap_page); 1860 cluster_of_pages, mmap_page);
1820 if (ret) { 1861 if (ret && ret != -EAGAIN) {
1821 mlog_errno(ret); 1862 mlog_errno(ret);
1822 goto out_quota; 1863 goto out_quota;
1823 } 1864 }
1824 1865
1866 /*
1867 * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
1868 * the target page. In this case, we exit with no error and no target
1869 * page. This will trigger the caller, page_mkwrite(), to re-try
1870 * the operation.
1871 */
1872 if (ret == -EAGAIN) {
1873 BUG_ON(wc->w_target_page);
1874 ret = 0;
1875 goto out_quota;
1876 }
1877
1825 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, 1878 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1826 len); 1879 len);
1827 if (ret) { 1880 if (ret) {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 75cf3ad987a..ffb2da370a9 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits {
78 OCFS2_IOCB_RW_LOCK = 0, 78 OCFS2_IOCB_RW_LOCK = 0,
79 OCFS2_IOCB_RW_LOCK_LEVEL, 79 OCFS2_IOCB_RW_LOCK_LEVEL,
80 OCFS2_IOCB_SEM, 80 OCFS2_IOCB_SEM,
81 OCFS2_IOCB_UNALIGNED_IO,
81 OCFS2_IOCB_NUM_LOCKS 82 OCFS2_IOCB_NUM_LOCKS
82}; 83};
83 84
@@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits {
91 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) 92 clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
92#define ocfs2_iocb_is_sem_locked(iocb) \ 93#define ocfs2_iocb_is_sem_locked(iocb) \
93 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) 94 test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
95
96#define ocfs2_iocb_set_unaligned_aio(iocb) \
97 set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
98#define ocfs2_iocb_clear_unaligned_aio(iocb) \
99 clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
100#define ocfs2_iocb_is_unaligned_aio(iocb) \
101 test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
102
103#define OCFS2_IOEND_WQ_HASH_SZ 37
104#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\
105 OCFS2_IOEND_WQ_HASH_SZ])
106extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
107
94#endif /* OCFS2_FILE_H */ 108#endif /* OCFS2_FILE_H */
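
The new OCFS2_IOCB_UNALIGNED_IO bit and the ocfs2_ioend_wq() macro support the unaligned-AIO accounting used in ocfs2_dio_end_io() above: the inode pointer is hashed onto one of 37 shared wait queues, a per-inode counter (ip_unaligned_aio) tracks in-flight unaligned requests, and the last completion wakes any waiter. The completion side is in the aops.c hunk; a waiter would look roughly like this (the waiter is an assumption, it is not part of this diff):

	/* completion side, as in ocfs2_dio_end_io() */
	if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
	    waitqueue_active(ocfs2_ioend_wq(inode)))
		wake_up_all(ocfs2_ioend_wq(inode));

	/* assumed waiter side: block until all unaligned AIO against the
	 * inode has drained */
	wait_event(*ocfs2_ioend_wq(inode),
		   atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0);
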
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9a3e6bbff27..a4e855e3690 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -216,6 +216,7 @@ struct o2hb_region {
216 216
217 struct list_head hr_all_item; 217 struct list_head hr_all_item;
218 unsigned hr_unclean_stop:1, 218 unsigned hr_unclean_stop:1,
219 hr_aborted_start:1,
219 hr_item_pinned:1, 220 hr_item_pinned:1,
220 hr_item_dropped:1; 221 hr_item_dropped:1;
221 222
@@ -254,6 +255,10 @@ struct o2hb_region {
254 * a more complete api that doesn't lead to this sort of fragility. */ 255 * a more complete api that doesn't lead to this sort of fragility. */
255 atomic_t hr_steady_iterations; 256 atomic_t hr_steady_iterations;
256 257
258 /* terminate o2hb thread if it does not reach steady state
259 * (hr_steady_iterations == 0) within hr_unsteady_iterations */
260 atomic_t hr_unsteady_iterations;
261
257 char hr_dev_name[BDEVNAME_SIZE]; 262 char hr_dev_name[BDEVNAME_SIZE];
258 263
259 unsigned int hr_timeout_ms; 264 unsigned int hr_timeout_ms;
@@ -324,6 +329,10 @@ static void o2hb_write_timeout(struct work_struct *work)
324 329
325static void o2hb_arm_write_timeout(struct o2hb_region *reg) 330static void o2hb_arm_write_timeout(struct o2hb_region *reg)
326{ 331{
332 /* Arm writeout only after thread reaches steady state */
333 if (atomic_read(&reg->hr_steady_iterations) != 0)
334 return;
335
327 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", 336 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
328 O2HB_MAX_WRITE_TIMEOUT_MS); 337 O2HB_MAX_WRITE_TIMEOUT_MS);
329 338
@@ -537,9 +546,14 @@ static int o2hb_verify_crc(struct o2hb_region *reg,
537 return read == computed; 546 return read == computed;
538} 547}
539 548
540/* We want to make sure that nobody is heartbeating on top of us -- 549/*
541 * this will help detect an invalid configuration. */ 550 * Compare the slot data with what we wrote in the last iteration.
542static void o2hb_check_last_timestamp(struct o2hb_region *reg) 551 * If the match fails, print an appropriate error message. This is to
552 * detect errors like... another node hearting on the same slot,
553 * flaky device that is losing writes, etc.
554 * Returns 1 if check succeeds, 0 otherwise.
555 */
556static int o2hb_check_own_slot(struct o2hb_region *reg)
543{ 557{
544 struct o2hb_disk_slot *slot; 558 struct o2hb_disk_slot *slot;
545 struct o2hb_disk_heartbeat_block *hb_block; 559 struct o2hb_disk_heartbeat_block *hb_block;
@@ -548,13 +562,13 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
548 slot = &reg->hr_slots[o2nm_this_node()]; 562 slot = &reg->hr_slots[o2nm_this_node()];
549 /* Don't check on our 1st timestamp */ 563 /* Don't check on our 1st timestamp */
550 if (!slot->ds_last_time) 564 if (!slot->ds_last_time)
551 return; 565 return 0;
552 566
553 hb_block = slot->ds_raw_block; 567 hb_block = slot->ds_raw_block;
554 if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && 568 if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
555 le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && 569 le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
556 hb_block->hb_node == slot->ds_node_num) 570 hb_block->hb_node == slot->ds_node_num)
557 return; 571 return 1;
558 572
559#define ERRSTR1 "Another node is heartbeating on device" 573#define ERRSTR1 "Another node is heartbeating on device"
560#define ERRSTR2 "Heartbeat generation mismatch on device" 574#define ERRSTR2 "Heartbeat generation mismatch on device"
@@ -574,6 +588,8 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
574 (unsigned long long)slot->ds_last_time, hb_block->hb_node, 588 (unsigned long long)slot->ds_last_time, hb_block->hb_node,
575 (unsigned long long)le64_to_cpu(hb_block->hb_generation), 589 (unsigned long long)le64_to_cpu(hb_block->hb_generation),
576 (unsigned long long)le64_to_cpu(hb_block->hb_seq)); 590 (unsigned long long)le64_to_cpu(hb_block->hb_seq));
591
592 return 0;
577} 593}
578 594
579static inline void o2hb_prepare_block(struct o2hb_region *reg, 595static inline void o2hb_prepare_block(struct o2hb_region *reg,
@@ -719,17 +735,24 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
719 o2nm_node_put(node); 735 o2nm_node_put(node);
720} 736}
721 737
722static void o2hb_set_quorum_device(struct o2hb_region *reg, 738static void o2hb_set_quorum_device(struct o2hb_region *reg)
723 struct o2hb_disk_slot *slot)
724{ 739{
725 assert_spin_locked(&o2hb_live_lock);
726
727 if (!o2hb_global_heartbeat_active()) 740 if (!o2hb_global_heartbeat_active())
728 return; 741 return;
729 742
730 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) 743 /* Prevent race with o2hb_heartbeat_group_drop_item() */
744 if (kthread_should_stop())
745 return;
746
747 /* Tag region as quorum only after thread reaches steady state */
748 if (atomic_read(&reg->hr_steady_iterations) != 0)
731 return; 749 return;
732 750
751 spin_lock(&o2hb_live_lock);
752
753 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
754 goto unlock;
755
733 /* 756 /*
734 * A region can be added to the quorum only when it sees all 757 * A region can be added to the quorum only when it sees all
735 * live nodes heartbeat on it. In other words, the region has been 758 * live nodes heartbeat on it. In other words, the region has been
@@ -737,13 +760,10 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
737 */ 760 */
738 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, 761 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
739 sizeof(o2hb_live_node_bitmap))) 762 sizeof(o2hb_live_node_bitmap)))
740 return; 763 goto unlock;
741
742 if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
743 return;
744 764
745 printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", 765 printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
746 config_item_name(&reg->hr_item)); 766 config_item_name(&reg->hr_item), reg->hr_dev_name);
747 767
748 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); 768 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
749 769
@@ -754,6 +774,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
754 if (o2hb_pop_count(&o2hb_quorum_region_bitmap, 774 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
755 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) 775 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
756 o2hb_region_unpin(NULL); 776 o2hb_region_unpin(NULL);
777unlock:
778 spin_unlock(&o2hb_live_lock);
757} 779}
758 780
759static int o2hb_check_slot(struct o2hb_region *reg, 781static int o2hb_check_slot(struct o2hb_region *reg,
@@ -925,8 +947,6 @@ fire_callbacks:
925 slot->ds_equal_samples = 0; 947 slot->ds_equal_samples = 0;
926 } 948 }
927out: 949out:
928 o2hb_set_quorum_device(reg, slot);
929
930 spin_unlock(&o2hb_live_lock); 950 spin_unlock(&o2hb_live_lock);
931 951
932 o2hb_run_event_list(&event); 952 o2hb_run_event_list(&event);
@@ -957,7 +977,8 @@ static int o2hb_highest_node(unsigned long *nodes,
957 977
958static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) 978static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
959{ 979{
960 int i, ret, highest_node, change = 0; 980 int i, ret, highest_node;
981 int membership_change = 0, own_slot_ok = 0;
961 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 982 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
962 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 983 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
963 struct o2hb_bio_wait_ctxt write_wc; 984 struct o2hb_bio_wait_ctxt write_wc;
@@ -966,7 +987,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
966 sizeof(configured_nodes)); 987 sizeof(configured_nodes));
967 if (ret) { 988 if (ret) {
968 mlog_errno(ret); 989 mlog_errno(ret);
969 return ret; 990 goto bail;
970 } 991 }
971 992
972 /* 993 /*
@@ -982,8 +1003,9 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
982 1003
983 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); 1004 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
984 if (highest_node >= O2NM_MAX_NODES) { 1005 if (highest_node >= O2NM_MAX_NODES) {
985 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); 1006 mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
986 return -EINVAL; 1007 ret = -EINVAL;
1008 goto bail;
987 } 1009 }
988 1010
989 /* No sense in reading the slots of nodes that don't exist 1011 /* No sense in reading the slots of nodes that don't exist
@@ -993,29 +1015,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
993 ret = o2hb_read_slots(reg, highest_node + 1); 1015 ret = o2hb_read_slots(reg, highest_node + 1);
994 if (ret < 0) { 1016 if (ret < 0) {
995 mlog_errno(ret); 1017 mlog_errno(ret);
996 return ret; 1018 goto bail;
997 } 1019 }
998 1020
999 /* With an up to date view of the slots, we can check that no 1021 /* With an up to date view of the slots, we can check that no
1000 * other node has been improperly configured to heartbeat in 1022 * other node has been improperly configured to heartbeat in
1001 * our slot. */ 1023 * our slot. */
1002 o2hb_check_last_timestamp(reg); 1024 own_slot_ok = o2hb_check_own_slot(reg);
1003 1025
1004 /* fill in the proper info for our next heartbeat */ 1026 /* fill in the proper info for our next heartbeat */
1005 o2hb_prepare_block(reg, reg->hr_generation); 1027 o2hb_prepare_block(reg, reg->hr_generation);
1006 1028
1007 /* And fire off the write. Note that we don't wait on this I/O
1008 * until later. */
1009 ret = o2hb_issue_node_write(reg, &write_wc); 1029 ret = o2hb_issue_node_write(reg, &write_wc);
1010 if (ret < 0) { 1030 if (ret < 0) {
1011 mlog_errno(ret); 1031 mlog_errno(ret);
1012 return ret; 1032 goto bail;
1013 } 1033 }
1014 1034
1015 i = -1; 1035 i = -1;
1016 while((i = find_next_bit(configured_nodes, 1036 while((i = find_next_bit(configured_nodes,
1017 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { 1037 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
1018 change |= o2hb_check_slot(reg, &reg->hr_slots[i]); 1038 membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
1019 } 1039 }
1020 1040
1021 /* 1041 /*
@@ -1030,18 +1050,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
1030 * disk */ 1050 * disk */
1031 mlog(ML_ERROR, "Write error %d on device \"%s\"\n", 1051 mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
1032 write_wc.wc_error, reg->hr_dev_name); 1052 write_wc.wc_error, reg->hr_dev_name);
1033 return write_wc.wc_error; 1053 ret = write_wc.wc_error;
1054 goto bail;
1034 } 1055 }
1035 1056
1036 o2hb_arm_write_timeout(reg); 1057 /* Skip disarming the timeout if own slot has stale/bad data */
1058 if (own_slot_ok) {
1059 o2hb_set_quorum_device(reg);
1060 o2hb_arm_write_timeout(reg);
1061 }
1037 1062
1063bail:
1038 /* let the person who launched us know when things are steady */ 1064 /* let the person who launched us know when things are steady */
1039 if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) { 1065 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1040 if (atomic_dec_and_test(&reg->hr_steady_iterations)) 1066 if (!ret && own_slot_ok && !membership_change) {
1067 if (atomic_dec_and_test(&reg->hr_steady_iterations))
1068 wake_up(&o2hb_steady_queue);
1069 }
1070 }
1071
1072 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1073 if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
1074 printk(KERN_NOTICE "o2hb: Unable to stabilize "
1075 "heartbeart on region %s (%s)\n",
1076 config_item_name(&reg->hr_item),
1077 reg->hr_dev_name);
1078 atomic_set(&reg->hr_steady_iterations, 0);
1079 reg->hr_aborted_start = 1;
1041 wake_up(&o2hb_steady_queue); 1080 wake_up(&o2hb_steady_queue);
1081 ret = -EIO;
1082 }
1042 } 1083 }
1043 1084
1044 return 0; 1085 return ret;
1045} 1086}
1046 1087
1047/* Subtract b from a, storing the result in a. a *must* have a larger 1088/* Subtract b from a, storing the result in a. a *must* have a larger
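
The rewritten tail of o2hb_do_disk_heartbeat() replaces the old "retry twice" loop in o2hb_thread() with two countdowns: hr_steady_iterations only ticks down on a fully clean pass (no I/O error, own slot verified, no membership change) and wakes the mount path when it reaches zero, while hr_unsteady_iterations ticks down on every pass that leaves the region still unsteady and, once exhausted, aborts the start via hr_aborted_start. Condensed:

	if (atomic_read(&reg->hr_steady_iterations) != 0) {
		/* a clean pass brings us one step closer to "steady" */
		if (!ret && own_slot_ok && !membership_change &&
		    atomic_dec_and_test(&reg->hr_steady_iterations))
			wake_up(&o2hb_steady_queue);
	}

	if (atomic_read(&reg->hr_steady_iterations) != 0) {
		/* still not steady: burn one credit, abort when they run out */
		if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
			atomic_set(&reg->hr_steady_iterations, 0);
			reg->hr_aborted_start = 1;
			wake_up(&o2hb_steady_queue);
			ret = -EIO;
		}
	}
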
@@ -1095,7 +1136,8 @@ static int o2hb_thread(void *data)
1095 /* Pin node */ 1136 /* Pin node */
1096 o2nm_depend_this_node(); 1137 o2nm_depend_this_node();
1097 1138
1098 while (!kthread_should_stop() && !reg->hr_unclean_stop) { 1139 while (!kthread_should_stop() &&
1140 !reg->hr_unclean_stop && !reg->hr_aborted_start) {
1099 /* We track the time spent inside 1141 /* We track the time spent inside
1100 * o2hb_do_disk_heartbeat so that we avoid more than 1142 * o2hb_do_disk_heartbeat so that we avoid more than
1101 * hr_timeout_ms between disk writes. On busy systems 1143 * hr_timeout_ms between disk writes. On busy systems
@@ -1103,10 +1145,7 @@ static int o2hb_thread(void *data)
1103 * likely to time itself out. */ 1145 * likely to time itself out. */
1104 do_gettimeofday(&before_hb); 1146 do_gettimeofday(&before_hb);
1105 1147
1106 i = 0; 1148 ret = o2hb_do_disk_heartbeat(reg);
1107 do {
1108 ret = o2hb_do_disk_heartbeat(reg);
1109 } while (ret && ++i < 2);
1110 1149
1111 do_gettimeofday(&after_hb); 1150 do_gettimeofday(&after_hb);
1112 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); 1151 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
@@ -1117,7 +1156,8 @@ static int o2hb_thread(void *data)
1117 after_hb.tv_sec, (unsigned long) after_hb.tv_usec, 1156 after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
1118 elapsed_msec); 1157 elapsed_msec);
1119 1158
1120 if (elapsed_msec < reg->hr_timeout_ms) { 1159 if (!kthread_should_stop() &&
1160 elapsed_msec < reg->hr_timeout_ms) {
1121 /* the kthread api has blocked signals for us so no 1161 /* the kthread api has blocked signals for us so no
1122 * need to record the return value. */ 1162 * need to record the return value. */
1123 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); 1163 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
@@ -1134,20 +1174,20 @@ static int o2hb_thread(void *data)
1134 * to timeout on this region when we could just as easily 1174 * to timeout on this region when we could just as easily
1135 * write a clear generation - thus indicating to them that 1175 * write a clear generation - thus indicating to them that
1136 * this node has left this region. 1176 * this node has left this region.
1137 * 1177 */
1138 * XXX: Should we skip this on unclean_stop? */ 1178 if (!reg->hr_unclean_stop && !reg->hr_aborted_start) {
1139 o2hb_prepare_block(reg, 0); 1179 o2hb_prepare_block(reg, 0);
1140 ret = o2hb_issue_node_write(reg, &write_wc); 1180 ret = o2hb_issue_node_write(reg, &write_wc);
1141 if (ret == 0) { 1181 if (ret == 0)
1142 o2hb_wait_on_io(reg, &write_wc); 1182 o2hb_wait_on_io(reg, &write_wc);
1143 } else { 1183 else
1144 mlog_errno(ret); 1184 mlog_errno(ret);
1145 } 1185 }
1146 1186
1147 /* Unpin node */ 1187 /* Unpin node */
1148 o2nm_undepend_this_node(); 1188 o2nm_undepend_this_node();
1149 1189
1150 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); 1190 mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n");
1151 1191
1152 return 0; 1192 return 0;
1153} 1193}
@@ -1158,6 +1198,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
1158 struct o2hb_debug_buf *db = inode->i_private; 1198 struct o2hb_debug_buf *db = inode->i_private;
1159 struct o2hb_region *reg; 1199 struct o2hb_region *reg;
1160 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 1200 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
1201 unsigned long lts;
1161 char *buf = NULL; 1202 char *buf = NULL;
1162 int i = -1; 1203 int i = -1;
1163 int out = 0; 1204 int out = 0;
@@ -1194,9 +1235,11 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
1194 1235
1195 case O2HB_DB_TYPE_REGION_ELAPSED_TIME: 1236 case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
1196 reg = (struct o2hb_region *)db->db_data; 1237 reg = (struct o2hb_region *)db->db_data;
1197 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", 1238 lts = reg->hr_last_timeout_start;
1198 jiffies_to_msecs(jiffies - 1239 /* If 0, it has never been set before */
1199 reg->hr_last_timeout_start)); 1240 if (lts)
1241 lts = jiffies_to_msecs(jiffies - lts);
1242 out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
1200 goto done; 1243 goto done;
1201 1244
1202 case O2HB_DB_TYPE_REGION_PINNED: 1245 case O2HB_DB_TYPE_REGION_PINNED:
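The lts guard above keeps the debugfs file from reporting a bogus elapsed time before the first write timeout has ever been armed (hr_last_timeout_start stays 0 until then); in isolation the pattern is simply:

	unsigned long lts = reg->hr_last_timeout_start;	/* 0 until the first arm */

	if (lts)
		lts = jiffies_to_msecs(jiffies - lts);
	out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);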
@@ -1426,6 +1469,8 @@ static void o2hb_region_release(struct config_item *item)
1426 struct page *page; 1469 struct page *page;
1427 struct o2hb_region *reg = to_o2hb_region(item); 1470 struct o2hb_region *reg = to_o2hb_region(item);
1428 1471
1472 mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
1473
1429 if (reg->hr_tmp_block) 1474 if (reg->hr_tmp_block)
1430 kfree(reg->hr_tmp_block); 1475 kfree(reg->hr_tmp_block);
1431 1476
@@ -1792,7 +1837,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1792 live_threshold <<= 1; 1837 live_threshold <<= 1;
1793 spin_unlock(&o2hb_live_lock); 1838 spin_unlock(&o2hb_live_lock);
1794 } 1839 }
1795 atomic_set(&reg->hr_steady_iterations, live_threshold + 1); 1840 ++live_threshold;
1841 atomic_set(&reg->hr_steady_iterations, live_threshold);
1842 /* unsteady_iterations is double the steady_iterations */
1843 atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
1796 1844
1797 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", 1845 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
1798 reg->hr_item.ci_name); 1846 reg->hr_item.ci_name);
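To make the start budget concrete (numbers are illustrative only): if live_threshold was doubled to 4 by the branch above, the increment brings it to 5, so the region needs 5 clean heartbeat iterations to be declared steady and the thread gives up after 10:

	++live_threshold;						/* 4 -> 5 */
	atomic_set(&reg->hr_steady_iterations, live_threshold);	/* 5 clean writes needed */
	atomic_set(&reg->hr_unsteady_iterations, live_threshold << 1);	/* abort after 10 tries */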
@@ -1809,14 +1857,12 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1809 ret = wait_event_interruptible(o2hb_steady_queue, 1857 ret = wait_event_interruptible(o2hb_steady_queue,
1810 atomic_read(&reg->hr_steady_iterations) == 0); 1858 atomic_read(&reg->hr_steady_iterations) == 0);
1811 if (ret) { 1859 if (ret) {
1812 /* We got interrupted (hello ptrace!). Clean up */ 1860 atomic_set(&reg->hr_steady_iterations, 0);
1813 spin_lock(&o2hb_live_lock); 1861 reg->hr_aborted_start = 1;
1814 hb_task = reg->hr_task; 1862 }
1815 reg->hr_task = NULL;
1816 spin_unlock(&o2hb_live_lock);
1817 1863
1818 if (hb_task) 1864 if (reg->hr_aborted_start) {
1819 kthread_stop(hb_task); 1865 ret = -EIO;
1820 goto out; 1866 goto out;
1821 } 1867 }
1822 1868
@@ -1833,8 +1879,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1833 ret = -EIO; 1879 ret = -EIO;
1834 1880
1835 if (hb_task && o2hb_global_heartbeat_active()) 1881 if (hb_task && o2hb_global_heartbeat_active())
1836 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", 1882 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
1837 config_item_name(&reg->hr_item)); 1883 config_item_name(&reg->hr_item), reg->hr_dev_name);
1838 1884
1839out: 1885out:
1840 if (filp) 1886 if (filp)
@@ -2092,13 +2138,6 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2092 2138
2093 /* stop the thread when the user removes the region dir */ 2139 /* stop the thread when the user removes the region dir */
2094 spin_lock(&o2hb_live_lock); 2140 spin_lock(&o2hb_live_lock);
2095 if (o2hb_global_heartbeat_active()) {
2096 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2097 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2098 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2099 quorum_region = 1;
2100 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2101 }
2102 hb_task = reg->hr_task; 2141 hb_task = reg->hr_task;
2103 reg->hr_task = NULL; 2142 reg->hr_task = NULL;
2104 reg->hr_item_dropped = 1; 2143 reg->hr_item_dropped = 1;
@@ -2107,19 +2146,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2107 if (hb_task) 2146 if (hb_task)
2108 kthread_stop(hb_task); 2147 kthread_stop(hb_task);
2109 2148
2149 if (o2hb_global_heartbeat_active()) {
2150 spin_lock(&o2hb_live_lock);
2151 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2152 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2153 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2154 quorum_region = 1;
2155 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2156 spin_unlock(&o2hb_live_lock);
2157 printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
2158 ((atomic_read(&reg->hr_steady_iterations) == 0) ?
2159 "stopped" : "start aborted"), config_item_name(item),
2160 reg->hr_dev_name);
2161 }
2162
2110 /* 2163 /*
2111 * If we're racing a dev_write(), we need to wake them. They will 2164 * If we're racing a dev_write(), we need to wake them. They will
2112 * check reg->hr_task 2165 * check reg->hr_task
2113 */ 2166 */
2114 if (atomic_read(&reg->hr_steady_iterations) != 0) { 2167 if (atomic_read(&reg->hr_steady_iterations) != 0) {
2168 reg->hr_aborted_start = 1;
2115 atomic_set(&reg->hr_steady_iterations, 0); 2169 atomic_set(&reg->hr_steady_iterations, 0);
2116 wake_up(&o2hb_steady_queue); 2170 wake_up(&o2hb_steady_queue);
2117 } 2171 }
2118 2172
2119 if (o2hb_global_heartbeat_active())
2120 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2121 config_item_name(&reg->hr_item));
2122
2123 config_item_put(item); 2173 config_item_put(item);
2124 2174
2125 if (!o2hb_global_heartbeat_active() || !quorum_region) 2175 if (!o2hb_global_heartbeat_active() || !quorum_region)
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 3a5835904b3..dc45deb19e6 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -47,6 +47,7 @@
47#define SC_DEBUG_NAME "sock_containers" 47#define SC_DEBUG_NAME "sock_containers"
48#define NST_DEBUG_NAME "send_tracking" 48#define NST_DEBUG_NAME "send_tracking"
49#define STATS_DEBUG_NAME "stats" 49#define STATS_DEBUG_NAME "stats"
50#define NODES_DEBUG_NAME "connected_nodes"
50 51
51#define SHOW_SOCK_CONTAINERS 0 52#define SHOW_SOCK_CONTAINERS 0
52#define SHOW_SOCK_STATS 1 53#define SHOW_SOCK_STATS 1
@@ -55,6 +56,7 @@ static struct dentry *o2net_dentry;
55static struct dentry *sc_dentry; 56static struct dentry *sc_dentry;
56static struct dentry *nst_dentry; 57static struct dentry *nst_dentry;
57static struct dentry *stats_dentry; 58static struct dentry *stats_dentry;
59static struct dentry *nodes_dentry;
58 60
59static DEFINE_SPINLOCK(o2net_debug_lock); 61static DEFINE_SPINLOCK(o2net_debug_lock);
60 62
@@ -491,53 +493,87 @@ static const struct file_operations sc_seq_fops = {
491 .release = sc_fop_release, 493 .release = sc_fop_release,
492}; 494};
493 495
494int o2net_debugfs_init(void) 496static int o2net_fill_bitmap(char *buf, int len)
495{ 497{
496 o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); 498 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
497 if (!o2net_dentry) { 499 int i = -1, out = 0;
498 mlog_errno(-ENOMEM);
499 goto bail;
500 }
501 500
502 nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, 501 o2net_fill_node_map(map, sizeof(map));
503 o2net_dentry, NULL,
504 &nst_seq_fops);
505 if (!nst_dentry) {
506 mlog_errno(-ENOMEM);
507 goto bail;
508 }
509 502
510 sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, 503 while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
511 o2net_dentry, NULL, 504 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
512 &sc_seq_fops); 505 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
513 if (!sc_dentry) {
514 mlog_errno(-ENOMEM);
515 goto bail;
516 }
517 506
518 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, 507 return out;
519 o2net_dentry, NULL, 508}
520 &stats_seq_fops); 509
521 if (!stats_dentry) { 510static int nodes_fop_open(struct inode *inode, struct file *file)
522 mlog_errno(-ENOMEM); 511{
523 goto bail; 512 char *buf;
524 } 513
514 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
515 if (!buf)
516 return -ENOMEM;
517
518 i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
519
520 file->private_data = buf;
525 521
526 return 0; 522 return 0;
527bail:
528 debugfs_remove(stats_dentry);
529 debugfs_remove(sc_dentry);
530 debugfs_remove(nst_dentry);
531 debugfs_remove(o2net_dentry);
532 return -ENOMEM;
533} 523}
534 524
525static int o2net_debug_release(struct inode *inode, struct file *file)
526{
527 kfree(file->private_data);
528 return 0;
529}
530
531static ssize_t o2net_debug_read(struct file *file, char __user *buf,
532 size_t nbytes, loff_t *ppos)
533{
534 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
535 i_size_read(file->f_mapping->host));
536}
537
538static const struct file_operations nodes_fops = {
539 .open = nodes_fop_open,
540 .release = o2net_debug_release,
541 .read = o2net_debug_read,
542 .llseek = generic_file_llseek,
543};
544
535void o2net_debugfs_exit(void) 545void o2net_debugfs_exit(void)
536{ 546{
547 debugfs_remove(nodes_dentry);
537 debugfs_remove(stats_dentry); 548 debugfs_remove(stats_dentry);
538 debugfs_remove(sc_dentry); 549 debugfs_remove(sc_dentry);
539 debugfs_remove(nst_dentry); 550 debugfs_remove(nst_dentry);
540 debugfs_remove(o2net_dentry); 551 debugfs_remove(o2net_dentry);
541} 552}
542 553
554int o2net_debugfs_init(void)
555{
556 mode_t mode = S_IFREG|S_IRUSR;
557
558 o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
559 if (o2net_dentry)
560 nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
561 o2net_dentry, NULL, &nst_seq_fops);
562 if (nst_dentry)
563 sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
564 o2net_dentry, NULL, &sc_seq_fops);
565 if (sc_dentry)
566 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
567 o2net_dentry, NULL, &stats_seq_fops);
568 if (stats_dentry)
569 nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
570 o2net_dentry, NULL, &nodes_fops);
571 if (nodes_dentry)
572 return 0;
573
574 o2net_debugfs_exit();
575 mlog_errno(-ENOMEM);
576 return -ENOMEM;
577}
578
543#endif /* CONFIG_DEBUG_FS */ 579#endif /* CONFIG_DEBUG_FS */
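The rewritten o2net_debugfs_init() above trades the per-file error labels for a chain of guarded creates and a single cleanup path; a stripped-down sketch of the pattern (parent, child and some_fops are placeholder names, not from the patch):

	struct dentry *parent, *child = NULL;

	parent = debugfs_create_dir("o2net", NULL);
	if (parent)
		child = debugfs_create_file("some_file", S_IFREG|S_IRUSR,
					    parent, NULL, &some_fops);
	if (!child) {
		/* debugfs_remove() ignores a NULL dentry, so one exit routine
		 * can tear down whatever subset was actually created */
		debugfs_remove(child);
		debugfs_remove(parent);
		return -ENOMEM;
	}
	return 0;

This is also why o2net_debugfs_exit() can be called unconditionally from the failure path of the new init.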
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index db5ee4b4f47..044e7b58d31 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -59,6 +59,7 @@
59#include <linux/idr.h> 59#include <linux/idr.h>
60#include <linux/kref.h> 60#include <linux/kref.h>
61#include <linux/net.h> 61#include <linux/net.h>
62#include <linux/export.h>
62#include <net/tcp.h> 63#include <net/tcp.h>
63 64
64#include <asm/uaccess.h> 65#include <asm/uaccess.h>
@@ -545,7 +546,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
545 } 546 }
546 547
547 if (was_valid && !valid) { 548 if (was_valid && !valid) {
548 printk(KERN_NOTICE "o2net: no longer connected to " 549 printk(KERN_NOTICE "o2net: No longer connected to "
549 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); 550 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
550 o2net_complete_nodes_nsw(nn); 551 o2net_complete_nodes_nsw(nn);
551 } 552 }
@@ -555,7 +556,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
555 cancel_delayed_work(&nn->nn_connect_expired); 556 cancel_delayed_work(&nn->nn_connect_expired);
556 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", 557 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
557 o2nm_this_node() > sc->sc_node->nd_num ? 558 o2nm_this_node() > sc->sc_node->nd_num ?
558 "connected to" : "accepted connection from", 559 "Connected to" : "Accepted connection from",
559 SC_NODEF_ARGS(sc)); 560 SC_NODEF_ARGS(sc));
560 } 561 }
561 562
@@ -643,7 +644,7 @@ static void o2net_state_change(struct sock *sk)
643 o2net_sc_queue_work(sc, &sc->sc_connect_work); 644 o2net_sc_queue_work(sc, &sc->sc_connect_work);
644 break; 645 break;
645 default: 646 default:
646 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT 647 printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
647 " shutdown, state %d\n", 648 " shutdown, state %d\n",
648 SC_NODEF_ARGS(sc), sk->sk_state); 649 SC_NODEF_ARGS(sc), sk->sk_state);
649 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 650 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
@@ -1034,6 +1035,25 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
1034 return ret; 1035 return ret;
1035} 1036}
1036 1037
 1038/* Get a map of all nodes to which this node is currently connected */
1039void o2net_fill_node_map(unsigned long *map, unsigned bytes)
1040{
1041 struct o2net_sock_container *sc;
1042 int node, ret;
1043
1044 BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
1045
1046 memset(map, 0, bytes);
1047 for (node = 0; node < O2NM_MAX_NODES; ++node) {
1048 o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
1049 if (!ret) {
1050 set_bit(node, map);
1051 sc_put(sc);
1052 }
1053 }
1054}
1055EXPORT_SYMBOL_GPL(o2net_fill_node_map);
1056
1037int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, 1057int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
1038 size_t caller_veclen, u8 target_node, int *status) 1058 size_t caller_veclen, u8 target_node, int *status)
1039{ 1059{
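A minimal caller sketch for the new export (assumption: the caller includes cluster/tcp.h and cluster/nodemanager.h for O2NM_MAX_NODES; the BUG_ON above requires the buffer to cover every node bit):

	unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int node = -1;

	o2net_fill_node_map(map, sizeof(map));
	while ((node = find_next_bit(map, O2NM_MAX_NODES, node + 1)) < O2NM_MAX_NODES)
		printk(KERN_INFO "o2net: connected to node %d\n", node);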
@@ -1284,11 +1304,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1284 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1304 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1285 1305
1286 if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { 1306 if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
1287 mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol " 1307 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net "
1288 "version %llu but %llu is required, disconnecting\n", 1308 "protocol version %llu but %llu is required. "
1289 SC_NODEF_ARGS(sc), 1309 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1290 (unsigned long long)be64_to_cpu(hand->protocol_version), 1310 (unsigned long long)be64_to_cpu(hand->protocol_version),
1291 O2NET_PROTOCOL_VERSION); 1311 O2NET_PROTOCOL_VERSION);
1292 1312
1293 /* don't bother reconnecting if its the wrong version. */ 1313 /* don't bother reconnecting if its the wrong version. */
1294 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1314 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
@@ -1302,33 +1322,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1302 */ 1322 */
1303 if (be32_to_cpu(hand->o2net_idle_timeout_ms) != 1323 if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
1304 o2net_idle_timeout()) { 1324 o2net_idle_timeout()) {
1305 mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " 1325 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network "
1306 "%u ms, but we use %u ms locally. disconnecting\n", 1326 "idle timeout of %u ms, but we use %u ms locally. "
1307 SC_NODEF_ARGS(sc), 1327 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1308 be32_to_cpu(hand->o2net_idle_timeout_ms), 1328 be32_to_cpu(hand->o2net_idle_timeout_ms),
1309 o2net_idle_timeout()); 1329 o2net_idle_timeout());
1310 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1330 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1311 return -1; 1331 return -1;
1312 } 1332 }
1313 1333
1314 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != 1334 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
1315 o2net_keepalive_delay()) { 1335 o2net_keepalive_delay()) {
1316 mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " 1336 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive "
1317 "%u ms, but we use %u ms locally. disconnecting\n", 1337 "delay of %u ms, but we use %u ms locally. "
1318 SC_NODEF_ARGS(sc), 1338 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1319 be32_to_cpu(hand->o2net_keepalive_delay_ms), 1339 be32_to_cpu(hand->o2net_keepalive_delay_ms),
1320 o2net_keepalive_delay()); 1340 o2net_keepalive_delay());
1321 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1341 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1322 return -1; 1342 return -1;
1323 } 1343 }
1324 1344
1325 if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != 1345 if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
1326 O2HB_MAX_WRITE_TIMEOUT_MS) { 1346 O2HB_MAX_WRITE_TIMEOUT_MS) {
1327 mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " 1347 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat "
1328 "%u ms, but we use %u ms locally. disconnecting\n", 1348 "timeout of %u ms, but we use %u ms locally. "
1329 SC_NODEF_ARGS(sc), 1349 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1330 be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), 1350 be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
1331 O2HB_MAX_WRITE_TIMEOUT_MS); 1351 O2HB_MAX_WRITE_TIMEOUT_MS);
1332 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1352 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1333 return -1; 1353 return -1;
1334 } 1354 }
@@ -1539,28 +1559,16 @@ static void o2net_idle_timer(unsigned long data)
1539{ 1559{
1540 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1560 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1541 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1561 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1542
1543#ifdef CONFIG_DEBUG_FS 1562#ifdef CONFIG_DEBUG_FS
1544 ktime_t now = ktime_get(); 1563 unsigned long msecs = ktime_to_ms(ktime_get()) -
1564 ktime_to_ms(sc->sc_tv_timer);
1565#else
1566 unsigned long msecs = o2net_idle_timeout();
1545#endif 1567#endif
1546 1568
1547 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1569 printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
1548 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1570 "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
1549 o2net_idle_timeout() / 1000, 1571 msecs / 1000, msecs % 1000);
1550 o2net_idle_timeout() % 1000);
1551
1552#ifdef CONFIG_DEBUG_FS
1553 mlog(ML_NOTICE, "Here are some times that might help debug the "
1554 "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
1555 "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
1556 (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
1557 (long long)ktime_to_us(sc->sc_tv_data_ready),
1558 (long long)ktime_to_us(sc->sc_tv_advance_start),
1559 (long long)ktime_to_us(sc->sc_tv_advance_stop),
1560 sc->sc_msg_key, sc->sc_msg_type,
1561 (long long)ktime_to_us(sc->sc_tv_func_start),
1562 (long long)ktime_to_us(sc->sc_tv_func_stop));
1563#endif
1564 1572
1565 /* 1573 /*
1566 * Initialize the nn_timeout so that the next connection attempt 1574 * Initialize the nn_timeout so that the next connection attempt
@@ -1693,8 +1701,8 @@ static void o2net_start_connect(struct work_struct *work)
1693 1701
1694out: 1702out:
1695 if (ret) { 1703 if (ret) {
1696 mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " 1704 printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
1697 "with errno %d\n", SC_NODEF_ARGS(sc), ret); 1705 " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
1698 /* 0 err so that another will be queued and attempted 1706 /* 0 err so that another will be queued and attempted
1699 * from set_nn_state */ 1707 * from set_nn_state */
1700 if (sc) 1708 if (sc)
@@ -1717,8 +1725,8 @@ static void o2net_connect_expired(struct work_struct *work)
1717 1725
1718 spin_lock(&nn->nn_lock); 1726 spin_lock(&nn->nn_lock);
1719 if (!nn->nn_sc_valid) { 1727 if (!nn->nn_sc_valid) {
1720 mlog(ML_ERROR, "no connection established with node %u after " 1728 printk(KERN_NOTICE "o2net: No connection established with "
1721 "%u.%u seconds, giving up and returning errors.\n", 1729 "node %u after %u.%u seconds, giving up.\n",
1722 o2net_num_from_nn(nn), 1730 o2net_num_from_nn(nn),
1723 o2net_idle_timeout() / 1000, 1731 o2net_idle_timeout() / 1000,
1724 o2net_idle_timeout() % 1000); 1732 o2net_idle_timeout() % 1000);
@@ -1861,21 +1869,21 @@ static int o2net_accept_one(struct socket *sock)
1861 1869
1862 node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); 1870 node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
1863 if (node == NULL) { 1871 if (node == NULL) {
1864 mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n", 1872 printk(KERN_NOTICE "o2net: Attempt to connect from unknown "
1865 &sin.sin_addr.s_addr, ntohs(sin.sin_port)); 1873 "node at %pI4:%d\n", &sin.sin_addr.s_addr,
1874 ntohs(sin.sin_port));
1866 ret = -EINVAL; 1875 ret = -EINVAL;
1867 goto out; 1876 goto out;
1868 } 1877 }
1869 1878
1870 if (o2nm_this_node() >= node->nd_num) { 1879 if (o2nm_this_node() >= node->nd_num) {
1871 local_node = o2nm_get_node_by_num(o2nm_this_node()); 1880 local_node = o2nm_get_node_by_num(o2nm_this_node());
1872 mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' (" 1881 printk(KERN_NOTICE "o2net: Unexpected connect attempt seen "
1873 "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n", 1882 "at node '%s' (%u, %pI4:%d) from node '%s' (%u, "
1874 local_node->nd_name, local_node->nd_num, 1883 "%pI4:%d)\n", local_node->nd_name, local_node->nd_num,
1875 &(local_node->nd_ipv4_address), 1884 &(local_node->nd_ipv4_address),
1876 ntohs(local_node->nd_ipv4_port), 1885 ntohs(local_node->nd_ipv4_port), node->nd_name,
1877 node->nd_name, node->nd_num, &sin.sin_addr.s_addr, 1886 node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port));
1878 ntohs(sin.sin_port));
1879 ret = -EINVAL; 1887 ret = -EINVAL;
1880 goto out; 1888 goto out;
1881 } 1889 }
@@ -1900,10 +1908,10 @@ static int o2net_accept_one(struct socket *sock)
1900 ret = 0; 1908 ret = 0;
1901 spin_unlock(&nn->nn_lock); 1909 spin_unlock(&nn->nn_lock);
1902 if (ret) { 1910 if (ret) {
1903 mlog(ML_NOTICE, "attempt to connect from node '%s' at " 1911 printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' "
1904 "%pI4:%d but it already has an open connection\n", 1912 "at %pI4:%d but it already has an open connection\n",
1905 node->nd_name, &sin.sin_addr.s_addr, 1913 node->nd_name, &sin.sin_addr.s_addr,
1906 ntohs(sin.sin_port)); 1914 ntohs(sin.sin_port));
1907 goto out; 1915 goto out;
1908 } 1916 }
1909 1917
@@ -1983,7 +1991,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
1983 1991
1984 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 1992 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1985 if (ret < 0) { 1993 if (ret < 0) {
1986 mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); 1994 printk(KERN_ERR "o2net: Error %d while creating socket\n", ret);
1987 goto out; 1995 goto out;
1988 } 1996 }
1989 1997
@@ -2000,16 +2008,15 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
2000 sock->sk->sk_reuse = 1; 2008 sock->sk->sk_reuse = 1;
2001 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); 2009 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
2002 if (ret < 0) { 2010 if (ret < 0) {
2003 mlog(ML_ERROR, "unable to bind socket at %pI4:%u, " 2011 printk(KERN_ERR "o2net: Error %d while binding socket at "
2004 "ret=%d\n", &addr, ntohs(port), ret); 2012 "%pI4:%u\n", ret, &addr, ntohs(port));
2005 goto out; 2013 goto out;
2006 } 2014 }
2007 2015
2008 ret = sock->ops->listen(sock, 64); 2016 ret = sock->ops->listen(sock, 64);
2009 if (ret < 0) { 2017 if (ret < 0)
2010 mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n", 2018 printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n",
2011 &addr, ntohs(port), ret); 2019 ret, &addr, ntohs(port));
2012 }
2013 2020
2014out: 2021out:
2015 if (ret) { 2022 if (ret) {
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index fd6179eb26d..5bada2a69b5 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
106 struct list_head *unreg_list); 106 struct list_head *unreg_list);
107void o2net_unregister_handler_list(struct list_head *list); 107void o2net_unregister_handler_list(struct list_head *list);
108 108
109void o2net_fill_node_map(unsigned long *map, unsigned bytes);
110
109struct o2nm_node; 111struct o2nm_node;
110int o2net_register_hb_callbacks(void); 112int o2net_register_hb_callbacks(void);
111void o2net_unregister_hb_callbacks(void); 113void o2net_unregister_hb_callbacks(void);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index e2878b5895f..8fe4e2892ab 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1184,8 +1184,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1184 if (pde) 1184 if (pde)
1185 le16_add_cpu(&pde->rec_len, 1185 le16_add_cpu(&pde->rec_len,
1186 le16_to_cpu(de->rec_len)); 1186 le16_to_cpu(de->rec_len));
1187 else 1187 de->inode = 0;
1188 de->inode = 0;
1189 dir->i_version++; 1188 dir->i_version++;
1190 ocfs2_journal_dirty(handle, bh); 1189 ocfs2_journal_dirty(handle, bh);
1191 goto bail; 1190 goto bail;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d602abb51b6..a5952ceecba 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -859,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
859void dlm_wait_for_recovery(struct dlm_ctxt *dlm); 859void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
860void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); 860void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
861int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); 861int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
862int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); 862void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
863int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); 863void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
864 864
865void dlm_put(struct dlm_ctxt *dlm); 865void dlm_put(struct dlm_ctxt *dlm);
866struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); 866struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -877,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res)
877 kref_get(&res->refs); 877 kref_get(&res->refs);
878} 878}
879void dlm_lockres_put(struct dlm_lock_resource *res); 879void dlm_lockres_put(struct dlm_lock_resource *res);
880void __dlm_unhash_lockres(struct dlm_lock_resource *res); 880void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
881void __dlm_insert_lockres(struct dlm_ctxt *dlm, 881void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
882 struct dlm_lock_resource *res);
883struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, 882struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
884 const char *name, 883 const char *name,
885 unsigned int len, 884 unsigned int len,
@@ -902,46 +901,15 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
902 const char *name, 901 const char *name,
903 unsigned int namelen); 902 unsigned int namelen);
904 903
905#define dlm_lockres_set_refmap_bit(bit,res) \ 904void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
906 __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) 905 struct dlm_lock_resource *res, int bit);
907#define dlm_lockres_clear_refmap_bit(bit,res) \ 906void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
908 __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) 907 struct dlm_lock_resource *res, int bit);
909 908
910static inline void __dlm_lockres_set_refmap_bit(int bit, 909void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
911 struct dlm_lock_resource *res, 910 struct dlm_lock_resource *res);
912 const char *file, 911void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
913 int line) 912 struct dlm_lock_resource *res);
914{
915 //printk("%s:%d:%.*s: setting bit %d\n", file, line,
916 // res->lockname.len, res->lockname.name, bit);
917 set_bit(bit, res->refmap);
918}
919
920static inline void __dlm_lockres_clear_refmap_bit(int bit,
921 struct dlm_lock_resource *res,
922 const char *file,
923 int line)
924{
925 //printk("%s:%d:%.*s: clearing bit %d\n", file, line,
926 // res->lockname.len, res->lockname.name, bit);
927 clear_bit(bit, res->refmap);
928}
929
930void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
931 struct dlm_lock_resource *res,
932 const char *file,
933 int line);
934void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
935 struct dlm_lock_resource *res,
936 int new_lockres,
937 const char *file,
938 int line);
939#define dlm_lockres_drop_inflight_ref(d,r) \
940 __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__)
941#define dlm_lockres_grab_inflight_ref(d,r) \
942 __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__)
943#define dlm_lockres_grab_inflight_ref_new(d,r) \
944 __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__)
945 913
946void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 914void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
947void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 915void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 56f82cb912e..0e28e242226 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -30,6 +30,7 @@
30#include <linux/sysctl.h> 30#include <linux/sysctl.h>
31#include <linux/spinlock.h> 31#include <linux/spinlock.h>
32#include <linux/debugfs.h> 32#include <linux/debugfs.h>
33#include <linux/export.h>
33 34
34#include "cluster/heartbeat.h" 35#include "cluster/heartbeat.h"
35#include "cluster/nodemanager.h" 36#include "cluster/nodemanager.h"
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6ed6b95dcf9..92f2ead0fab 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -157,16 +157,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing,
157 157
158static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); 158static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
159 159
160void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) 160void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
161{ 161{
162 if (!hlist_unhashed(&lockres->hash_node)) { 162 if (hlist_unhashed(&res->hash_node))
163 hlist_del_init(&lockres->hash_node); 163 return;
164 dlm_lockres_put(lockres); 164
165 } 165 mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len,
166 res->lockname.name);
167 hlist_del_init(&res->hash_node);
168 dlm_lockres_put(res);
166} 169}
167 170
168void __dlm_insert_lockres(struct dlm_ctxt *dlm, 171void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
169 struct dlm_lock_resource *res)
170{ 172{
171 struct hlist_head *bucket; 173 struct hlist_head *bucket;
172 struct qstr *q; 174 struct qstr *q;
@@ -180,6 +182,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
180 dlm_lockres_get(res); 182 dlm_lockres_get(res);
181 183
182 hlist_add_head(&res->hash_node, bucket); 184 hlist_add_head(&res->hash_node, bucket);
185
186 mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len,
187 res->lockname.name);
183} 188}
184 189
185struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, 190struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
@@ -539,17 +544,17 @@ again:
539 544
540static void __dlm_print_nodes(struct dlm_ctxt *dlm) 545static void __dlm_print_nodes(struct dlm_ctxt *dlm)
541{ 546{
542 int node = -1; 547 int node = -1, num = 0;
543 548
544 assert_spin_locked(&dlm->spinlock); 549 assert_spin_locked(&dlm->spinlock);
545 550
546 printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name); 551 printk("( ");
547
548 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 552 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
549 node + 1)) < O2NM_MAX_NODES) { 553 node + 1)) < O2NM_MAX_NODES) {
550 printk("%d ", node); 554 printk("%d ", node);
555 ++num;
551 } 556 }
552 printk("\n"); 557 printk(") %u nodes\n", num);
553} 558}
554 559
555static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, 560static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -566,11 +571,10 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
566 571
567 node = exit_msg->node_idx; 572 node = exit_msg->node_idx;
568 573
569 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
570
571 spin_lock(&dlm->spinlock); 574 spin_lock(&dlm->spinlock);
572 clear_bit(node, dlm->domain_map); 575 clear_bit(node, dlm->domain_map);
573 clear_bit(node, dlm->exit_domain_map); 576 clear_bit(node, dlm->exit_domain_map);
577 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name);
574 __dlm_print_nodes(dlm); 578 __dlm_print_nodes(dlm);
575 579
576 /* notify anything attached to the heartbeat events */ 580 /* notify anything attached to the heartbeat events */
@@ -755,6 +759,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
755 759
756 dlm_mark_domain_leaving(dlm); 760 dlm_mark_domain_leaving(dlm);
757 dlm_leave_domain(dlm); 761 dlm_leave_domain(dlm);
762 printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name);
758 dlm_force_free_mles(dlm); 763 dlm_force_free_mles(dlm);
759 dlm_complete_dlm_shutdown(dlm); 764 dlm_complete_dlm_shutdown(dlm);
760 } 765 }
@@ -970,7 +975,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
970 clear_bit(assert->node_idx, dlm->exit_domain_map); 975 clear_bit(assert->node_idx, dlm->exit_domain_map);
971 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 976 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
972 977
973 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", 978 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ",
974 assert->node_idx, dlm->name); 979 assert->node_idx, dlm->name);
975 __dlm_print_nodes(dlm); 980 __dlm_print_nodes(dlm);
976 981
@@ -1701,8 +1706,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1701bail: 1706bail:
1702 spin_lock(&dlm->spinlock); 1707 spin_lock(&dlm->spinlock);
1703 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 1708 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
1704 if (!status) 1709 if (!status) {
1710 printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name);
1705 __dlm_print_nodes(dlm); 1711 __dlm_print_nodes(dlm);
1712 }
1706 spin_unlock(&dlm->spinlock); 1713 spin_unlock(&dlm->spinlock);
1707 1714
1708 if (ctxt) { 1715 if (ctxt) {
@@ -2131,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
2131 goto leave; 2138 goto leave;
2132 } 2139 }
2133 2140
2134 if (!o2hb_check_local_node_heartbeating()) {
2135 mlog(ML_ERROR, "the local node has not been configured, or is "
2136 "not heartbeating\n");
2137 ret = -EPROTO;
2138 goto leave;
2139 }
2140
2141 mlog(0, "register called for domain \"%s\"\n", domain); 2141 mlog(0, "register called for domain \"%s\"\n", domain);
2142 2142
2143retry: 2143retry:
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 8d39e0fd66f..975810b9849 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -183,10 +183,6 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
183 kick_thread = 1; 183 kick_thread = 1;
184 } 184 }
185 } 185 }
186 /* reduce the inflight count, this may result in the lockres
187 * being purged below during calc_usage */
188 if (lock->ml.node == dlm->node_num)
189 dlm_lockres_drop_inflight_ref(dlm, res);
190 186
191 spin_unlock(&res->spinlock); 187 spin_unlock(&res->spinlock);
192 wake_up(&res->wq); 188 wake_up(&res->wq);
@@ -231,10 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
231 lock->ml.type, res->lockname.len, 227 lock->ml.type, res->lockname.len,
232 res->lockname.name, flags); 228 res->lockname.name, flags);
233 229
230 /*
231 * Wait if resource is getting recovered, remastered, etc.
232 * If the resource was remastered and new owner is self, then exit.
233 */
234 spin_lock(&res->spinlock); 234 spin_lock(&res->spinlock);
235
236 /* will exit this call with spinlock held */
237 __dlm_wait_on_lockres(res); 235 __dlm_wait_on_lockres(res);
236 if (res->owner == dlm->node_num) {
237 spin_unlock(&res->spinlock);
238 return DLM_RECOVERING;
239 }
238 res->state |= DLM_LOCK_RES_IN_PROGRESS; 240 res->state |= DLM_LOCK_RES_IN_PROGRESS;
239 241
240 /* add lock to local (secondary) queue */ 242 /* add lock to local (secondary) queue */
@@ -319,27 +321,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
319 tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, 321 tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
320 sizeof(create), res->owner, &status); 322 sizeof(create), res->owner, &status);
321 if (tmpret >= 0) { 323 if (tmpret >= 0) {
322 // successfully sent and received 324 ret = status;
323 ret = status; // this is already a dlm_status
324 if (ret == DLM_REJECTED) { 325 if (ret == DLM_REJECTED) {
325 mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " 326 mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer "
326 "no longer owned by %u. that node is coming back " 327 "owned by node %u. That node is coming back up "
327 "up currently.\n", dlm->name, create.namelen, 328 "currently.\n", dlm->name, create.namelen,
328 create.name, res->owner); 329 create.name, res->owner);
329 dlm_print_one_lock_resource(res); 330 dlm_print_one_lock_resource(res);
330 BUG(); 331 BUG();
331 } 332 }
332 } else { 333 } else {
333 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 334 mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to "
334 "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key, 335 "node %u\n", dlm->name, create.namelen, create.name,
335 res->owner); 336 tmpret, res->owner);
336 if (dlm_is_host_down(tmpret)) { 337 if (dlm_is_host_down(tmpret))
337 ret = DLM_RECOVERING; 338 ret = DLM_RECOVERING;
338 mlog(0, "node %u died so returning DLM_RECOVERING " 339 else
339 "from lock message!\n", res->owner);
340 } else {
341 ret = dlm_err_to_dlm_status(tmpret); 340 ret = dlm_err_to_dlm_status(tmpret);
342 }
343 } 341 }
344 342
345 return ret; 343 return ret;
@@ -440,7 +438,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
440 /* zero memory only if kernel-allocated */ 438 /* zero memory only if kernel-allocated */
441 lksb = kzalloc(sizeof(*lksb), GFP_NOFS); 439 lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
442 if (!lksb) { 440 if (!lksb) {
443 kfree(lock); 441 kmem_cache_free(dlm_lock_cache, lock);
444 return NULL; 442 return NULL;
445 } 443 }
446 kernel_allocated = 1; 444 kernel_allocated = 1;
@@ -718,18 +716,10 @@ retry_lock:
718 716
719 if (status == DLM_RECOVERING || status == DLM_MIGRATING || 717 if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
720 status == DLM_FORWARD) { 718 status == DLM_FORWARD) {
721 mlog(0, "retrying lock with migration/"
722 "recovery/in progress\n");
723 msleep(100); 719 msleep(100);
724 /* no waiting for dlm_reco_thread */
725 if (recovery) { 720 if (recovery) {
726 if (status != DLM_RECOVERING) 721 if (status != DLM_RECOVERING)
727 goto retry_lock; 722 goto retry_lock;
728
729 mlog(0, "%s: got RECOVERING "
730 "for $RECOVERY lock, master "
731 "was %u\n", dlm->name,
732 res->owner);
733 /* wait to see the node go down, then 723 /* wait to see the node go down, then
734 * drop down and allow the lockres to 724 * drop down and allow the lockres to
735 * get cleaned up. need to remaster. */ 725 * get cleaned up. need to remaster. */
@@ -741,6 +731,14 @@ retry_lock:
741 } 731 }
742 } 732 }
743 733
734 /* Inflight taken in dlm_get_lock_resource() is dropped here */
735 spin_lock(&res->spinlock);
736 dlm_lockres_drop_inflight_ref(dlm, res);
737 spin_unlock(&res->spinlock);
738
739 dlm_lockres_calc_usage(dlm, res);
740 dlm_kick_thread(dlm, res);
741
744 if (status != DLM_NORMAL) { 742 if (status != DLM_NORMAL) {
745 lock->lksb->flags &= ~DLM_LKSB_GET_LVB; 743 lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
746 if (status != DLM_NOTQUEUED) 744 if (status != DLM_NOTQUEUED)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 11eefb8c12e..005261c333b 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -631,39 +631,54 @@ error:
631 return NULL; 631 return NULL;
632} 632}
633 633
634void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 634void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
635 struct dlm_lock_resource *res, 635 struct dlm_lock_resource *res, int bit)
636 int new_lockres,
637 const char *file,
638 int line)
639{ 636{
640 if (!new_lockres) 637 assert_spin_locked(&res->spinlock);
641 assert_spin_locked(&res->spinlock); 638
639 mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
640 res->lockname.name, bit, __builtin_return_address(0));
641
642 set_bit(bit, res->refmap);
643}
644
645void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
646 struct dlm_lock_resource *res, int bit)
647{
648 assert_spin_locked(&res->spinlock);
649
650 mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
651 res->lockname.name, bit, __builtin_return_address(0));
652
653 clear_bit(bit, res->refmap);
654}
655
656
657void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
658 struct dlm_lock_resource *res)
659{
660 assert_spin_locked(&res->spinlock);
642 661
643 if (!test_bit(dlm->node_num, res->refmap)) {
644 BUG_ON(res->inflight_locks != 0);
645 dlm_lockres_set_refmap_bit(dlm->node_num, res);
646 }
647 res->inflight_locks++; 662 res->inflight_locks++;
648 mlog(0, "%s:%.*s: inflight++: now %u\n", 663
649 dlm->name, res->lockname.len, res->lockname.name, 664 mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
650 res->inflight_locks); 665 res->lockname.len, res->lockname.name, res->inflight_locks,
666 __builtin_return_address(0));
651} 667}
652 668
653void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 669void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
654 struct dlm_lock_resource *res, 670 struct dlm_lock_resource *res)
655 const char *file,
656 int line)
657{ 671{
658 assert_spin_locked(&res->spinlock); 672 assert_spin_locked(&res->spinlock);
659 673
660 BUG_ON(res->inflight_locks == 0); 674 BUG_ON(res->inflight_locks == 0);
675
661 res->inflight_locks--; 676 res->inflight_locks--;
662 mlog(0, "%s:%.*s: inflight--: now %u\n", 677
663 dlm->name, res->lockname.len, res->lockname.name, 678 mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
664 res->inflight_locks); 679 res->lockname.len, res->lockname.name, res->inflight_locks,
665 if (res->inflight_locks == 0) 680 __builtin_return_address(0));
666 dlm_lockres_clear_refmap_bit(dlm->node_num, res); 681
667 wake_up(&res->wq); 682 wake_up(&res->wq);
668} 683}
669 684
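The former set/clear refmap macros and __dlm_lockres_*_inflight_ref() wrappers are now ordinary functions; callers in this patch (dlm_get_lock_resource(), dlmlock.c) hold res->spinlock, and the call site is logged via %ps / __builtin_return_address(0) instead of the old __FILE__/__LINE__ arguments. A usage sketch mirroring those call sites:

	spin_lock(&res->spinlock);
	dlm_lockres_grab_inflight_ref(dlm, res);	/* pin the lockres while messaging */
	spin_unlock(&res->spinlock);

	/* ... lock request sent and answered ... */

	spin_lock(&res->spinlock);
	dlm_lockres_drop_inflight_ref(dlm, res);	/* may allow the lockres to be purged */
	spin_unlock(&res->spinlock);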
@@ -697,7 +712,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
697 unsigned int hash; 712 unsigned int hash;
698 int tries = 0; 713 int tries = 0;
699 int bit, wait_on_recovery = 0; 714 int bit, wait_on_recovery = 0;
700 int drop_inflight_if_nonlocal = 0;
701 715
702 BUG_ON(!lockid); 716 BUG_ON(!lockid);
703 717
@@ -709,36 +723,33 @@ lookup:
709 spin_lock(&dlm->spinlock); 723 spin_lock(&dlm->spinlock);
710 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); 724 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
711 if (tmpres) { 725 if (tmpres) {
712 int dropping_ref = 0;
713
714 spin_unlock(&dlm->spinlock); 726 spin_unlock(&dlm->spinlock);
715
716 spin_lock(&tmpres->spinlock); 727 spin_lock(&tmpres->spinlock);
717 /* We wait for the other thread that is mastering the resource */ 728 /* Wait on the thread that is mastering the resource */
718 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { 729 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
719 __dlm_wait_on_lockres(tmpres); 730 __dlm_wait_on_lockres(tmpres);
720 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); 731 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
732 spin_unlock(&tmpres->spinlock);
733 dlm_lockres_put(tmpres);
734 tmpres = NULL;
735 goto lookup;
721 } 736 }
722 737
723 if (tmpres->owner == dlm->node_num) { 738 /* Wait on the resource purge to complete before continuing */
724 BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); 739 if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
725 dlm_lockres_grab_inflight_ref(dlm, tmpres); 740 BUG_ON(tmpres->owner == dlm->node_num);
726 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) 741 __dlm_wait_on_lockres_flags(tmpres,
727 dropping_ref = 1; 742 DLM_LOCK_RES_DROPPING_REF);
728 spin_unlock(&tmpres->spinlock);
729
730 /* wait until done messaging the master, drop our ref to allow
731 * the lockres to be purged, start over. */
732 if (dropping_ref) {
733 spin_lock(&tmpres->spinlock);
734 __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
735 spin_unlock(&tmpres->spinlock); 743 spin_unlock(&tmpres->spinlock);
736 dlm_lockres_put(tmpres); 744 dlm_lockres_put(tmpres);
737 tmpres = NULL; 745 tmpres = NULL;
738 goto lookup; 746 goto lookup;
739 } 747 }
740 748
741 mlog(0, "found in hash!\n"); 749 /* Grab inflight ref to pin the resource */
750 dlm_lockres_grab_inflight_ref(dlm, tmpres);
751
752 spin_unlock(&tmpres->spinlock);
742 if (res) 753 if (res)
743 dlm_lockres_put(res); 754 dlm_lockres_put(res);
744 res = tmpres; 755 res = tmpres;
@@ -829,8 +840,8 @@ lookup:
829 * but they might own this lockres. wait on them. */ 840 * but they might own this lockres. wait on them. */
830 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 841 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
831 if (bit < O2NM_MAX_NODES) { 842 if (bit < O2NM_MAX_NODES) {
832 mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " 843 mlog(0, "%s: res %.*s, At least one node (%d) "
833 "recover before lock mastery can begin\n", 844 "to recover before lock mastery can begin\n",
834 dlm->name, namelen, (char *)lockid, bit); 845 dlm->name, namelen, (char *)lockid, bit);
835 wait_on_recovery = 1; 846 wait_on_recovery = 1;
836 } 847 }
@@ -843,12 +854,11 @@ lookup:
843 854
844 /* finally add the lockres to its hash bucket */ 855 /* finally add the lockres to its hash bucket */
845 __dlm_insert_lockres(dlm, res); 856 __dlm_insert_lockres(dlm, res);
846 /* since this lockres is new it doesn't not require the spinlock */
847 dlm_lockres_grab_inflight_ref_new(dlm, res);
848 857
849 /* if this node does not become the master make sure to drop 858 /* Grab inflight ref to pin the resource */
850 * this inflight reference below */ 859 spin_lock(&res->spinlock);
851 drop_inflight_if_nonlocal = 1; 860 dlm_lockres_grab_inflight_ref(dlm, res);
861 spin_unlock(&res->spinlock);
852 862
853 /* get an extra ref on the mle in case this is a BLOCK 863 /* get an extra ref on the mle in case this is a BLOCK
854 * if so, the creator of the BLOCK may try to put the last 864 * if so, the creator of the BLOCK may try to put the last
@@ -864,8 +874,8 @@ redo_request:
864 * dlm spinlock would be detectable be a change on the mle, 874 * dlm spinlock would be detectable be a change on the mle,
865 * so we only need to clear out the recovery map once. */ 875 * so we only need to clear out the recovery map once. */
866 if (dlm_is_recovery_lock(lockid, namelen)) { 876 if (dlm_is_recovery_lock(lockid, namelen)) {
867 mlog(ML_NOTICE, "%s: recovery map is not empty, but " 877 mlog(0, "%s: Recovery map is not empty, but must "
868 "must master $RECOVERY lock now\n", dlm->name); 878 "master $RECOVERY lock now\n", dlm->name);
869 if (!dlm_pre_master_reco_lockres(dlm, res)) 879 if (!dlm_pre_master_reco_lockres(dlm, res))
870 wait_on_recovery = 0; 880 wait_on_recovery = 0;
871 else { 881 else {
@@ -883,8 +893,8 @@ redo_request:
883 spin_lock(&dlm->spinlock); 893 spin_lock(&dlm->spinlock);
884 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 894 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
885 if (bit < O2NM_MAX_NODES) { 895 if (bit < O2NM_MAX_NODES) {
886 mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " 896 mlog(0, "%s: res %.*s, At least one node (%d) "
887 "recover before lock mastery can begin\n", 897 "to recover before lock mastery can begin\n",
888 dlm->name, namelen, (char *)lockid, bit); 898 dlm->name, namelen, (char *)lockid, bit);
889 wait_on_recovery = 1; 899 wait_on_recovery = 1;
890 } else 900 } else
@@ -913,8 +923,8 @@ redo_request:
913 * yet, keep going until it does. this is how the 923 * yet, keep going until it does. this is how the
914 * master will know that asserts are needed back to 924 * master will know that asserts are needed back to
915 * the lower nodes. */ 925 * the lower nodes. */
916 mlog(0, "%s:%.*s: requests only up to %u but master " 926 mlog(0, "%s: res %.*s, Requests only up to %u but "
917 "is %u, keep going\n", dlm->name, namelen, 927 "master is %u, keep going\n", dlm->name, namelen,
918 lockid, nodenum, mle->master); 928 lockid, nodenum, mle->master);
919 } 929 }
920 } 930 }
@@ -924,13 +934,12 @@ wait:
924 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); 934 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
925 if (ret < 0) { 935 if (ret < 0) {
926 wait_on_recovery = 1; 936 wait_on_recovery = 1;
927 mlog(0, "%s:%.*s: node map changed, redo the " 937 mlog(0, "%s: res %.*s, Node map changed, redo the master "
928 "master request now, blocked=%d\n", 938 "request now, blocked=%d\n", dlm->name, res->lockname.len,
929 dlm->name, res->lockname.len,
930 res->lockname.name, blocked); 939 res->lockname.name, blocked);
931 if (++tries > 20) { 940 if (++tries > 20) {
932 mlog(ML_ERROR, "%s:%.*s: spinning on " 941 mlog(ML_ERROR, "%s: res %.*s, Spinning on "
933 "dlm_wait_for_lock_mastery, blocked=%d\n", 942 "dlm_wait_for_lock_mastery, blocked = %d\n",
934 dlm->name, res->lockname.len, 943 dlm->name, res->lockname.len,
935 res->lockname.name, blocked); 944 res->lockname.name, blocked);
936 dlm_print_one_lock_resource(res); 945 dlm_print_one_lock_resource(res);
@@ -940,7 +949,8 @@ wait:
940 goto redo_request; 949 goto redo_request;
941 } 950 }
942 951
943 mlog(0, "lockres mastered by %u\n", res->owner); 952 mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
953 res->lockname.name, res->owner);
944 /* make sure we never continue without this */ 954 /* make sure we never continue without this */
945 BUG_ON(res->owner == O2NM_MAX_NODES); 955 BUG_ON(res->owner == O2NM_MAX_NODES);
946 956
@@ -952,8 +962,6 @@ wait:
952 962
953wake_waiters: 963wake_waiters:
954 spin_lock(&res->spinlock); 964 spin_lock(&res->spinlock);
955 if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
956 dlm_lockres_drop_inflight_ref(dlm, res);
957 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 965 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
958 spin_unlock(&res->spinlock); 966 spin_unlock(&res->spinlock);
959 wake_up(&res->wq); 967 wake_up(&res->wq);
@@ -1426,9 +1434,7 @@ way_up_top:
1426 } 1434 }
1427 1435
1428 if (res->owner == dlm->node_num) { 1436 if (res->owner == dlm->node_num) {
1429 mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1437 dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
1430 dlm->name, namelen, name, request->node_idx);
1431 dlm_lockres_set_refmap_bit(request->node_idx, res);
1432 spin_unlock(&res->spinlock); 1438 spin_unlock(&res->spinlock);
1433 response = DLM_MASTER_RESP_YES; 1439 response = DLM_MASTER_RESP_YES;
1434 if (mle) 1440 if (mle)
@@ -1493,10 +1499,8 @@ way_up_top:
1493 * go back and clean the mles on any 1499 * go back and clean the mles on any
1494 * other nodes */ 1500 * other nodes */
1495 dispatch_assert = 1; 1501 dispatch_assert = 1;
1496 dlm_lockres_set_refmap_bit(request->node_idx, res); 1502 dlm_lockres_set_refmap_bit(dlm, res,
1497 mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1503 request->node_idx);
1498 dlm->name, namelen, name,
1499 request->node_idx);
1500 } else 1504 } else
1501 response = DLM_MASTER_RESP_NO; 1505 response = DLM_MASTER_RESP_NO;
1502 } else { 1506 } else {
@@ -1702,7 +1706,7 @@ again:
1702 "lockres, set the bit in the refmap\n", 1706 "lockres, set the bit in the refmap\n",
1703 namelen, lockname, to); 1707 namelen, lockname, to);
1704 spin_lock(&res->spinlock); 1708 spin_lock(&res->spinlock);
1705 dlm_lockres_set_refmap_bit(to, res); 1709 dlm_lockres_set_refmap_bit(dlm, res, to);
1706 spin_unlock(&res->spinlock); 1710 spin_unlock(&res->spinlock);
1707 } 1711 }
1708 } 1712 }
@@ -2187,8 +2191,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2187 namelen = res->lockname.len; 2191 namelen = res->lockname.len;
2188 BUG_ON(namelen > O2NM_MAX_NAME_LEN); 2192 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2189 2193
2190 mlog(0, "%s:%.*s: sending deref to %d\n",
2191 dlm->name, namelen, lockname, res->owner);
2192 memset(&deref, 0, sizeof(deref)); 2194 memset(&deref, 0, sizeof(deref));
2193 deref.node_idx = dlm->node_num; 2195 deref.node_idx = dlm->node_num;
2194 deref.namelen = namelen; 2196 deref.namelen = namelen;
@@ -2197,14 +2199,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2197 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2199 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2198 &deref, sizeof(deref), res->owner, &r); 2200 &deref, sizeof(deref), res->owner, &r);
2199 if (ret < 0) 2201 if (ret < 0)
2200 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 2202 mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
2201 "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key, 2203 dlm->name, namelen, lockname, ret, res->owner);
2202 res->owner);
2203 else if (r < 0) { 2204 else if (r < 0) {
2204 /* BAD. other node says I did not have a ref. */ 2205 /* BAD. other node says I did not have a ref. */
2205 mlog(ML_ERROR,"while dropping ref on %s:%.*s " 2206 mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
2206 "(master=%u) got %d.\n", dlm->name, namelen, 2207 dlm->name, namelen, lockname, res->owner, r);
2207 lockname, res->owner, r);
2208 dlm_print_one_lock_resource(res); 2208 dlm_print_one_lock_resource(res);
2209 BUG(); 2209 BUG();
2210 } 2210 }
@@ -2260,7 +2260,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2260 else { 2260 else {
2261 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2261 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2262 if (test_bit(node, res->refmap)) { 2262 if (test_bit(node, res->refmap)) {
2263 dlm_lockres_clear_refmap_bit(node, res); 2263 dlm_lockres_clear_refmap_bit(dlm, res, node);
2264 cleared = 1; 2264 cleared = 1;
2265 } 2265 }
2266 } 2266 }
@@ -2320,7 +2320,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2320 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2320 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2321 if (test_bit(node, res->refmap)) { 2321 if (test_bit(node, res->refmap)) {
2322 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 2322 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
2323 dlm_lockres_clear_refmap_bit(node, res); 2323 dlm_lockres_clear_refmap_bit(dlm, res, node);
2324 cleared = 1; 2324 cleared = 1;
2325 } 2325 }
2326 spin_unlock(&res->spinlock); 2326 spin_unlock(&res->spinlock);
@@ -2802,7 +2802,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2802 BUG_ON(!list_empty(&lock->bast_list)); 2802 BUG_ON(!list_empty(&lock->bast_list));
2803 BUG_ON(lock->ast_pending); 2803 BUG_ON(lock->ast_pending);
2804 BUG_ON(lock->bast_pending); 2804 BUG_ON(lock->bast_pending);
2805 dlm_lockres_clear_refmap_bit(lock->ml.node, res); 2805 dlm_lockres_clear_refmap_bit(dlm, res,
2806 lock->ml.node);
2806 list_del_init(&lock->list); 2807 list_del_init(&lock->list);
2807 dlm_lock_put(lock); 2808 dlm_lock_put(lock);
2808 /* In a normal unlock, we would have added a 2809 /* In a normal unlock, we would have added a
@@ -2823,7 +2824,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2823 mlog(0, "%s:%.*s: node %u had a ref to this " 2824 mlog(0, "%s:%.*s: node %u had a ref to this "
2824 "migrating lockres, clearing\n", dlm->name, 2825 "migrating lockres, clearing\n", dlm->name,
2825 res->lockname.len, res->lockname.name, bit); 2826 res->lockname.len, res->lockname.name, bit);
2826 dlm_lockres_clear_refmap_bit(bit, res); 2827 dlm_lockres_clear_refmap_bit(dlm, res, bit);
2827 } 2828 }
2828 bit++; 2829 bit++;
2829 } 2830 }
@@ -2916,9 +2917,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2916 &migrate, sizeof(migrate), nodenum, 2917 &migrate, sizeof(migrate), nodenum,
2917 &status); 2918 &status);
2918 if (ret < 0) { 2919 if (ret < 0) {
2919 mlog(ML_ERROR, "Error %d when sending message %u (key " 2920 mlog(ML_ERROR, "%s: res %.*s, Error %d send "
2920 "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG, 2921 "MIGRATE_REQUEST to node %u\n", dlm->name,
2921 dlm->key, nodenum); 2922 migrate.namelen, migrate.name, ret, nodenum);
2922 if (!dlm_is_host_down(ret)) { 2923 if (!dlm_is_host_down(ret)) {
2923 mlog(ML_ERROR, "unhandled error=%d!\n", ret); 2924 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
2924 BUG(); 2925 BUG();
@@ -2937,7 +2938,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2937 dlm->name, res->lockname.len, res->lockname.name, 2938 dlm->name, res->lockname.len, res->lockname.name,
2938 nodenum); 2939 nodenum);
2939 spin_lock(&res->spinlock); 2940 spin_lock(&res->spinlock);
2940 dlm_lockres_set_refmap_bit(nodenum, res); 2941 dlm_lockres_set_refmap_bit(dlm, res, nodenum);
2941 spin_unlock(&res->spinlock); 2942 spin_unlock(&res->spinlock);
2942 } 2943 }
2943 } 2944 }
@@ -3271,7 +3272,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
3271 * mastery reference here since old_master will briefly have 3272 * mastery reference here since old_master will briefly have
3272 * a reference after the migration completes */ 3273 * a reference after the migration completes */
3273 spin_lock(&res->spinlock); 3274 spin_lock(&res->spinlock);
3274 dlm_lockres_set_refmap_bit(old_master, res); 3275 dlm_lockres_set_refmap_bit(dlm, res, old_master);
3275 spin_unlock(&res->spinlock); 3276 spin_unlock(&res->spinlock);
3276 3277
3277 mlog(0, "now time to do a migrate request to other nodes\n"); 3278 mlog(0, "now time to do a migrate request to other nodes\n");
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 7efab6d28a2..01ebfd0bdad 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -362,40 +362,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
362} 362}
363 363
364 364
365int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) 365void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
366{ 366{
367 if (timeout) { 367 if (dlm_is_node_dead(dlm, node))
368 mlog(ML_NOTICE, "%s: waiting %dms for notification of " 368 return;
369 "death of node %u\n", dlm->name, timeout, node); 369
370 printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
371 "domain %s\n", node, dlm->name);
372
373 if (timeout)
370 wait_event_timeout(dlm->dlm_reco_thread_wq, 374 wait_event_timeout(dlm->dlm_reco_thread_wq,
371 dlm_is_node_dead(dlm, node), 375 dlm_is_node_dead(dlm, node),
372 msecs_to_jiffies(timeout)); 376 msecs_to_jiffies(timeout));
373 } else { 377 else
374 mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
375 "of death of node %u\n", dlm->name, node);
376 wait_event(dlm->dlm_reco_thread_wq, 378 wait_event(dlm->dlm_reco_thread_wq,
377 dlm_is_node_dead(dlm, node)); 379 dlm_is_node_dead(dlm, node));
378 }
379 /* for now, return 0 */
380 return 0;
381} 380}
382 381
383int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) 382void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
384{ 383{
385 if (timeout) { 384 if (dlm_is_node_recovered(dlm, node))
386 mlog(0, "%s: waiting %dms for notification of " 385 return;
387 "recovery of node %u\n", dlm->name, timeout, node); 386
387 printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
388 "domain %s\n", node, dlm->name);
389
390 if (timeout)
388 wait_event_timeout(dlm->dlm_reco_thread_wq, 391 wait_event_timeout(dlm->dlm_reco_thread_wq,
389 dlm_is_node_recovered(dlm, node), 392 dlm_is_node_recovered(dlm, node),
390 msecs_to_jiffies(timeout)); 393 msecs_to_jiffies(timeout));
391 } else { 394 else
392 mlog(0, "%s: waiting indefinitely for notification "
393 "of recovery of node %u\n", dlm->name, node);
394 wait_event(dlm->dlm_reco_thread_wq, 395 wait_event(dlm->dlm_reco_thread_wq,
395 dlm_is_node_recovered(dlm, node)); 396 dlm_is_node_recovered(dlm, node));
396 }
397 /* for now, return 0 */
398 return 0;
399} 397}
400 398
401/* callers of the top-level api calls (dlmlock/dlmunlock) should 399/* callers of the top-level api calls (dlmlock/dlmunlock) should
@@ -430,6 +428,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm)
430{ 428{
431 spin_lock(&dlm->spinlock); 429 spin_lock(&dlm->spinlock);
432 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); 430 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
431 printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
432 dlm->name, dlm->reco.dead_node);
433 dlm->reco.state |= DLM_RECO_STATE_ACTIVE; 433 dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
434 spin_unlock(&dlm->spinlock); 434 spin_unlock(&dlm->spinlock);
435} 435}
@@ -440,9 +440,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
440 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); 440 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
441 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; 441 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
442 spin_unlock(&dlm->spinlock); 442 spin_unlock(&dlm->spinlock);
443 printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
443 wake_up(&dlm->reco.event); 444 wake_up(&dlm->reco.event);
444} 445}
445 446
447static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
448{
449 printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
450 "dead node %u in domain %s\n", dlm->reco.new_master,
451 (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
452 dlm->reco.dead_node, dlm->name);
453}
454
446static int dlm_do_recovery(struct dlm_ctxt *dlm) 455static int dlm_do_recovery(struct dlm_ctxt *dlm)
447{ 456{
448 int status = 0; 457 int status = 0;
@@ -505,9 +514,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
505 } 514 }
506 mlog(0, "another node will master this recovery session.\n"); 515 mlog(0, "another node will master this recovery session.\n");
507 } 516 }
508 mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", 517
509 dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, 518 dlm_print_recovery_master(dlm);
510 dlm->node_num, dlm->reco.dead_node);
511 519
512 /* it is safe to start everything back up here 520 /* it is safe to start everything back up here
513 * because all of the dead node's lock resources 521 * because all of the dead node's lock resources
@@ -518,15 +526,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
518 return 0; 526 return 0;
519 527
520master_here: 528master_here:
521 mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node " 529 dlm_print_recovery_master(dlm);
522 "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task),
523 dlm->node_num, dlm->reco.dead_node, dlm->name);
524 530
525 status = dlm_remaster_locks(dlm, dlm->reco.dead_node); 531 status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
526 if (status < 0) { 532 if (status < 0) {
527 /* we should never hit this anymore */ 533 /* we should never hit this anymore */
528 mlog(ML_ERROR, "error %d remastering locks for node %u, " 534 mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
529 "retrying.\n", status, dlm->reco.dead_node); 535 "retrying.\n", dlm->name, status, dlm->reco.dead_node);
530 /* yield a bit to allow any final network messages 536 /* yield a bit to allow any final network messages
531 * to get handled on remaining nodes */ 537 * to get handled on remaining nodes */
532 msleep(100); 538 msleep(100);
@@ -567,7 +573,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
567 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); 573 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
568 ndata->state = DLM_RECO_NODE_DATA_REQUESTING; 574 ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
569 575
570 mlog(0, "requesting lock info from node %u\n", 576 mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
571 ndata->node_num); 577 ndata->node_num);
572 578
573 if (ndata->node_num == dlm->node_num) { 579 if (ndata->node_num == dlm->node_num) {
@@ -640,7 +646,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
640 spin_unlock(&dlm_reco_state_lock); 646 spin_unlock(&dlm_reco_state_lock);
641 } 647 }
642 648
643 mlog(0, "done requesting all lock info\n"); 649 mlog(0, "%s: Done requesting all lock info\n", dlm->name);
644 650
645 /* nodes should be sending reco data now 651 /* nodes should be sending reco data now
646 * just need to wait */ 652 * just need to wait */
@@ -802,10 +808,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
802 808
803 /* negative status is handled by caller */ 809 /* negative status is handled by caller */
804 if (ret < 0) 810 if (ret < 0)
805 mlog(ML_ERROR, "Error %d when sending message %u (key " 811 mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
806 "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, 812 "to recover dead node %u\n", dlm->name, ret,
807 dlm->key, request_from); 813 request_from, dead_node);
808
809 // return from here, then 814 // return from here, then
810 // sleep until all received or error 815 // sleep until all received or error
811 return ret; 816 return ret;
@@ -956,9 +961,9 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
956 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, 961 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
957 sizeof(done_msg), send_to, &tmpret); 962 sizeof(done_msg), send_to, &tmpret);
958 if (ret < 0) { 963 if (ret < 0) {
959 mlog(ML_ERROR, "Error %d when sending message %u (key " 964 mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
960 "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, 965 "to recover dead node %u\n", dlm->name, ret, send_to,
961 dlm->key, send_to); 966 dead_node);
962 if (!dlm_is_host_down(ret)) { 967 if (!dlm_is_host_down(ret)) {
963 BUG(); 968 BUG();
964 } 969 }
@@ -1127,9 +1132,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
1127 if (ret < 0) { 1132 if (ret < 0) {
1128 /* XXX: negative status is not handled. 1133 /* XXX: negative status is not handled.
1129 * this will end up killing this node. */ 1134 * this will end up killing this node. */
1130 mlog(ML_ERROR, "Error %d when sending message %u (key " 1135 mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
1131 "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, 1136 "node %u (%s)\n", dlm->name, mres->lockname_len,
1132 dlm->key, send_to); 1137 mres->lockname, ret, send_to,
1138 (orig_flags & DLM_MRES_MIGRATION ?
1139 "migration" : "recovery"));
1133 } else { 1140 } else {
1134 /* might get an -ENOMEM back here */ 1141 /* might get an -ENOMEM back here */
1135 ret = status; 1142 ret = status;
@@ -1767,7 +1774,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1767 dlm->name, mres->lockname_len, mres->lockname, 1774 dlm->name, mres->lockname_len, mres->lockname,
1768 from); 1775 from);
1769 spin_lock(&res->spinlock); 1776 spin_lock(&res->spinlock);
1770 dlm_lockres_set_refmap_bit(from, res); 1777 dlm_lockres_set_refmap_bit(dlm, res, from);
1771 spin_unlock(&res->spinlock); 1778 spin_unlock(&res->spinlock);
1772 added++; 1779 added++;
1773 break; 1780 break;
@@ -1965,7 +1972,7 @@ skip_lvb:
1965 mlog(0, "%s:%.*s: added lock for node %u, " 1972 mlog(0, "%s:%.*s: added lock for node %u, "
1966 "setting refmap bit\n", dlm->name, 1973 "setting refmap bit\n", dlm->name,
1967 res->lockname.len, res->lockname.name, ml->node); 1974 res->lockname.len, res->lockname.name, ml->node);
1968 dlm_lockres_set_refmap_bit(ml->node, res); 1975 dlm_lockres_set_refmap_bit(dlm, res, ml->node);
1969 added++; 1976 added++;
1970 } 1977 }
1971 spin_unlock(&res->spinlock); 1978 spin_unlock(&res->spinlock);
@@ -2084,6 +2091,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2084 2091
2085 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { 2092 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
2086 if (res->owner == dead_node) { 2093 if (res->owner == dead_node) {
2094 mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
2095 dlm->name, res->lockname.len, res->lockname.name,
2096 res->owner, new_master);
2087 list_del_init(&res->recovering); 2097 list_del_init(&res->recovering);
2088 spin_lock(&res->spinlock); 2098 spin_lock(&res->spinlock);
2089 /* new_master has our reference from 2099 /* new_master has our reference from
@@ -2105,40 +2115,30 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2105 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 2115 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
2106 bucket = dlm_lockres_hash(dlm, i); 2116 bucket = dlm_lockres_hash(dlm, i);
2107 hlist_for_each_entry(res, hash_iter, bucket, hash_node) { 2117 hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
2108 if (res->state & DLM_LOCK_RES_RECOVERING) { 2118 if (!(res->state & DLM_LOCK_RES_RECOVERING))
2109 if (res->owner == dead_node) { 2119 continue;
2110 mlog(0, "(this=%u) res %.*s owner=%u "
2111 "was not on recovering list, but "
2112 "clearing state anyway\n",
2113 dlm->node_num, res->lockname.len,
2114 res->lockname.name, new_master);
2115 } else if (res->owner == dlm->node_num) {
2116 mlog(0, "(this=%u) res %.*s owner=%u "
2117 "was not on recovering list, "
2118 "owner is THIS node, clearing\n",
2119 dlm->node_num, res->lockname.len,
2120 res->lockname.name, new_master);
2121 } else
2122 continue;
2123 2120
2124 if (!list_empty(&res->recovering)) { 2121 if (res->owner != dead_node &&
2125 mlog(0, "%s:%.*s: lockres was " 2122 res->owner != dlm->node_num)
2126 "marked RECOVERING, owner=%u\n", 2123 continue;
2127 dlm->name, res->lockname.len, 2124
2128 res->lockname.name, res->owner); 2125 if (!list_empty(&res->recovering)) {
2129 list_del_init(&res->recovering); 2126 list_del_init(&res->recovering);
2130 dlm_lockres_put(res); 2127 dlm_lockres_put(res);
2131 }
2132 spin_lock(&res->spinlock);
2133 /* new_master has our reference from
2134 * the lock state sent during recovery */
2135 dlm_change_lockres_owner(dlm, res, new_master);
2136 res->state &= ~DLM_LOCK_RES_RECOVERING;
2137 if (__dlm_lockres_has_locks(res))
2138 __dlm_dirty_lockres(dlm, res);
2139 spin_unlock(&res->spinlock);
2140 wake_up(&res->wq);
2141 } 2128 }
2129
2130 /* new_master has our reference from
2131 * the lock state sent during recovery */
2132 mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
2133 dlm->name, res->lockname.len, res->lockname.name,
2134 res->owner, new_master);
2135 spin_lock(&res->spinlock);
2136 dlm_change_lockres_owner(dlm, res, new_master);
2137 res->state &= ~DLM_LOCK_RES_RECOVERING;
2138 if (__dlm_lockres_has_locks(res))
2139 __dlm_dirty_lockres(dlm, res);
2140 spin_unlock(&res->spinlock);
2141 wake_up(&res->wq);
2142 } 2142 }
2143 } 2143 }
2144} 2144}
@@ -2252,12 +2252,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2252 res->lockname.len, res->lockname.name, freed, dead_node); 2252 res->lockname.len, res->lockname.name, freed, dead_node);
2253 __dlm_print_one_lock_resource(res); 2253 __dlm_print_one_lock_resource(res);
2254 } 2254 }
2255 dlm_lockres_clear_refmap_bit(dead_node, res); 2255 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2256 } else if (test_bit(dead_node, res->refmap)) { 2256 } else if (test_bit(dead_node, res->refmap)) {
2257 mlog(0, "%s:%.*s: dead node %u had a ref, but had " 2257 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
2258 "no locks and had not purged before dying\n", dlm->name, 2258 "no locks and had not purged before dying\n", dlm->name,
2259 res->lockname.len, res->lockname.name, dead_node); 2259 res->lockname.len, res->lockname.name, dead_node);
2260 dlm_lockres_clear_refmap_bit(dead_node, res); 2260 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2261 } 2261 }
2262 2262
2263 /* do not kick thread yet */ 2263 /* do not kick thread yet */
@@ -2324,9 +2324,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2324 dlm_revalidate_lvb(dlm, res, dead_node); 2324 dlm_revalidate_lvb(dlm, res, dead_node);
2325 if (res->owner == dead_node) { 2325 if (res->owner == dead_node) {
2326 if (res->state & DLM_LOCK_RES_DROPPING_REF) { 2326 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2327 mlog(ML_NOTICE, "Ignore %.*s for " 2327 mlog(ML_NOTICE, "%s: res %.*s, Skip "
2328 "recovery as it is being freed\n", 2328 "recovery as it is being freed\n",
2329 res->lockname.len, 2329 dlm->name, res->lockname.len,
2330 res->lockname.name); 2330 res->lockname.name);
2331 } else 2331 } else
2332 dlm_move_lockres_to_recovery_list(dlm, 2332 dlm_move_lockres_to_recovery_list(dlm,
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 1d6d1d22c47..e73c833fc2a 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -94,24 +94,26 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
94{ 94{
95 int bit; 95 int bit;
96 96
97 assert_spin_locked(&res->spinlock);
98
97 if (__dlm_lockres_has_locks(res)) 99 if (__dlm_lockres_has_locks(res))
98 return 0; 100 return 0;
99 101
102 /* Locks are in the process of being created */
103 if (res->inflight_locks)
104 return 0;
105
100 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) 106 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
101 return 0; 107 return 0;
102 108
103 if (res->state & DLM_LOCK_RES_RECOVERING) 109 if (res->state & DLM_LOCK_RES_RECOVERING)
104 return 0; 110 return 0;
105 111
112 /* Another node has this resource with this node as the master */
106 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 113 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
107 if (bit < O2NM_MAX_NODES) 114 if (bit < O2NM_MAX_NODES)
108 return 0; 115 return 0;
109 116
110 /*
111 * since the bit for dlm->node_num is not set, inflight_locks better
112 * be zero
113 */
114 BUG_ON(res->inflight_locks != 0);
115 return 1; 117 return 1;
116} 118}
117 119
@@ -185,8 +187,6 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
185 /* clear our bit from the master's refmap, ignore errors */ 187 /* clear our bit from the master's refmap, ignore errors */
186 ret = dlm_drop_lockres_ref(dlm, res); 188 ret = dlm_drop_lockres_ref(dlm, res);
187 if (ret < 0) { 189 if (ret < 0) {
188 mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
189 res->lockname.len, res->lockname.name, ret);
190 if (!dlm_is_host_down(ret)) 190 if (!dlm_is_host_down(ret))
191 BUG(); 191 BUG();
192 } 192 }
@@ -209,7 +209,7 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
209 BUG(); 209 BUG();
210 } 210 }
211 211
212 __dlm_unhash_lockres(res); 212 __dlm_unhash_lockres(dlm, res);
213 213
214 /* lockres is not in the hash now. drop the flag and wake up 214 /* lockres is not in the hash now. drop the flag and wake up
215 * any processes waiting in dlm_get_lock_resource. */ 215 * any processes waiting in dlm_get_lock_resource. */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e1ed5e502ff..81a4cd22f80 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1692,7 +1692,7 @@ int ocfs2_open_lock(struct inode *inode)
1692 mlog(0, "inode %llu take PRMODE open lock\n", 1692 mlog(0, "inode %llu take PRMODE open lock\n",
1693 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1693 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1694 1694
1695 if (ocfs2_mount_local(osb)) 1695 if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
1696 goto out; 1696 goto out;
1697 1697
1698 lockres = &OCFS2_I(inode)->ip_open_lockres; 1698 lockres = &OCFS2_I(inode)->ip_open_lockres;
@@ -1718,6 +1718,12 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
1718 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1718 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1719 write ? "EXMODE" : "PRMODE"); 1719 write ? "EXMODE" : "PRMODE");
1720 1720
1721 if (ocfs2_is_hard_readonly(osb)) {
1722 if (write)
1723 status = -EROFS;
1724 goto out;
1725 }
1726
1721 if (ocfs2_mount_local(osb)) 1727 if (ocfs2_mount_local(osb))
1722 goto out; 1728 goto out;
1723 1729
@@ -2298,7 +2304,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
2298 if (ocfs2_is_hard_readonly(osb)) { 2304 if (ocfs2_is_hard_readonly(osb)) {
2299 if (ex) 2305 if (ex)
2300 status = -EROFS; 2306 status = -EROFS;
2301 goto bail; 2307 goto getbh;
2302 } 2308 }
2303 2309
2304 if (ocfs2_mount_local(osb)) 2310 if (ocfs2_mount_local(osb))
@@ -2356,7 +2362,7 @@ local:
2356 mlog_errno(status); 2362 mlog_errno(status);
2357 goto bail; 2363 goto bail;
2358 } 2364 }
2359 2365getbh:
2360 if (ret_bh) { 2366 if (ret_bh) {
2361 status = ocfs2_assign_bh(inode, ret_bh, local_bh); 2367 status = ocfs2_assign_bh(inode, ret_bh, local_bh);
2362 if (status < 0) { 2368 if (status < 0) {
@@ -2628,8 +2634,11 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2628 2634
2629 BUG_ON(!dl); 2635 BUG_ON(!dl);
2630 2636
2631 if (ocfs2_is_hard_readonly(osb)) 2637 if (ocfs2_is_hard_readonly(osb)) {
2632 return -EROFS; 2638 if (ex)
2639 return -EROFS;
2640 return 0;
2641 }
2633 2642
2634 if (ocfs2_mount_local(osb)) 2643 if (ocfs2_mount_local(osb))
2635 return 0; 2644 return 0;
@@ -2647,7 +2656,7 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
2647 struct ocfs2_dentry_lock *dl = dentry->d_fsdata; 2656 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
2648 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 2657 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
2649 2658
2650 if (!ocfs2_mount_local(osb)) 2659 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
2651 ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); 2660 ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
2652} 2661}
2653 2662
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 23457b491e8..2f5b92ef0e5 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -832,6 +832,102 @@ out:
832 return ret; 832 return ret;
833} 833}
834 834
835int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
836{
837 struct inode *inode = file->f_mapping->host;
838 int ret;
839 unsigned int is_last = 0, is_data = 0;
840 u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
841 u32 cpos, cend, clen, hole_size;
842 u64 extoff, extlen;
843 struct buffer_head *di_bh = NULL;
844 struct ocfs2_extent_rec rec;
845
846 BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE);
847
848 ret = ocfs2_inode_lock(inode, &di_bh, 0);
849 if (ret) {
850 mlog_errno(ret);
851 goto out;
852 }
853
854 down_read(&OCFS2_I(inode)->ip_alloc_sem);
855
856 if (*offset >= inode->i_size) {
857 ret = -ENXIO;
858 goto out_unlock;
859 }
860
861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
862 if (origin == SEEK_HOLE)
863 *offset = inode->i_size;
864 goto out_unlock;
865 }
866
867 clen = 0;
868 cpos = *offset >> cs_bits;
869 cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size);
870
871 while (cpos < cend && !is_last) {
872 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
873 &rec, &is_last);
874 if (ret) {
875 mlog_errno(ret);
876 goto out_unlock;
877 }
878
879 extoff = cpos;
880 extoff <<= cs_bits;
881
882 if (rec.e_blkno == 0ULL) {
883 clen = hole_size;
884 is_data = 0;
885 } else {
886 clen = le16_to_cpu(rec.e_leaf_clusters) -
887 (cpos - le32_to_cpu(rec.e_cpos));
888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
889 }
890
891 if ((!is_data && origin == SEEK_HOLE) ||
892 (is_data && origin == SEEK_DATA)) {
893 if (extoff > *offset)
894 *offset = extoff;
895 goto out_unlock;
896 }
897
898 if (!is_last)
899 cpos += clen;
900 }
901
902 if (origin == SEEK_HOLE) {
903 extoff = cpos;
904 extoff <<= cs_bits;
905 extlen = clen;
906 extlen <<= cs_bits;
907
908 if ((extoff + extlen) > inode->i_size)
909 extlen = inode->i_size - extoff;
910 extoff += extlen;
911 if (extoff > *offset)
912 *offset = extoff;
913 goto out_unlock;
914 }
915
916 ret = -ENXIO;
917
918out_unlock:
919
920 brelse(di_bh);
921
922 up_read(&OCFS2_I(inode)->ip_alloc_sem);
923
924 ocfs2_inode_unlock(inode, 0);
925out:
926 if (ret && ret != -ENXIO)
927 ret = -ENXIO;
928 return ret;
929}
930
835int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, 931int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
836 struct buffer_head *bhs[], int flags, 932 struct buffer_head *bhs[], int flags,
837 int (*validate)(struct super_block *sb, 933 int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index e79d41c2c90..67ea57d2fd5 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,6 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
53int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 53int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
54 u64 map_start, u64 map_len); 54 u64 map_start, u64 map_len);
55 55
56int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
57
56int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, 58int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
57 u32 *p_cluster, u32 *num_clusters, 59 u32 *p_cluster, u32 *num_clusters,
58 struct ocfs2_extent_list *el, 60 struct ocfs2_extent_list *el,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index de4ea1af041..6e396683c3d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1950,6 +1950,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1950 if (ret < 0) 1950 if (ret < 0)
1951 mlog_errno(ret); 1951 mlog_errno(ret);
1952 1952
1953 if (file->f_flags & O_SYNC)
1954 handle->h_sync = 1;
1955
1953 ocfs2_commit_trans(osb, handle); 1956 ocfs2_commit_trans(osb, handle);
1954 1957
1955out_inode_unlock: 1958out_inode_unlock:
@@ -2052,6 +2055,23 @@ out:
2052 return ret; 2055 return ret;
2053} 2056}
2054 2057
2058static void ocfs2_aiodio_wait(struct inode *inode)
2059{
2060 wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
2061
2062 wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
2063}
2064
2065static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2066{
2067 int blockmask = inode->i_sb->s_blocksize - 1;
2068 loff_t final_size = pos + count;
2069
2070 if ((pos & blockmask) || (final_size & blockmask))
2071 return 1;
2072 return 0;
2073}
2074
2055static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2075static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2056 struct file *file, 2076 struct file *file,
2057 loff_t pos, size_t count, 2077 loff_t pos, size_t count,
@@ -2230,6 +2250,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2230 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2250 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2231 int full_coherency = !(osb->s_mount_opt & 2251 int full_coherency = !(osb->s_mount_opt &
2232 OCFS2_MOUNT_COHERENCY_BUFFERED); 2252 OCFS2_MOUNT_COHERENCY_BUFFERED);
2253 int unaligned_dio = 0;
2233 2254
2234 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2255 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2235 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2256 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2297,6 +2318,10 @@ relock:
2297 goto out; 2318 goto out;
2298 } 2319 }
2299 2320
2321 if (direct_io && !is_sync_kiocb(iocb))
2322 unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
2323 *ppos);
2324
2300 /* 2325 /*
2301 * We can't complete the direct I/O as requested, fall back to 2326 * We can't complete the direct I/O as requested, fall back to
2302 * buffered I/O. 2327 * buffered I/O.
@@ -2311,6 +2336,18 @@ relock:
2311 goto relock; 2336 goto relock;
2312 } 2337 }
2313 2338
2339 if (unaligned_dio) {
2340 /*
2341 * Wait on previous unaligned aio to complete before
2342 * proceeding.
2343 */
2344 ocfs2_aiodio_wait(inode);
2345
2346 /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
2347 atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
2348 ocfs2_iocb_set_unaligned_aio(iocb);
2349 }
2350
2314 /* 2351 /*
2315 * To later detect whether a journal commit for sync writes is 2352 * To later detect whether a journal commit for sync writes is
2316 * necessary, we sample i_size, and cluster count here. 2353 * necessary, we sample i_size, and cluster count here.
@@ -2382,8 +2419,12 @@ out_dio:
2382 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { 2419 if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2383 rw_level = -1; 2420 rw_level = -1;
2384 have_alloc_sem = 0; 2421 have_alloc_sem = 0;
2422 unaligned_dio = 0;
2385 } 2423 }
2386 2424
2425 if (unaligned_dio)
2426 atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
2427
2387out: 2428out:
2388 if (rw_level != -1) 2429 if (rw_level != -1)
2389 ocfs2_rw_unlock(inode, rw_level); 2430 ocfs2_rw_unlock(inode, rw_level);
@@ -2591,6 +2632,57 @@ bail:
2591 return ret; 2632 return ret;
2592} 2633}
2593 2634
2635/* Refer generic_file_llseek_unlocked() */
2636static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
2637{
2638 struct inode *inode = file->f_mapping->host;
2639 int ret = 0;
2640
2641 mutex_lock(&inode->i_mutex);
2642
2643 switch (origin) {
2644 case SEEK_SET:
2645 break;
2646 case SEEK_END:
2647 offset += inode->i_size;
2648 break;
2649 case SEEK_CUR:
2650 if (offset == 0) {
2651 offset = file->f_pos;
2652 goto out;
2653 }
2654 offset += file->f_pos;
2655 break;
2656 case SEEK_DATA:
2657 case SEEK_HOLE:
2658 ret = ocfs2_seek_data_hole_offset(file, &offset, origin);
2659 if (ret)
2660 goto out;
2661 break;
2662 default:
2663 ret = -EINVAL;
2664 goto out;
2665 }
2666
2667 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2668 ret = -EINVAL;
2669 if (!ret && offset > inode->i_sb->s_maxbytes)
2670 ret = -EINVAL;
2671 if (ret)
2672 goto out;
2673
2674 if (offset != file->f_pos) {
2675 file->f_pos = offset;
2676 file->f_version = 0;
2677 }
2678
2679out:
2680 mutex_unlock(&inode->i_mutex);
2681 if (ret)
2682 return ret;
2683 return offset;
2684}
2685
2594const struct inode_operations ocfs2_file_iops = { 2686const struct inode_operations ocfs2_file_iops = {
2595 .setattr = ocfs2_setattr, 2687 .setattr = ocfs2_setattr,
2596 .getattr = ocfs2_getattr, 2688 .getattr = ocfs2_getattr,
@@ -2615,7 +2707,7 @@ const struct inode_operations ocfs2_special_file_iops = {
2615 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! 2707 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2616 */ 2708 */
2617const struct file_operations ocfs2_fops = { 2709const struct file_operations ocfs2_fops = {
2618 .llseek = generic_file_llseek, 2710 .llseek = ocfs2_file_llseek,
2619 .read = do_sync_read, 2711 .read = do_sync_read,
2620 .write = do_sync_write, 2712 .write = do_sync_write,
2621 .mmap = ocfs2_mmap, 2713 .mmap = ocfs2_mmap,
@@ -2663,7 +2755,7 @@ const struct file_operations ocfs2_dops = {
2663 * the cluster. 2755 * the cluster.
2664 */ 2756 */
2665const struct file_operations ocfs2_fops_no_plocks = { 2757const struct file_operations ocfs2_fops_no_plocks = {
2666 .llseek = generic_file_llseek, 2758 .llseek = ocfs2_file_llseek,
2667 .read = do_sync_read, 2759 .read = do_sync_read,
2668 .write = do_sync_write, 2760 .write = do_sync_write,
2669 .mmap = ocfs2_mmap, 2761 .mmap = ocfs2_mmap,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index a22d2c09889..17454a904d7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -951,7 +951,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
951 trace_ocfs2_cleanup_delete_inode( 951 trace_ocfs2_cleanup_delete_inode(
952 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); 952 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
953 if (sync_data) 953 if (sync_data)
954 write_inode_now(inode, 1); 954 filemap_write_and_wait(inode->i_mapping);
955 truncate_inode_pages(&inode->i_data, 0); 955 truncate_inode_pages(&inode->i_data, 0);
956} 956}
957 957
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1c508b149b3..88924a3133f 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,6 +43,9 @@ struct ocfs2_inode_info
43 /* protects extended attribute changes on this inode */ 43 /* protects extended attribute changes on this inode */
44 struct rw_semaphore ip_xattr_sem; 44 struct rw_semaphore ip_xattr_sem;
45 45
46 /* Number of outstanding AIO's which are not page aligned */
47 atomic_t ip_unaligned_aio;
48
46 /* These fields are protected by ip_lock */ 49 /* These fields are protected by ip_lock */
47 spinlock_t ip_lock; 50 spinlock_t ip_lock;
48 u32 ip_open_count; 51 u32 ip_open_count;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index bc91072b721..726ff265b29 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -122,7 +122,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
122 if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & 122 if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
123 (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { 123 (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
124 if (!capable(CAP_LINUX_IMMUTABLE)) 124 if (!capable(CAP_LINUX_IMMUTABLE))
125 goto bail_unlock; 125 goto bail_commit;
126 } 126 }
127 127
128 ocfs2_inode->ip_attr = flags; 128 ocfs2_inode->ip_attr = flags;
@@ -132,6 +132,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
132 if (status < 0) 132 if (status < 0)
133 mlog_errno(status); 133 mlog_errno(status);
134 134
135bail_commit:
135 ocfs2_commit_trans(osb, handle); 136 ocfs2_commit_trans(osb, handle);
136bail_unlock: 137bail_unlock:
137 ocfs2_inode_unlock(inode, 1); 138 ocfs2_inode_unlock(inode, 1);
@@ -381,7 +382,7 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
381 if (!oifi) { 382 if (!oifi) {
382 status = -ENOMEM; 383 status = -ENOMEM;
383 mlog_errno(status); 384 mlog_errno(status);
384 goto bail; 385 goto out_err;
385 } 386 }
386 387
387 if (o2info_from_user(*oifi, req)) 388 if (o2info_from_user(*oifi, req))
@@ -431,7 +432,7 @@ bail:
431 o2info_set_request_error(&oifi->ifi_req, req); 432 o2info_set_request_error(&oifi->ifi_req, req);
432 433
433 kfree(oifi); 434 kfree(oifi);
434 435out_err:
435 return status; 436 return status;
436} 437}
437 438
@@ -666,7 +667,7 @@ int ocfs2_info_handle_freefrag(struct inode *inode,
666 if (!oiff) { 667 if (!oiff) {
667 status = -ENOMEM; 668 status = -ENOMEM;
668 mlog_errno(status); 669 mlog_errno(status);
669 goto bail; 670 goto out_err;
670 } 671 }
671 672
672 if (o2info_from_user(*oiff, req)) 673 if (o2info_from_user(*oiff, req))
@@ -716,7 +717,7 @@ bail:
716 o2info_set_request_error(&oiff->iff_req, req); 717 o2info_set_request_error(&oiff->iff_req, req);
717 718
718 kfree(oiff); 719 kfree(oiff);
719 720out_err:
720 return status; 721 return status;
721} 722}
722 723
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 295d56454e8..0a42ae96dca 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1544,9 +1544,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1544 /* we need to run complete recovery for offline orphan slots */ 1544 /* we need to run complete recovery for offline orphan slots */
1545 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); 1545 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1546 1546
1547 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", 1547 printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\
1548 node_num, slot_num, 1548 "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
1549 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1549 MINOR(osb->sb->s_dev));
1550 1550
1551 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 1551 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1552 1552
@@ -1601,6 +1601,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1601 1601
1602 jbd2_journal_destroy(journal); 1602 jbd2_journal_destroy(journal);
1603 1603
1604 printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\
1605 "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
1606 MINOR(osb->sb->s_dev));
1604done: 1607done:
1605 /* drop the lock on this nodes journal */ 1608 /* drop the lock on this nodes journal */
1606 if (got_lock) 1609 if (got_lock)
@@ -1808,6 +1811,20 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
1808 * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This 1811 * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
1809 * is done to catch any orphans that are left over in orphan directories. 1812 * is done to catch any orphans that are left over in orphan directories.
1810 * 1813 *
1814 * It scans all slots, even ones that are in use. It does so to handle the
1815 * case described below:
1816 *
1817 * Node 1 has an inode it was using. The dentry went away due to memory
1818 * pressure. Node 1 closes the inode, but it's on the free list. The node
1819 * has the open lock.
1820 * Node 2 unlinks the inode. It grabs the dentry lock to notify others,
1821 * but node 1 has no dentry and doesn't get the message. It trylocks the
1822 * open lock, sees that another node has a PR, and does nothing.
1823 * Later node 2 runs its orphan dir. It igets the inode, trylocks the
1824 * open lock, sees the PR still, and does nothing.
1825 * Basically, we have to trigger an orphan iput on node 1. The only way
1826 * for this to happen is if node 1 runs node 2's orphan dir.
1827 *
1811 * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT 1828 * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
1812 * seconds. It gets an EX lock on os_lockres and checks sequence number 1829 * seconds. It gets an EX lock on os_lockres and checks sequence number
1813 * stored in LVB. If the sequence number has changed, it means some other 1830 * stored in LVB. If the sequence number has changed, it means some other
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 68cf2f6d3c6..a3385b63ff5 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -441,10 +441,11 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
441#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 441#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
442 442
443/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota 443/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
444 * update on dir + index leaf + dx root update for free list */ 444 * update on dir + index leaf + dx root update for free list +
445 * previous dirblock update in the free list */
445static inline int ocfs2_link_credits(struct super_block *sb) 446static inline int ocfs2_link_credits(struct super_block *sb)
446{ 447{
447 return 2*OCFS2_INODE_UPDATE_CREDITS + 3 + 448 return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
448 ocfs2_quota_trans_credits(sb); 449 ocfs2_quota_trans_credits(sb);
449} 450}
450 451
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 3e9393ca39e..9cd41083e99 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -61,7 +61,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
61static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, 61static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
62 struct page *page) 62 struct page *page)
63{ 63{
64 int ret; 64 int ret = VM_FAULT_NOPAGE;
65 struct inode *inode = file->f_path.dentry->d_inode; 65 struct inode *inode = file->f_path.dentry->d_inode;
66 struct address_space *mapping = inode->i_mapping; 66 struct address_space *mapping = inode->i_mapping;
67 loff_t pos = page_offset(page); 67 loff_t pos = page_offset(page);
@@ -71,32 +71,25 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
71 void *fsdata; 71 void *fsdata;
72 loff_t size = i_size_read(inode); 72 loff_t size = i_size_read(inode);
73 73
74 /*
75 * Another node might have truncated while we were waiting on
76 * cluster locks.
77 * We don't check size == 0 before the shift. This is borrowed
78 * from do_generic_file_read.
79 */
80 last_index = (size - 1) >> PAGE_CACHE_SHIFT; 74 last_index = (size - 1) >> PAGE_CACHE_SHIFT;
81 if (unlikely(!size || page->index > last_index)) {
82 ret = -EINVAL;
83 goto out;
84 }
85 75
86 /* 76 /*
87 * The i_size check above doesn't catch the case where nodes 77 * There are cases that lead to the page no longer bebongs to the
88 * truncated and then re-extended the file. We'll re-check the 78 * mapping.
89 * page mapping after taking the page lock inside of 79 * 1) pagecache truncates locally due to memory pressure.
90 * ocfs2_write_begin_nolock(). 80 * 2) pagecache truncates when another is taking EX lock against
81 * inode lock. see ocfs2_data_convert_worker.
82 *
83 * The i_size check doesn't catch the case where nodes truncated and
84 * then re-extended the file. We'll re-check the page mapping after
85 * taking the page lock inside of ocfs2_write_begin_nolock().
86 *
87 * Let VM retry with these cases.
91 */ 88 */
92 if (!PageUptodate(page) || page->mapping != inode->i_mapping) { 89 if ((page->mapping != inode->i_mapping) ||
93 /* 90 (!PageUptodate(page)) ||
94 * the page has been umapped in ocfs2_data_downconvert_worker. 91 (page_offset(page) >= size))
95 * So return 0 here and let VFS retry.
96 */
97 ret = 0;
98 goto out; 92 goto out;
99 }
100 93
101 /* 94 /*
102 * Call ocfs2_write_begin() and ocfs2_write_end() to take 95 * Call ocfs2_write_begin() and ocfs2_write_end() to take
@@ -116,17 +109,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
116 if (ret) { 109 if (ret) {
117 if (ret != -ENOSPC) 110 if (ret != -ENOSPC)
118 mlog_errno(ret); 111 mlog_errno(ret);
112 if (ret == -ENOMEM)
113 ret = VM_FAULT_OOM;
114 else
115 ret = VM_FAULT_SIGBUS;
119 goto out; 116 goto out;
120 } 117 }
121 118
122 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, 119 if (!locked_page) {
123 fsdata); 120 ret = VM_FAULT_NOPAGE;
124 if (ret < 0) {
125 mlog_errno(ret);
126 goto out; 121 goto out;
127 } 122 }
123 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
124 fsdata);
128 BUG_ON(ret != len); 125 BUG_ON(ret != len);
129 ret = 0; 126 ret = VM_FAULT_LOCKED;
130out: 127out:
131 return ret; 128 return ret;
132} 129}
@@ -168,8 +165,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
168 165
169out: 166out:
170 ocfs2_unblock_signals(&oldset); 167 ocfs2_unblock_signals(&oldset);
171 if (ret)
172 ret = VM_FAULT_SIGBUS;
173 return ret; 168 return ret;
174} 169}
175 170
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index d53cb706f14..184c76b8c29 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -745,7 +745,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
745 */ 745 */
746 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, 746 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
747 new_phys_cpos); 747 new_phys_cpos);
748 if (!new_phys_cpos) { 748 if (!*new_phys_cpos) {
749 ret = -ENOSPC; 749 ret = -ENOSPC;
750 goto out_commit; 750 goto out_commit;
751 } 751 }
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 409285854f6..d355e6e36b3 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -836,18 +836,65 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
836 836
837static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) 837static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
838{ 838{
839 __test_and_set_bit_le(bit, bitmap); 839 __set_bit_le(bit, bitmap);
840} 840}
841#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) 841#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
842 842
843static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) 843static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
844{ 844{
845 __test_and_clear_bit_le(bit, bitmap); 845 __clear_bit_le(bit, bitmap);
846} 846}
847#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) 847#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
848 848
849#define ocfs2_test_bit test_bit_le 849#define ocfs2_test_bit test_bit_le
850#define ocfs2_find_next_zero_bit find_next_zero_bit_le 850#define ocfs2_find_next_zero_bit find_next_zero_bit_le
851#define ocfs2_find_next_bit find_next_bit_le 851#define ocfs2_find_next_bit find_next_bit_le
852
853static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr)
854{
855#if BITS_PER_LONG == 64
856 *bit += ((unsigned long) addr & 7UL) << 3;
857 addr = (void *) ((unsigned long) addr & ~7UL);
858#elif BITS_PER_LONG == 32
859 *bit += ((unsigned long) addr & 3UL) << 3;
860 addr = (void *) ((unsigned long) addr & ~3UL);
861#else
862#error "how many bits you are?!"
863#endif
864 return addr;
865}
866
867static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap)
868{
869 bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
870 ocfs2_set_bit(bit, bitmap);
871}
872
873static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap)
874{
875 bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
876 ocfs2_clear_bit(bit, bitmap);
877}
878
879static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap)
880{
881 bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
882 return ocfs2_test_bit(bit, bitmap);
883}
884
885static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max,
886 int start)
887{
888 int fix = 0, ret, tmpmax;
889 bitmap = correct_addr_and_bit_unaligned(&fix, bitmap);
890 tmpmax = max + fix;
891 start += fix;
892
893 ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix;
894 if (ret > max)
895 return max;
896 return ret;
897}
898
852#endif /* OCFS2_H */ 899#endif /* OCFS2_H */
853 900
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dc8007fc924..f100bf70a90 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -404,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
404 int status = 0; 404 int status = 0;
405 struct ocfs2_quota_recovery *rec; 405 struct ocfs2_quota_recovery *rec;
406 406
407 mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); 407 printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for "
408 "slot %u\n", osb->dev_str, slot_num);
409
408 rec = ocfs2_alloc_quota_recovery(); 410 rec = ocfs2_alloc_quota_recovery();
409 if (!rec) 411 if (!rec)
410 return ERR_PTR(-ENOMEM); 412 return ERR_PTR(-ENOMEM);
@@ -549,8 +551,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
549 goto out_commit; 551 goto out_commit;
550 } 552 }
551 lock_buffer(qbh); 553 lock_buffer(qbh);
552 WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap)); 554 WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap));
553 ocfs2_clear_bit(bit, dchunk->dqc_bitmap); 555 ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap);
554 le32_add_cpu(&dchunk->dqc_free, 1); 556 le32_add_cpu(&dchunk->dqc_free, 1);
555 unlock_buffer(qbh); 557 unlock_buffer(qbh);
556 ocfs2_journal_dirty(handle, qbh); 558 ocfs2_journal_dirty(handle, qbh);
@@ -596,7 +598,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
596 struct inode *lqinode; 598 struct inode *lqinode;
597 unsigned int flags; 599 unsigned int flags;
598 600
599 mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); 601 printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
602 "slot %u\n", osb->dev_str, slot_num);
603
600 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 604 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
601 for (type = 0; type < MAXQUOTAS; type++) { 605 for (type = 0; type < MAXQUOTAS; type++) {
602 if (list_empty(&(rec->r_list[type]))) 606 if (list_empty(&(rec->r_list[type])))
@@ -612,8 +616,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
612 /* Someone else is holding the lock? Then he must be 616 /* Someone else is holding the lock? Then he must be
613 * doing the recovery. Just skip the file... */ 617 * doing the recovery. Just skip the file... */
614 if (status == -EAGAIN) { 618 if (status == -EAGAIN) {
615 mlog(ML_NOTICE, "skipping quota recovery for slot %d " 619 printk(KERN_NOTICE "ocfs2: Skipping quota recovery on "
616 "because quota file is locked.\n", slot_num); 620 "device (%s) for slot %d because quota file is "
621 "locked.\n", osb->dev_str, slot_num);
617 status = 0; 622 status = 0;
618 goto out_put; 623 goto out_put;
619 } else if (status < 0) { 624 } else if (status < 0) {
@@ -944,7 +949,7 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
944 * ol_quota_entries_per_block(sb); 949 * ol_quota_entries_per_block(sb);
945 } 950 }
946 951
947 found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0); 952 found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0);
948 /* We failed? */ 953 /* We failed? */
949 if (found == len) { 954 if (found == len) {
950 mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" 955 mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
@@ -1208,7 +1213,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1208 struct ocfs2_local_disk_chunk *dchunk; 1213 struct ocfs2_local_disk_chunk *dchunk;
1209 1214
1210 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; 1215 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
1211 ocfs2_set_bit(*offset, dchunk->dqc_bitmap); 1216 ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap);
1212 le32_add_cpu(&dchunk->dqc_free, -1); 1217 le32_add_cpu(&dchunk->dqc_free, -1);
1213} 1218}
1214 1219
@@ -1289,7 +1294,7 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
1289 (od->dq_chunk->qc_headerbh->b_data); 1294 (od->dq_chunk->qc_headerbh->b_data);
1290 /* Mark structure as freed */ 1295 /* Mark structure as freed */
1291 lock_buffer(od->dq_chunk->qc_headerbh); 1296 lock_buffer(od->dq_chunk->qc_headerbh);
1292 ocfs2_clear_bit(offset, dchunk->dqc_bitmap); 1297 ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap);
1293 le32_add_cpu(&dchunk->dqc_free, 1); 1298 le32_add_cpu(&dchunk->dqc_free, 1);
1294 unlock_buffer(od->dq_chunk->qc_headerbh); 1299 unlock_buffer(od->dq_chunk->qc_headerbh);
1295 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); 1300 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 26fc0014d50..1424c151ccc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -493,8 +493,8 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
493 goto bail; 493 goto bail;
494 } 494 }
495 } else 495 } else
496 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", 496 printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
497 slot); 497 "allocated to this node!\n", slot, osb->dev_str);
498 498
499 ocfs2_set_slot(si, slot, osb->node_num); 499 ocfs2_set_slot(si, slot, osb->node_num);
500 osb->slot_num = slot; 500 osb->slot_num = slot;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 19965b00c43..94368017edb 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -28,6 +28,7 @@
28#include "cluster/masklog.h" 28#include "cluster/masklog.h"
29#include "cluster/nodemanager.h" 29#include "cluster/nodemanager.h"
30#include "cluster/heartbeat.h" 30#include "cluster/heartbeat.h"
31#include "cluster/tcp.h"
31 32
32#include "stackglue.h" 33#include "stackglue.h"
33 34
@@ -256,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
256} 257}
257 258
258/* 259/*
260 * Check if this node is heartbeating and is connected to all other
261 * heartbeating nodes.
262 */
263static int o2cb_cluster_check(void)
264{
265 u8 node_num;
266 int i;
267 unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
268 unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
269
270 node_num = o2nm_this_node();
271 if (node_num == O2NM_MAX_NODES) {
272 printk(KERN_ERR "o2cb: This node has not been configured.\n");
273 return -EINVAL;
274 }
275
276 /*
277 * o2dlm expects o2net sockets to be created. If not, then
278 * dlm_join_domain() fails with a stack of errors which are both cryptic
279 * and incomplete. The idea here is to detect upfront whether we have
280 * managed to connect to all nodes or not. If not, then list the nodes
281 * to allow the user to check the configuration (incorrect IP, firewall,
282 * etc.) Yes, this is racy. But its not the end of the world.
283 */
284#define O2CB_MAP_STABILIZE_COUNT 60
285 for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
286 o2hb_fill_node_map(hbmap, sizeof(hbmap));
287 if (!test_bit(node_num, hbmap)) {
288 printk(KERN_ERR "o2cb: %s heartbeat has not been "
289 "started.\n", (o2hb_global_heartbeat_active() ?
290 "Global" : "Local"));
291 return -EINVAL;
292 }
293 o2net_fill_node_map(netmap, sizeof(netmap));
294 /* Force set the current node to allow easy compare */
295 set_bit(node_num, netmap);
296 if (!memcmp(hbmap, netmap, sizeof(hbmap)))
297 return 0;
298 if (i < O2CB_MAP_STABILIZE_COUNT)
299 msleep(1000);
300 }
301
302 printk(KERN_ERR "o2cb: This node could not connect to nodes:");
303 i = -1;
304 while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
305 i + 1)) < O2NM_MAX_NODES) {
306 if (!test_bit(i, netmap))
307 printk(" %u", i);
308 }
309 printk(".\n");
310
311 return -ENOTCONN;
312}
313
314/*
259 * Called from the dlm when it's about to evict a node. This is how the 315 * Called from the dlm when it's about to evict a node. This is how the
260 * classic stack signals node death. 316 * classic stack signals node death.
261 */ 317 */
@@ -263,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data)
263{ 319{
264 struct ocfs2_cluster_connection *conn = data; 320 struct ocfs2_cluster_connection *conn = data;
265 321
266 mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", 322 printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n",
267 node_num, conn->cc_namelen, conn->cc_name); 323 node_num, conn->cc_namelen, conn->cc_name);
268 324
269 conn->cc_recovery_handler(node_num, conn->cc_recovery_data); 325 conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
270} 326}
@@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
280 BUG_ON(conn == NULL); 336 BUG_ON(conn == NULL);
281 BUG_ON(conn->cc_proto == NULL); 337 BUG_ON(conn->cc_proto == NULL);
282 338
283 /* for now we only have one cluster/node, make sure we see it 339 /* Ensure cluster stack is up and all nodes are connected */
284 * in the heartbeat universe */ 340 rc = o2cb_cluster_check();
285 if (!o2hb_check_local_node_heartbeating()) { 341 if (rc) {
286 if (o2hb_global_heartbeat_active()) 342 printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
287 mlog(ML_ERROR, "Global heartbeat not started\n"); 343 "before retrying.\n");
288 rc = -EINVAL;
289 goto out; 344 goto out;
290 } 345 }
291 346
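
The new o2cb_cluster_check() above compares the heartbeat node bitmap against the o2net connection bitmap, retrying for up to O2CB_MAP_STABILIZE_COUNT seconds before reporting which peers never connected. A minimal sketch of the same compare-and-report idea using generic bitmap helpers follows; the MAX_NODES constant and the function name are illustrative, not part of the patch:

	#include <linux/bitmap.h>
	#include <linux/bitops.h>
	#include <linux/errno.h>
	#include <linux/printk.h>

	#define MAX_NODES 255	/* illustrative; o2cb uses O2NM_MAX_NODES */

	/*
	 * Return 0 if every heartbeating node is also network-connected,
	 * otherwise print the missing peers and return -ENOTCONN.
	 */
	static int check_node_maps(const unsigned long *hbmap,
				   const unsigned long *netmap)
	{
		int i = -1;

		if (bitmap_subset(hbmap, netmap, MAX_NODES))
			return 0;

		pr_err("cluster: not connected to nodes:");
		while ((i = find_next_bit(hbmap, MAX_NODES, i + 1)) < MAX_NODES) {
			if (!test_bit(i, netmap))
				pr_cont(" %d", i);
		}
		pr_cont(".\n");
		return -ENOTCONN;
	}

The patch itself gets the same effect with memcmp() after force-setting the local node in the net map, which avoids a per-bit subset walk in the common case.
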
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 56f61027236..4994f8b0e60 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -54,6 +54,7 @@
54#include "ocfs1_fs_compat.h" 54#include "ocfs1_fs_compat.h"
55 55
56#include "alloc.h" 56#include "alloc.h"
57#include "aops.h"
57#include "blockcheck.h" 58#include "blockcheck.h"
58#include "dlmglue.h" 59#include "dlmglue.h"
59#include "export.h" 60#include "export.h"
@@ -1107,9 +1108,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1107 1108
1108 ocfs2_set_ro_flag(osb, 1); 1109 ocfs2_set_ro_flag(osb, 1);
1109 1110
1110 printk(KERN_NOTICE "Readonly device detected. No cluster " 1111 printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
1111 "services will be utilized for this mount. Recovery " 1112 "Cluster services will not be used for this mount. "
1112 "will be skipped.\n"); 1113 "Recovery will be skipped.\n", osb->dev_str);
1113 } 1114 }
1114 1115
1115 if (!ocfs2_is_hard_readonly(osb)) { 1116 if (!ocfs2_is_hard_readonly(osb)) {
@@ -1616,12 +1617,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1616 return 0; 1617 return 0;
1617} 1618}
1618 1619
1620wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
1621
1619static int __init ocfs2_init(void) 1622static int __init ocfs2_init(void)
1620{ 1623{
1621 int status; 1624 int status, i;
1622 1625
1623 ocfs2_print_version(); 1626 ocfs2_print_version();
1624 1627
1628 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
1629 init_waitqueue_head(&ocfs2__ioend_wq[i]);
1630
1625 status = init_ocfs2_uptodate_cache(); 1631 status = init_ocfs2_uptodate_cache();
1626 if (status < 0) { 1632 if (status < 0) {
1627 mlog_errno(status); 1633 mlog_errno(status);
@@ -1760,7 +1766,7 @@ static void ocfs2_inode_init_once(void *data)
1760 ocfs2_extent_map_init(&oi->vfs_inode); 1766 ocfs2_extent_map_init(&oi->vfs_inode);
1761 INIT_LIST_HEAD(&oi->ip_io_markers); 1767 INIT_LIST_HEAD(&oi->ip_io_markers);
1762 oi->ip_dir_start_lookup = 0; 1768 oi->ip_dir_start_lookup = 0;
1763 1769 atomic_set(&oi->ip_unaligned_aio, 0);
1764 init_rwsem(&oi->ip_alloc_sem); 1770 init_rwsem(&oi->ip_alloc_sem);
1765 init_rwsem(&oi->ip_xattr_sem); 1771 init_rwsem(&oi->ip_xattr_sem);
1766 mutex_init(&oi->ip_io_mutex); 1772 mutex_init(&oi->ip_io_mutex);
@@ -1974,7 +1980,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1974 * If we failed before we got a uuid_str yet, we can't stop 1980 * If we failed before we got a uuid_str yet, we can't stop
1975 * heartbeat. Otherwise, do it. 1981 * heartbeat. Otherwise, do it.
1976 */ 1982 */
1977 if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str) 1983 if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
1984 !ocfs2_is_hard_readonly(osb))
1978 hangup_needed = 1; 1985 hangup_needed = 1;
1979 1986
1980 if (osb->cconn) 1987 if (osb->cconn)
@@ -2353,7 +2360,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2353 mlog_errno(status); 2360 mlog_errno(status);
2354 goto bail; 2361 goto bail;
2355 } 2362 }
2356 cleancache_init_shared_fs((char *)&uuid_net_key, sb); 2363 cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
2357 2364
2358bail: 2365bail:
2359 return status; 2366 return status;
@@ -2462,8 +2469,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2462 goto finally; 2469 goto finally;
2463 } 2470 }
2464 } else { 2471 } else {
2465 mlog(ML_NOTICE, "File system was not unmounted cleanly, " 2472 printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
2466 "recovering volume.\n"); 2473 "unmounted cleanly, recovering it.\n", osb->dev_str);
2467 } 2474 }
2468 2475
2469 local = ocfs2_mount_local(osb); 2476 local = ocfs2_mount_local(osb);
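
The super.c hunks above also wire up a small table of wait queues (ocfs2__ioend_wq) and a per-inode ip_unaligned_aio counter; together with the new aops.h include this suggests unaligned AIO now waits on a hashed wait queue rather than one embedded in every inode. A rough sketch of that hashed wait-queue pattern, where the table size and the modulo hash are assumptions rather than the patch's exact macro:

	#include <linux/fs.h>
	#include <linux/wait.h>

	#define IOEND_WQ_SZ 37			/* assumed table size */
	static wait_queue_head_t ioend_wq[IOEND_WQ_SZ];

	/* Share a fixed pool of wait queues between all inodes. */
	static wait_queue_head_t *ioend_wq_for(struct inode *inode)
	{
		return &ioend_wq[(unsigned long)inode % IOEND_WQ_SZ];
	}

	static void ioend_wq_init(void)
	{
		int i;

		for (i = 0; i < IOEND_WQ_SZ; i++)
			init_waitqueue_head(&ioend_wq[i]);
	}

Hashing the inode pointer keeps the memory cost constant, at the price of occasional false sharing when two inodes map to the same queue.
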
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 194fb22ef79..aa9e8777b09 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2376,16 +2376,18 @@ static int ocfs2_remove_value_outside(struct inode*inode,
2376 } 2376 }
2377 2377
2378 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); 2378 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
2379 if (ret < 0) {
2380 mlog_errno(ret);
2381 break;
2382 }
2383 2379
2384 ocfs2_commit_trans(osb, ctxt.handle); 2380 ocfs2_commit_trans(osb, ctxt.handle);
2385 if (ctxt.meta_ac) { 2381 if (ctxt.meta_ac) {
2386 ocfs2_free_alloc_context(ctxt.meta_ac); 2382 ocfs2_free_alloc_context(ctxt.meta_ac);
2387 ctxt.meta_ac = NULL; 2383 ctxt.meta_ac = NULL;
2388 } 2384 }
2385
2386 if (ret < 0) {
2387 mlog_errno(ret);
2388 break;
2389 }
2390
2389 } 2391 }
2390 2392
2391 if (ctxt.meta_ac) 2393 if (ctxt.meta_ac)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2db1bd3173b..851ba3dcdc2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1652,46 +1652,12 @@ out:
1652 return error; 1652 return error;
1653} 1653}
1654 1654
1655static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
1656 struct kstat *stat)
1657{
1658 struct inode *inode = dentry->d_inode;
1659 struct task_struct *task = get_proc_task(inode);
1660 int rc;
1661
1662 if (task == NULL)
1663 return -ESRCH;
1664
1665 rc = -EACCES;
1666 if (lock_trace(task))
1667 goto out_task;
1668
1669 generic_fillattr(inode, stat);
1670 unlock_trace(task);
1671 rc = 0;
1672out_task:
1673 put_task_struct(task);
1674 return rc;
1675}
1676
1677static const struct inode_operations proc_pid_link_inode_operations = { 1655static const struct inode_operations proc_pid_link_inode_operations = {
1678 .readlink = proc_pid_readlink, 1656 .readlink = proc_pid_readlink,
1679 .follow_link = proc_pid_follow_link, 1657 .follow_link = proc_pid_follow_link,
1680 .setattr = proc_setattr, 1658 .setattr = proc_setattr,
1681}; 1659};
1682 1660
1683static const struct inode_operations proc_fdinfo_link_inode_operations = {
1684 .setattr = proc_setattr,
1685 .getattr = proc_pid_fd_link_getattr,
1686};
1687
1688static const struct inode_operations proc_fd_link_inode_operations = {
1689 .readlink = proc_pid_readlink,
1690 .follow_link = proc_pid_follow_link,
1691 .setattr = proc_setattr,
1692 .getattr = proc_pid_fd_link_getattr,
1693};
1694
1695 1661
1696/* building an inode */ 1662/* building an inode */
1697 1663
@@ -1923,61 +1889,49 @@ out:
1923 1889
1924static int proc_fd_info(struct inode *inode, struct path *path, char *info) 1890static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1925{ 1891{
1926 struct task_struct *task; 1892 struct task_struct *task = get_proc_task(inode);
1927 struct files_struct *files; 1893 struct files_struct *files = NULL;
1928 struct file *file; 1894 struct file *file;
1929 int fd = proc_fd(inode); 1895 int fd = proc_fd(inode);
1930 int rc;
1931
1932 task = get_proc_task(inode);
1933 if (!task)
1934 return -ENOENT;
1935
1936 rc = -EACCES;
1937 if (lock_trace(task))
1938 goto out_task;
1939
1940 rc = -ENOENT;
1941 files = get_files_struct(task);
1942 if (files == NULL)
1943 goto out_unlock;
1944 1896
1945 /* 1897 if (task) {
1946 * We are not taking a ref to the file structure, so we must 1898 files = get_files_struct(task);
1947 * hold ->file_lock. 1899 put_task_struct(task);
1948 */ 1900 }
1949 spin_lock(&files->file_lock); 1901 if (files) {
1950 file = fcheck_files(files, fd); 1902 /*
1951 if (file) { 1903 * We are not taking a ref to the file structure, so we must
1952 unsigned int f_flags; 1904 * hold ->file_lock.
1953 struct fdtable *fdt; 1905 */
1954 1906 spin_lock(&files->file_lock);
1955 fdt = files_fdtable(files); 1907 file = fcheck_files(files, fd);
1956 f_flags = file->f_flags & ~O_CLOEXEC; 1908 if (file) {
1957 if (FD_ISSET(fd, fdt->close_on_exec)) 1909 unsigned int f_flags;
1958 f_flags |= O_CLOEXEC; 1910 struct fdtable *fdt;
1959 1911
1960 if (path) { 1912 fdt = files_fdtable(files);
1961 *path = file->f_path; 1913 f_flags = file->f_flags & ~O_CLOEXEC;
1962 path_get(&file->f_path); 1914 if (FD_ISSET(fd, fdt->close_on_exec))
1915 f_flags |= O_CLOEXEC;
1916
1917 if (path) {
1918 *path = file->f_path;
1919 path_get(&file->f_path);
1920 }
1921 if (info)
1922 snprintf(info, PROC_FDINFO_MAX,
1923 "pos:\t%lli\n"
1924 "flags:\t0%o\n",
1925 (long long) file->f_pos,
1926 f_flags);
1927 spin_unlock(&files->file_lock);
1928 put_files_struct(files);
1929 return 0;
1963 } 1930 }
1964 if (info) 1931 spin_unlock(&files->file_lock);
1965 snprintf(info, PROC_FDINFO_MAX, 1932 put_files_struct(files);
1966 "pos:\t%lli\n" 1933 }
1967 "flags:\t0%o\n", 1934 return -ENOENT;
1968 (long long) file->f_pos,
1969 f_flags);
1970 rc = 0;
1971 } else
1972 rc = -ENOENT;
1973 spin_unlock(&files->file_lock);
1974 put_files_struct(files);
1975
1976out_unlock:
1977 unlock_trace(task);
1978out_task:
1979 put_task_struct(task);
1980 return rc;
1981} 1935}
1982 1936
1983static int proc_fd_link(struct inode *inode, struct path *path) 1937static int proc_fd_link(struct inode *inode, struct path *path)
@@ -2072,7 +2026,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
2072 spin_unlock(&files->file_lock); 2026 spin_unlock(&files->file_lock);
2073 put_files_struct(files); 2027 put_files_struct(files);
2074 2028
2075 inode->i_op = &proc_fd_link_inode_operations; 2029 inode->i_op = &proc_pid_link_inode_operations;
2076 inode->i_size = 64; 2030 inode->i_size = 64;
2077 ei->op.proc_get_link = proc_fd_link; 2031 ei->op.proc_get_link = proc_fd_link;
2078 d_set_d_op(dentry, &tid_fd_dentry_operations); 2032 d_set_d_op(dentry, &tid_fd_dentry_operations);
@@ -2104,12 +2058,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
2104 if (fd == ~0U) 2058 if (fd == ~0U)
2105 goto out; 2059 goto out;
2106 2060
2107 result = ERR_PTR(-EACCES);
2108 if (lock_trace(task))
2109 goto out;
2110
2111 result = instantiate(dir, dentry, task, &fd); 2061 result = instantiate(dir, dentry, task, &fd);
2112 unlock_trace(task);
2113out: 2062out:
2114 put_task_struct(task); 2063 put_task_struct(task);
2115out_no_task: 2064out_no_task:
@@ -2129,28 +2078,23 @@ static int proc_readfd_common(struct file * filp, void * dirent,
2129 retval = -ENOENT; 2078 retval = -ENOENT;
2130 if (!p) 2079 if (!p)
2131 goto out_no_task; 2080 goto out_no_task;
2132
2133 retval = -EACCES;
2134 if (lock_trace(p))
2135 goto out;
2136
2137 retval = 0; 2081 retval = 0;
2138 2082
2139 fd = filp->f_pos; 2083 fd = filp->f_pos;
2140 switch (fd) { 2084 switch (fd) {
2141 case 0: 2085 case 0:
2142 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) 2086 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
2143 goto out_unlock; 2087 goto out;
2144 filp->f_pos++; 2088 filp->f_pos++;
2145 case 1: 2089 case 1:
2146 ino = parent_ino(dentry); 2090 ino = parent_ino(dentry);
2147 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) 2091 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
2148 goto out_unlock; 2092 goto out;
2149 filp->f_pos++; 2093 filp->f_pos++;
2150 default: 2094 default:
2151 files = get_files_struct(p); 2095 files = get_files_struct(p);
2152 if (!files) 2096 if (!files)
2153 goto out_unlock; 2097 goto out;
2154 rcu_read_lock(); 2098 rcu_read_lock();
2155 for (fd = filp->f_pos-2; 2099 for (fd = filp->f_pos-2;
2156 fd < files_fdtable(files)->max_fds; 2100 fd < files_fdtable(files)->max_fds;
@@ -2174,9 +2118,6 @@ static int proc_readfd_common(struct file * filp, void * dirent,
2174 rcu_read_unlock(); 2118 rcu_read_unlock();
2175 put_files_struct(files); 2119 put_files_struct(files);
2176 } 2120 }
2177
2178out_unlock:
2179 unlock_trace(p);
2180out: 2121out:
2181 put_task_struct(p); 2122 put_task_struct(p);
2182out_no_task: 2123out_no_task:
@@ -2254,7 +2195,6 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
2254 ei->fd = fd; 2195 ei->fd = fd;
2255 inode->i_mode = S_IFREG | S_IRUSR; 2196 inode->i_mode = S_IFREG | S_IRUSR;
2256 inode->i_fop = &proc_fdinfo_file_operations; 2197 inode->i_fop = &proc_fdinfo_file_operations;
2257 inode->i_op = &proc_fdinfo_link_inode_operations;
2258 d_set_d_op(dentry, &tid_fd_dentry_operations); 2198 d_set_d_op(dentry, &tid_fd_dentry_operations);
2259 d_add(dentry, inode); 2199 d_add(dentry, inode);
2260 /* Close the race of the process dying before we return the dentry */ 2200 /* Close the race of the process dying before we return the dentry */
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 586174168e2..80e4645f799 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -131,12 +131,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
131 K(i.freeswap), 131 K(i.freeswap),
132 K(global_page_state(NR_FILE_DIRTY)), 132 K(global_page_state(NR_FILE_DIRTY)),
133 K(global_page_state(NR_WRITEBACK)), 133 K(global_page_state(NR_WRITEBACK)),
134 K(global_page_state(NR_ANON_PAGES)
135#ifdef CONFIG_TRANSPARENT_HUGEPAGE 134#ifdef CONFIG_TRANSPARENT_HUGEPAGE
135 K(global_page_state(NR_ANON_PAGES)
136 + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * 136 + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
137 HPAGE_PMD_NR 137 HPAGE_PMD_NR),
138#else
139 K(global_page_state(NR_ANON_PAGES)),
138#endif 140#endif
139 ),
140 K(global_page_state(NR_FILE_MAPPED)), 141 K(global_page_state(NR_FILE_MAPPED)),
141 K(global_page_state(NR_SHMEM)), 142 K(global_page_state(NR_SHMEM)),
142 K(global_page_state(NR_SLAB_RECLAIMABLE) + 143 K(global_page_state(NR_SLAB_RECLAIMABLE) +
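
The meminfo hunk above rebuilds the AnonPages line so each configuration emits one complete K(...) argument: with CONFIG_TRANSPARENT_HUGEPAGE the reported value is NR_ANON_PAGES plus NR_ANON_TRANSPARENT_HUGEPAGES expanded into base pages, otherwise just NR_ANON_PAGES. As a rough worked example (HPAGE_PMD_NR taken as 512, i.e. x86-64 with 4 KiB pages, purely for illustration): 10,000 ordinary anonymous pages plus 20 transparent huge pages report as 10,000 + 20 * 512 = 20,240 pages, which K() prints as 80,960 kB.
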
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 42b274da92c..2a30d67dd6b 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -32,7 +32,7 @@ static cputime64_t get_idle_time(int cpu)
32 idle = kstat_cpu(cpu).cpustat.idle; 32 idle = kstat_cpu(cpu).cpustat.idle;
33 idle = cputime64_add(idle, arch_idle_time(cpu)); 33 idle = cputime64_add(idle, arch_idle_time(cpu));
34 } else 34 } else
35 idle = usecs_to_cputime(idle_time); 35 idle = nsecs_to_jiffies64(1000 * idle_time);
36 36
37 return idle; 37 return idle;
38} 38}
@@ -46,7 +46,7 @@ static cputime64_t get_iowait_time(int cpu)
46 /* !NO_HZ so we can rely on cpustat.iowait */ 46 /* !NO_HZ so we can rely on cpustat.iowait */
47 iowait = kstat_cpu(cpu).cpustat.iowait; 47 iowait = kstat_cpu(cpu).cpustat.iowait;
48 else 48 else
49 iowait = usecs_to_cputime(iowait_time); 49 iowait = nsecs_to_jiffies64(1000 * iowait_time);
50 50
51 return iowait; 51 return iowait;
52} 52}
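
With NO_HZ, get_idle_time() and get_iowait_time() receive the per-CPU accumulated time in microseconds, so the fix scales it to nanoseconds and converts with nsecs_to_jiffies64() instead of usecs_to_cputime(). A rough worked example, assuming HZ = 100: 2,500,000 µs * 1000 = 2,500,000,000 ns, and nsecs_to_jiffies64() of that is 250 jiffies, i.e. the expected 2.5 seconds.
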
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cd99bf55765..b0f450a2bb7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
12#include <linux/user.h> 12#include <linux/user.h>
13#include <linux/elf.h> 13#include <linux/elf.h>
14#include <linux/elfcore.h> 14#include <linux/elfcore.h>
15#include <linux/export.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/highmem.h> 17#include <linux/highmem.h>
17#include <linux/bootmem.h> 18#include <linux/bootmem.h>
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 2bd620f0d79..57bbf9078ac 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -167,6 +167,7 @@ int pstore_register(struct pstore_info *psi)
167 } 167 }
168 168
169 psinfo = psi; 169 psinfo = psi;
170 mutex_init(&psinfo->read_mutex);
170 spin_unlock(&pstore_lock); 171 spin_unlock(&pstore_lock);
171 172
172 if (owner && !try_module_get(owner)) { 173 if (owner && !try_module_get(owner)) {
@@ -195,30 +196,32 @@ EXPORT_SYMBOL_GPL(pstore_register);
195void pstore_get_records(int quiet) 196void pstore_get_records(int quiet)
196{ 197{
197 struct pstore_info *psi = psinfo; 198 struct pstore_info *psi = psinfo;
199 char *buf = NULL;
198 ssize_t size; 200 ssize_t size;
199 u64 id; 201 u64 id;
200 enum pstore_type_id type; 202 enum pstore_type_id type;
201 struct timespec time; 203 struct timespec time;
202 int failed = 0, rc; 204 int failed = 0, rc;
203 unsigned long flags;
204 205
205 if (!psi) 206 if (!psi)
206 return; 207 return;
207 208
208 spin_lock_irqsave(&psinfo->buf_lock, flags); 209 mutex_lock(&psi->read_mutex);
209 rc = psi->open(psi); 210 rc = psi->open(psi);
210 if (rc) 211 if (rc)
211 goto out; 212 goto out;
212 213
213 while ((size = psi->read(&id, &type, &time, psi)) > 0) { 214 while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) {
214 rc = pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, 215 rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size,
215 time, psi); 216 time, psi);
217 kfree(buf);
218 buf = NULL;
216 if (rc && (rc != -EEXIST || !quiet)) 219 if (rc && (rc != -EEXIST || !quiet))
217 failed++; 220 failed++;
218 } 221 }
219 psi->close(psi); 222 psi->close(psi);
220out: 223out:
221 spin_unlock_irqrestore(&psinfo->buf_lock, flags); 224 mutex_unlock(&psi->read_mutex);
222 225
223 if (failed) 226 if (failed)
224 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", 227 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
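
After this change a pstore backend's ->read() hands each record back in a freshly allocated *buf, which pstore_get_records() kfree()s once pstore_mkfile() has consumed it, and the whole walk is serialised by the new read_mutex instead of buf_lock. A minimal backend read() sketch under those assumptions; demo_next_record() is a made-up helper standing in for whatever bookkeeping a real backend keeps:

	#include <linux/pstore.h>
	#include <linux/slab.h>

	/* Hypothetical backend bookkeeping, not part of the patch. */
	extern void *demo_next_record(struct pstore_info *psi, u64 *id,
				      enum pstore_type_id *type,
				      struct timespec *time, size_t *len);

	static ssize_t demo_pstore_read(u64 *id, enum pstore_type_id *type,
					struct timespec *time, char **buf,
					struct pstore_info *psi)
	{
		size_t len;
		void *rec = demo_next_record(psi, id, type, time, &len);

		if (!rec)
			return 0;	/* zero or negative ends the loop */

		*buf = kmemdup(rec, len, GFP_KERNEL);
		if (!*buf)
			return -ENOMEM;
		return len;
	}
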
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index aae0edb95c6..35f4b0ecdeb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
286 /* caller already holds s_umount */ 286 /* caller already holds s_umount */
287 if (sb->s_flags & MS_RDONLY) 287 if (sb->s_flags & MS_RDONLY)
288 return -EROFS; 288 return -EROFS;
289 writeback_inodes_sb(sb); 289 writeback_inodes_sb(sb, WB_REASON_SYNC);
290 return 0; 290 return 0;
291 default: 291 default:
292 return -EINVAL; 292 return -EINVAL;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 05d6b0e78c9..dba43c3ea3a 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -449,8 +449,6 @@ EXPORT_SYMBOL(seq_path);
449 449
450/* 450/*
451 * Same as seq_path, but relative to supplied root. 451 * Same as seq_path, but relative to supplied root.
452 *
453 * root may be changed, see __d_path().
454 */ 452 */
455int seq_path_root(struct seq_file *m, struct path *path, struct path *root, 453int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
456 char *esc) 454 char *esc)
@@ -463,6 +461,8 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
463 char *p; 461 char *p;
464 462
465 p = __d_path(path, root, buf, size); 463 p = __d_path(path, root, buf, size);
464 if (!p)
465 return SEQ_SKIP;
466 res = PTR_ERR(p); 466 res = PTR_ERR(p);
467 if (!IS_ERR(p)) { 467 if (!IS_ERR(p)) {
468 char *end = mangle_path(buf, p, esc); 468 char *end = mangle_path(buf, p, esc);
@@ -474,7 +474,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
474 } 474 }
475 seq_commit(m, res); 475 seq_commit(m, res);
476 476
477 return res < 0 ? res : 0; 477 return res < 0 && res != -ENAMETOOLONG ? res : 0;
478} 478}
479 479
480/* 480/*
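
seq_path_root() now maps a NULL return from __d_path() (the path is not reachable from the supplied root) to SEQ_SKIP, and the caller no longer treats -ENAMETOOLONG as fatal, so a ->show() routine that propagates the return value simply drops that record. For reference, SEQ_SKIP from a show callback discards whatever was written for the current item; a small sketch with made-up names:

	#include <linux/seq_file.h>
	#include <linux/types.h>

	struct demo_entry {			/* hypothetical record type */
		const char *name;
		int value;
		bool hidden;
	};

	static int demo_show(struct seq_file *m, void *v)
	{
		struct demo_entry *e = v;

		if (e->hidden)
			return SEQ_SKIP;	/* drop this record's output */

		seq_printf(m, "%s %d\n", e->name, e->value);
		return 0;
	}
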
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 048b59d5b2f..c70111ebefd 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -78,6 +78,28 @@ config SQUASHFS_XZ
78 78
79 If unsure, say N. 79 If unsure, say N.
80 80
81config SQUASHFS_4K_DEVBLK_SIZE
82 bool "Use 4K device block size?"
83 depends on SQUASHFS
84 help
85 By default Squashfs sets the dev block size (sb_min_blocksize)
86 to 1K or the smallest block size supported by the block device
 87 (if larger). Because blocks are packed together and unaligned
 88 in Squashfs, this should reduce latency.
89
90 This, however, gives poor performance on MTD NAND devices where
91 the optimal I/O size is 4K (even though the devices can support
92 smaller block sizes).
93
94 Using a 4K device block size may also improve overall I/O
95 performance for some file access patterns (e.g. sequential
96 accesses of files in filesystem order) on all media.
97
98 Setting this option will force Squashfs to use a 4K device block
99 size by default.
100
101 If unsure, say N.
102
81config SQUASHFS_EMBEDDED 103config SQUASHFS_EMBEDDED
82 bool "Additional option for memory-constrained systems" 104 bool "Additional option for memory-constrained systems"
83 depends on SQUASHFS 105 depends on SQUASHFS
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index b4a4e539a08..e8e14645de9 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -36,6 +36,13 @@
36#define SQUASHFS_FILE_SIZE 131072 36#define SQUASHFS_FILE_SIZE 131072
37#define SQUASHFS_FILE_LOG 17 37#define SQUASHFS_FILE_LOG 17
38 38
39/* default size of block device I/O */
40#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE
41#define SQUASHFS_DEVBLK_SIZE 4096
42#else
43#define SQUASHFS_DEVBLK_SIZE 1024
44#endif
45
39#define SQUASHFS_FILE_MAX_SIZE 1048576 46#define SQUASHFS_FILE_MAX_SIZE 1048576
40#define SQUASHFS_FILE_MAX_LOG 20 47#define SQUASHFS_FILE_MAX_LOG 20
41 48
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 7438850c62d..2da1715452a 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -95,7 +95,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
95 } 95 }
96 msblk = sb->s_fs_info; 96 msblk = sb->s_fs_info;
97 97
98 msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); 98 msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
99 msblk->devblksize_log2 = ffz(~msblk->devblksize); 99 msblk->devblksize_log2 = ffz(~msblk->devblksize);
100 100
101 mutex_init(&msblk->read_data_mutex); 101 mutex_init(&msblk->read_data_mutex);
diff --git a/fs/statfs.c b/fs/statfs.c
index 8244924dec5..9cf04a11896 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -76,7 +76,7 @@ EXPORT_SYMBOL(vfs_statfs);
76int user_statfs(const char __user *pathname, struct kstatfs *st) 76int user_statfs(const char __user *pathname, struct kstatfs *st)
77{ 77{
78 struct path path; 78 struct path path;
79 int error = user_path(pathname, &path); 79 int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
80 if (!error) { 80 if (!error) {
81 error = vfs_statfs(&path, st); 81 error = vfs_statfs(&path, st);
82 path_put(&path); 82 path_put(&path);
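
user_statfs() now resolves the path with user_path_at(AT_FDCWD, ..., LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT, ...), so statfs() on an automount trigger forces the automount instead of reporting the covering directory's filesystem. Nothing changes syntactically for user space; a trivial example (the path is illustrative):

	#include <stdio.h>
	#include <sys/vfs.h>

	int main(void)
	{
		struct statfs st;

		/* If /net/fileserver is an automount point, this mounts it
		 * before the statistics are reported. */
		if (statfs("/net/fileserver", &st) == 0)
			printf("block size %ld, free blocks %ld\n",
			       (long)st.f_bsize, (long)st.f_bfree);
		return 0;
	}
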
diff --git a/fs/sync.c b/fs/sync.c
index c98a7477edf..101b8ef901d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
43 if (wait) 43 if (wait)
44 sync_inodes_sb(sb); 44 sync_inodes_sb(sb);
45 else 45 else
46 writeback_inodes_sb(sb); 46 writeback_inodes_sb(sb, WB_REASON_SYNC);
47 47
48 if (sb->s_op->sync_fs) 48 if (sb->s_op->sync_fs)
49 sb->s_op->sync_fs(sb, wait); 49 sb->s_op->sync_fs(sb, wait);
@@ -98,7 +98,7 @@ static void sync_filesystems(int wait)
98 */ 98 */
99SYSCALL_DEFINE0(sync) 99SYSCALL_DEFINE0(sync)
100{ 100{
101 wakeup_flusher_threads(0); 101 wakeup_flusher_threads(0, WB_REASON_SYNC);
102 sync_filesystems(0); 102 sync_filesystems(0);
103 sync_filesystems(1); 103 sync_filesystems(1);
104 if (unlikely(laptop_mode)) 104 if (unlikely(laptop_mode))
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 315de66e52b..bc4f94b2870 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -63,7 +63,7 @@
63static void shrink_liability(struct ubifs_info *c, int nr_to_write) 63static void shrink_liability(struct ubifs_info *c, int nr_to_write)
64{ 64{
65 down_read(&c->vfs_sb->s_umount); 65 down_read(&c->vfs_sb->s_umount);
66 writeback_inodes_sb(c->vfs_sb); 66 writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE);
67 up_read(&c->vfs_sb->s_umount); 67 up_read(&c->vfs_sb->s_umount);
68} 68}
69 69
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index eef109a1a92..b09ba2dd8b6 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -870,6 +870,22 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
870 spin_unlock(&dbg_lock); 870 spin_unlock(&dbg_lock);
871} 871}
872 872
873void dbg_dump_sleb(const struct ubifs_info *c,
874 const struct ubifs_scan_leb *sleb, int offs)
875{
876 struct ubifs_scan_node *snod;
877
878 printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n",
879 current->pid, sleb->lnum, offs);
880
881 list_for_each_entry(snod, &sleb->nodes, list) {
882 cond_resched();
883 printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum,
884 snod->offs, snod->len);
885 dbg_dump_node(c, snod->node);
886 }
887}
888
873void dbg_dump_leb(const struct ubifs_info *c, int lnum) 889void dbg_dump_leb(const struct ubifs_info *c, int lnum)
874{ 890{
875 struct ubifs_scan_leb *sleb; 891 struct ubifs_scan_leb *sleb;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index feb361e252a..8d9c4681018 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -269,6 +269,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
269void dbg_dump_lprops(struct ubifs_info *c); 269void dbg_dump_lprops(struct ubifs_info *c);
270void dbg_dump_lpt_info(struct ubifs_info *c); 270void dbg_dump_lpt_info(struct ubifs_info *c);
271void dbg_dump_leb(const struct ubifs_info *c, int lnum); 271void dbg_dump_leb(const struct ubifs_info *c, int lnum);
272void dbg_dump_sleb(const struct ubifs_info *c,
273 const struct ubifs_scan_leb *sleb, int offs);
272void dbg_dump_znode(const struct ubifs_info *c, 274void dbg_dump_znode(const struct ubifs_info *c,
273 const struct ubifs_znode *znode); 275 const struct ubifs_znode *znode);
274void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); 276void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat);
@@ -387,6 +389,9 @@ static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; }
387static inline void dbg_dump_leb(const struct ubifs_info *c, 389static inline void dbg_dump_leb(const struct ubifs_info *c,
388 int lnum) { return; } 390 int lnum) { return; }
389static inline void 391static inline void
392dbg_dump_sleb(const struct ubifs_info *c,
393 const struct ubifs_scan_leb *sleb, int offs) { return; }
394static inline void
390dbg_dump_znode(const struct ubifs_info *c, 395dbg_dump_znode(const struct ubifs_info *c,
391 const struct ubifs_znode *znode) { return; } 396 const struct ubifs_znode *znode) { return; }
392static inline void dbg_dump_heap(struct ubifs_info *c, 397static inline void dbg_dump_heap(struct ubifs_info *c,
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index af02790d932..ee4f43f4bb9 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -983,7 +983,7 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf)
983} 983}
984 984
985/** 985/**
986 * clean_an_unclean_leb - read and write a LEB to remove corruption. 986 * clean_an_unclean_leb - read and write a LEB to remove corruption.
987 * @c: UBIFS file-system description object 987 * @c: UBIFS file-system description object
988 * @ucleb: unclean LEB information 988 * @ucleb: unclean LEB information
989 * @sbuf: LEB-sized buffer to use 989 * @sbuf: LEB-sized buffer to use
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 93d938ad3d2..6094c5a5d7a 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c)
247 mst->total_dirty = cpu_to_le64(tmp64); 247 mst->total_dirty = cpu_to_le64(tmp64);
248 248
249 /* The indexing LEB does not contribute to dark space */ 249 /* The indexing LEB does not contribute to dark space */
250 tmp64 = (c->main_lebs - 1) * c->dark_wm; 250 tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm);
251 mst->total_dark = cpu_to_le64(tmp64); 251 mst->total_dark = cpu_to_le64(tmp64);
252 252
253 mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); 253 mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
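
The one-line sb.c change casts the LEB count to long long before the multiply: both operands are ints, so with a large main_lebs the product can exceed INT_MAX and wrap before it is ever widened, even though tmp64 itself is 64 bits. A small stand-alone illustration with made-up numbers:

	#include <stdio.h>

	int main(void)
	{
		int main_lebs = 70000, dark_wm = 65536;

		/* int * int is evaluated in 32 bits; the true product
		 * (4,587,454,464) overflows before the widening assignment. */
		long long wrong = (main_lebs - 1) * dark_wm;
		long long right = (long long)(main_lebs - 1) * dark_wm;

		printf("wrong=%lld right=%lld\n", wrong, right);
		return 0;
	}
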
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index b6c4b3795c4..76e4266d2e7 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -42,6 +42,8 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
42 int count, i; 42 int count, i;
43 43
44 count = be32_to_cpu(aclp->acl_cnt); 44 count = be32_to_cpu(aclp->acl_cnt);
45 if (count > XFS_ACL_MAX_ENTRIES)
46 return ERR_PTR(-EFSCORRUPTED);
45 47
46 acl = posix_acl_alloc(count, GFP_KERNEL); 48 acl = posix_acl_alloc(count, GFP_KERNEL);
47 if (!acl) 49 if (!acl)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 33b13310ee0..574d4ee9b62 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -189,7 +189,7 @@ xfs_end_io(
189 int error = 0; 189 int error = 0;
190 190
191 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 191 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
192 error = -EIO; 192 ioend->io_error = -EIO;
193 goto done; 193 goto done;
194 } 194 }
195 if (ioend->io_error) 195 if (ioend->io_error)
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index d4906e7c978..c1b55e59655 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -110,6 +110,7 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
110/* 110/*
111 * Query whether the requested number of additional bytes of extended 111 * Query whether the requested number of additional bytes of extended
112 * attribute space will be able to fit inline. 112 * attribute space will be able to fit inline.
113 *
113 * Returns zero if not, else the di_forkoff fork offset to be used in the 114 * Returns zero if not, else the di_forkoff fork offset to be used in the
114 * literal area for attribute data once the new bytes have been added. 115 * literal area for attribute data once the new bytes have been added.
115 * 116 *
@@ -122,7 +123,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
122 int offset; 123 int offset;
123 int minforkoff; /* lower limit on valid forkoff locations */ 124 int minforkoff; /* lower limit on valid forkoff locations */
124 int maxforkoff; /* upper limit on valid forkoff locations */ 125 int maxforkoff; /* upper limit on valid forkoff locations */
125 int dsize; 126 int dsize;
126 xfs_mount_t *mp = dp->i_mount; 127 xfs_mount_t *mp = dp->i_mount;
127 128
128 offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ 129 offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */
@@ -136,47 +137,60 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
136 return (offset >= minforkoff) ? minforkoff : 0; 137 return (offset >= minforkoff) ? minforkoff : 0;
137 } 138 }
138 139
139 if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { 140 /*
140 if (bytes <= XFS_IFORK_ASIZE(dp)) 141 * If the requested numbers of bytes is smaller or equal to the
141 return dp->i_d.di_forkoff; 142 * current attribute fork size we can always proceed.
143 *
144 * Note that if_bytes in the data fork might actually be larger than
145 * the current data fork size is due to delalloc extents. In that
146 * case either the extent count will go down when they are converted
147 * to real extents, or the delalloc conversion will take care of the
148 * literal area rebalancing.
149 */
150 if (bytes <= XFS_IFORK_ASIZE(dp))
151 return dp->i_d.di_forkoff;
152
153 /*
154 * For attr2 we can try to move the forkoff if there is space in the
155 * literal area, but for the old format we are done if there is no
156 * space in the fixed attribute fork.
157 */
158 if (!(mp->m_flags & XFS_MOUNT_ATTR2))
142 return 0; 159 return 0;
143 }
144 160
145 dsize = dp->i_df.if_bytes; 161 dsize = dp->i_df.if_bytes;
146 162
147 switch (dp->i_d.di_format) { 163 switch (dp->i_d.di_format) {
148 case XFS_DINODE_FMT_EXTENTS: 164 case XFS_DINODE_FMT_EXTENTS:
149 /* 165 /*
150 * If there is no attr fork and the data fork is extents, 166 * If there is no attr fork and the data fork is extents,
151 * determine if creating the default attr fork will result 167 * determine if creating the default attr fork will result
152 * in the extents form migrating to btree. If so, the 168 * in the extents form migrating to btree. If so, the
153 * minimum offset only needs to be the space required for 169 * minimum offset only needs to be the space required for
154 * the btree root. 170 * the btree root.
155 */ 171 */
156 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > 172 if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
157 xfs_default_attroffset(dp)) 173 xfs_default_attroffset(dp))
158 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); 174 dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
159 break; 175 break;
160
161 case XFS_DINODE_FMT_BTREE: 176 case XFS_DINODE_FMT_BTREE:
162 /* 177 /*
163 * If have data btree then keep forkoff if we have one, 178 * If we have a data btree then keep forkoff if we have one,
164 * otherwise we are adding a new attr, so then we set 179 * otherwise we are adding a new attr, so then we set
165 * minforkoff to where the btree root can finish so we have 180 * minforkoff to where the btree root can finish so we have
166 * plenty of room for attrs 181 * plenty of room for attrs
167 */ 182 */
168 if (dp->i_d.di_forkoff) { 183 if (dp->i_d.di_forkoff) {
169 if (offset < dp->i_d.di_forkoff) 184 if (offset < dp->i_d.di_forkoff)
170 return 0; 185 return 0;
171 else 186 return dp->i_d.di_forkoff;
172 return dp->i_d.di_forkoff; 187 }
173 } else 188 dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
174 dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
175 break; 189 break;
176 } 190 }
177 191
178 /* 192 /*
179 * A data fork btree root must have space for at least 193 * A data fork btree root must have space for at least
180 * MINDBTPTRS key/ptr pairs if the data fork is small or empty. 194 * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
181 */ 195 */
182 minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); 196 minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
@@ -186,10 +200,10 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
186 maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); 200 maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
187 maxforkoff = maxforkoff >> 3; /* rounded down */ 201 maxforkoff = maxforkoff >> 3; /* rounded down */
188 202
189 if (offset >= minforkoff && offset < maxforkoff)
190 return offset;
191 if (offset >= maxforkoff) 203 if (offset >= maxforkoff)
192 return maxforkoff; 204 return maxforkoff;
205 if (offset >= minforkoff)
206 return offset;
193 return 0; 207 return 0;
194} 208}
195 209
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index c68baeb0974..d0ab7883705 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2383,6 +2383,8 @@ xfs_bmap_btalloc(
2383 int tryagain; 2383 int tryagain;
2384 int error; 2384 int error;
2385 2385
2386 ASSERT(ap->length);
2387
2386 mp = ap->ip->i_mount; 2388 mp = ap->ip->i_mount;
2387 align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; 2389 align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
2388 if (unlikely(align)) { 2390 if (unlikely(align)) {
@@ -4629,6 +4631,8 @@ xfs_bmapi_allocate(
4629 int error; 4631 int error;
4630 int rt; 4632 int rt;
4631 4633
4634 ASSERT(bma->length > 0);
4635
4632 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); 4636 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip);
4633 4637
4634 /* 4638 /*
@@ -4849,6 +4853,7 @@ xfs_bmapi_write(
4849 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); 4853 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4850 ASSERT(!(flags & XFS_BMAPI_IGSTATE)); 4854 ASSERT(!(flags & XFS_BMAPI_IGSTATE));
4851 ASSERT(tp != NULL); 4855 ASSERT(tp != NULL);
4856 ASSERT(len > 0);
4852 4857
4853 whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 4858 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4854 XFS_ATTR_FORK : XFS_DATA_FORK; 4859 XFS_ATTR_FORK : XFS_DATA_FORK;
@@ -4918,9 +4923,22 @@ xfs_bmapi_write(
4918 bma.eof = eof; 4923 bma.eof = eof;
4919 bma.conv = !!(flags & XFS_BMAPI_CONVERT); 4924 bma.conv = !!(flags & XFS_BMAPI_CONVERT);
4920 bma.wasdel = wasdelay; 4925 bma.wasdel = wasdelay;
4921 bma.length = len;
4922 bma.offset = bno; 4926 bma.offset = bno;
4923 4927
4928 /*
4929 * There's a 32/64 bit type mismatch between the
4930 * allocation length request (which can be 64 bits in
4931 * length) and the bma length request, which is
4932 * xfs_extlen_t and therefore 32 bits. Hence we have to
4933 * check for 32-bit overflows and handle them here.
4934 */
4935 if (len > (xfs_filblks_t)MAXEXTLEN)
4936 bma.length = MAXEXTLEN;
4937 else
4938 bma.length = len;
4939
4940 ASSERT(len > 0);
4941 ASSERT(bma.length > 0);
4924 error = xfs_bmapi_allocate(&bma, flags); 4942 error = xfs_bmapi_allocate(&bma, flags);
4925 if (error) 4943 if (error)
4926 goto error0; 4944 goto error0;
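
The new comment in xfs_bmapi_write() spells out why the clamp is needed: the caller's len is a 64-bit xfs_filblks_t while bma.length is a 32-bit xfs_extlen_t, so requests above MAXEXTLEN are cut down and the surrounding loop maps the remainder on later passes (subject to the caller's nmap limit) instead of letting the 32-bit assignment silently truncate. As a rough worked example, taking MAXEXTLEN as 2^21 - 1 = 2,097,151 blocks (its value in this era of XFS): a 3,000,000-block request allocates at most 2,097,151 blocks on the first pass and the remaining 902,849 blocks on subsequent passes.
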
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1a3513881bc..eac97ef81e2 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -656,7 +656,7 @@ xfs_buf_item_committing(
656/* 656/*
657 * This is the ops vector shared by all buf log items. 657 * This is the ops vector shared by all buf log items.
658 */ 658 */
659static struct xfs_item_ops xfs_buf_item_ops = { 659static const struct xfs_item_ops xfs_buf_item_ops = {
660 .iop_size = xfs_buf_item_size, 660 .iop_size = xfs_buf_item_size,
661 .iop_format = xfs_buf_item_format, 661 .iop_format = xfs_buf_item_format,
662 .iop_pin = xfs_buf_item_pin, 662 .iop_pin = xfs_buf_item_pin,
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index bb3f71d236d..0dee0b71029 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -295,7 +295,7 @@ xfs_qm_dquot_logitem_committing(
295/* 295/*
296 * This is the ops vector for dquots 296 * This is the ops vector for dquots
297 */ 297 */
298static struct xfs_item_ops xfs_dquot_item_ops = { 298static const struct xfs_item_ops xfs_dquot_item_ops = {
299 .iop_size = xfs_qm_dquot_logitem_size, 299 .iop_size = xfs_qm_dquot_logitem_size,
300 .iop_format = xfs_qm_dquot_logitem_format, 300 .iop_format = xfs_qm_dquot_logitem_format,
301 .iop_pin = xfs_qm_dquot_logitem_pin, 301 .iop_pin = xfs_qm_dquot_logitem_pin,
@@ -483,7 +483,7 @@ xfs_qm_qoff_logitem_committing(
483{ 483{
484} 484}
485 485
486static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { 486static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
487 .iop_size = xfs_qm_qoff_logitem_size, 487 .iop_size = xfs_qm_qoff_logitem_size,
488 .iop_format = xfs_qm_qoff_logitem_format, 488 .iop_format = xfs_qm_qoff_logitem_format,
489 .iop_pin = xfs_qm_qoff_logitem_pin, 489 .iop_pin = xfs_qm_qoff_logitem_pin,
@@ -498,7 +498,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
498/* 498/*
499 * This is the ops vector shared by all quotaoff-start log items. 499 * This is the ops vector shared by all quotaoff-start log items.
500 */ 500 */
501static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { 501static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
502 .iop_size = xfs_qm_qoff_logitem_size, 502 .iop_size = xfs_qm_qoff_logitem_size,
503 .iop_format = xfs_qm_qoff_logitem_format, 503 .iop_format = xfs_qm_qoff_logitem_format,
504 .iop_pin = xfs_qm_qoff_logitem_pin, 504 .iop_pin = xfs_qm_qoff_logitem_pin,
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index da108977b21..558910f5e3c 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -98,22 +98,22 @@ xfs_fs_encode_fh(
98 switch (fileid_type) { 98 switch (fileid_type) {
99 case FILEID_INO32_GEN_PARENT: 99 case FILEID_INO32_GEN_PARENT:
100 spin_lock(&dentry->d_lock); 100 spin_lock(&dentry->d_lock);
101 fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino; 101 fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
102 fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; 102 fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
103 spin_unlock(&dentry->d_lock); 103 spin_unlock(&dentry->d_lock);
104 /*FALLTHRU*/ 104 /*FALLTHRU*/
105 case FILEID_INO32_GEN: 105 case FILEID_INO32_GEN:
106 fid->i32.ino = inode->i_ino; 106 fid->i32.ino = XFS_I(inode)->i_ino;
107 fid->i32.gen = inode->i_generation; 107 fid->i32.gen = inode->i_generation;
108 break; 108 break;
109 case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: 109 case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
110 spin_lock(&dentry->d_lock); 110 spin_lock(&dentry->d_lock);
111 fid64->parent_ino = dentry->d_parent->d_inode->i_ino; 111 fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
112 fid64->parent_gen = dentry->d_parent->d_inode->i_generation; 112 fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
113 spin_unlock(&dentry->d_lock); 113 spin_unlock(&dentry->d_lock);
114 /*FALLTHRU*/ 114 /*FALLTHRU*/
115 case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: 115 case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
116 fid64->ino = inode->i_ino; 116 fid64->ino = XFS_I(inode)->i_ino;
117 fid64->gen = inode->i_generation; 117 fid64->gen = inode->i_generation;
118 break; 118 break;
119 } 119 }
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d22e6262343..35c2aff38b2 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -217,7 +217,7 @@ xfs_efi_item_committing(
217/* 217/*
218 * This is the ops vector shared by all efi log items. 218 * This is the ops vector shared by all efi log items.
219 */ 219 */
220static struct xfs_item_ops xfs_efi_item_ops = { 220static const struct xfs_item_ops xfs_efi_item_ops = {
221 .iop_size = xfs_efi_item_size, 221 .iop_size = xfs_efi_item_size,
222 .iop_format = xfs_efi_item_format, 222 .iop_format = xfs_efi_item_format,
223 .iop_pin = xfs_efi_item_pin, 223 .iop_pin = xfs_efi_item_pin,
@@ -477,7 +477,7 @@ xfs_efd_item_committing(
477/* 477/*
478 * This is the ops vector shared by all efd log items. 478 * This is the ops vector shared by all efd log items.
479 */ 479 */
480static struct xfs_item_ops xfs_efd_item_ops = { 480static const struct xfs_item_ops xfs_efd_item_ops = {
481 .iop_size = xfs_efd_item_size, 481 .iop_size = xfs_efd_item_size,
482 .iop_format = xfs_efd_item_format, 482 .iop_format = xfs_efd_item_format,
483 .iop_pin = xfs_efd_item_pin, 483 .iop_pin = xfs_efd_item_pin,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c0237c602f1..755ee816488 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2835,6 +2835,27 @@ corrupt_out:
2835 return XFS_ERROR(EFSCORRUPTED); 2835 return XFS_ERROR(EFSCORRUPTED);
2836} 2836}
2837 2837
2838void
2839xfs_promote_inode(
2840 struct xfs_inode *ip)
2841{
2842 struct xfs_buf *bp;
2843
2844 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2845
2846 bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno,
2847 ip->i_imap.im_len, XBF_TRYLOCK);
2848 if (!bp)
2849 return;
2850
2851 if (XFS_BUF_ISDELAYWRITE(bp)) {
2852 xfs_buf_delwri_promote(bp);
2853 wake_up_process(ip->i_mount->m_ddev_targp->bt_task);
2854 }
2855
2856 xfs_buf_relse(bp);
2857}
2858
2838/* 2859/*
2839 * Return a pointer to the extent record at file index idx. 2860 * Return a pointer to the extent record at file index idx.
2840 */ 2861 */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 760140d1dd6..b4cd4739f98 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -498,6 +498,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
498void xfs_iext_realloc(xfs_inode_t *, int, int); 498void xfs_iext_realloc(xfs_inode_t *, int, int);
499void xfs_iunpin_wait(xfs_inode_t *); 499void xfs_iunpin_wait(xfs_inode_t *);
500int xfs_iflush(xfs_inode_t *, uint); 500int xfs_iflush(xfs_inode_t *, uint);
501void xfs_promote_inode(struct xfs_inode *);
501void xfs_lock_inodes(xfs_inode_t **, int, uint); 502void xfs_lock_inodes(xfs_inode_t **, int, uint);
502void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 503void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
503 504
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index b7cf21ba240..abaafdbb3e6 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -795,7 +795,7 @@ xfs_inode_item_committing(
795/* 795/*
796 * This is the ops vector shared by all buf log items. 796 * This is the ops vector shared by all buf log items.
797 */ 797 */
798static struct xfs_item_ops xfs_inode_item_ops = { 798static const struct xfs_item_ops xfs_inode_item_ops = {
799 .iop_size = xfs_inode_item_size, 799 .iop_size = xfs_inode_item_size,
800 .iop_format = xfs_inode_item_format, 800 .iop_format = xfs_inode_item_format,
801 .iop_pin = xfs_inode_item_pin, 801 .iop_pin = xfs_inode_item_pin,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2758a6277c5..34817adf4b9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -150,6 +150,117 @@ xlog_grant_add_space(
150 } while (head_val != old); 150 } while (head_val != old);
151} 151}
152 152
153STATIC bool
154xlog_reserveq_wake(
155 struct log *log,
156 int *free_bytes)
157{
158 struct xlog_ticket *tic;
159 int need_bytes;
160
161 list_for_each_entry(tic, &log->l_reserveq, t_queue) {
162 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
163 need_bytes = tic->t_unit_res * tic->t_cnt;
164 else
165 need_bytes = tic->t_unit_res;
166
167 if (*free_bytes < need_bytes)
168 return false;
169 *free_bytes -= need_bytes;
170
171 trace_xfs_log_grant_wake_up(log, tic);
172 wake_up(&tic->t_wait);
173 }
174
175 return true;
176}
177
178STATIC bool
179xlog_writeq_wake(
180 struct log *log,
181 int *free_bytes)
182{
183 struct xlog_ticket *tic;
184 int need_bytes;
185
186 list_for_each_entry(tic, &log->l_writeq, t_queue) {
187 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
188
189 need_bytes = tic->t_unit_res;
190
191 if (*free_bytes < need_bytes)
192 return false;
193 *free_bytes -= need_bytes;
194
195 trace_xfs_log_regrant_write_wake_up(log, tic);
196 wake_up(&tic->t_wait);
197 }
198
199 return true;
200}
201
202STATIC int
203xlog_reserveq_wait(
204 struct log *log,
205 struct xlog_ticket *tic,
206 int need_bytes)
207{
208 list_add_tail(&tic->t_queue, &log->l_reserveq);
209
210 do {
211 if (XLOG_FORCED_SHUTDOWN(log))
212 goto shutdown;
213 xlog_grant_push_ail(log, need_bytes);
214
215 XFS_STATS_INC(xs_sleep_logspace);
216 trace_xfs_log_grant_sleep(log, tic);
217
218 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
219 trace_xfs_log_grant_wake(log, tic);
220
221 spin_lock(&log->l_grant_reserve_lock);
222 if (XLOG_FORCED_SHUTDOWN(log))
223 goto shutdown;
224 } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes);
225
226 list_del_init(&tic->t_queue);
227 return 0;
228shutdown:
229 list_del_init(&tic->t_queue);
230 return XFS_ERROR(EIO);
231}
232
233STATIC int
234xlog_writeq_wait(
235 struct log *log,
236 struct xlog_ticket *tic,
237 int need_bytes)
238{
239 list_add_tail(&tic->t_queue, &log->l_writeq);
240
241 do {
242 if (XLOG_FORCED_SHUTDOWN(log))
243 goto shutdown;
244 xlog_grant_push_ail(log, need_bytes);
245
246 XFS_STATS_INC(xs_sleep_logspace);
247 trace_xfs_log_regrant_write_sleep(log, tic);
248
249 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
250 trace_xfs_log_regrant_write_wake(log, tic);
251
252 spin_lock(&log->l_grant_write_lock);
253 if (XLOG_FORCED_SHUTDOWN(log))
254 goto shutdown;
255 } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes);
256
257 list_del_init(&tic->t_queue);
258 return 0;
259shutdown:
260 list_del_init(&tic->t_queue);
261 return XFS_ERROR(EIO);
262}
263
153static void 264static void
154xlog_tic_reset_res(xlog_ticket_t *tic) 265xlog_tic_reset_res(xlog_ticket_t *tic)
155{ 266{
@@ -350,8 +461,19 @@ xfs_log_reserve(
350 retval = xlog_grant_log_space(log, internal_ticket); 461 retval = xlog_grant_log_space(log, internal_ticket);
351 } 462 }
352 463
464 if (unlikely(retval)) {
465 /*
466 * If we are failing, make sure the ticket doesn't have any
467 * current reservations. We don't want to add this back
468 * when the ticket/ transaction gets cancelled.
469 */
470 internal_ticket->t_curr_res = 0;
471 /* ungrant will give back unit_res * t_cnt. */
472 internal_ticket->t_cnt = 0;
473 }
474
353 return retval; 475 return retval;
354} /* xfs_log_reserve */ 476}
355 477
356 478
357/* 479/*
@@ -626,7 +748,7 @@ xfs_log_item_init(
626 struct xfs_mount *mp, 748 struct xfs_mount *mp,
627 struct xfs_log_item *item, 749 struct xfs_log_item *item,
628 int type, 750 int type,
629 struct xfs_item_ops *ops) 751 const struct xfs_item_ops *ops)
630{ 752{
631 item->li_mountp = mp; 753 item->li_mountp = mp;
632 item->li_ailp = mp->m_ail; 754 item->li_ailp = mp->m_ail;
@@ -2481,8 +2603,8 @@ restart:
2481/* 2603/*
2482 * Atomically get the log space required for a log ticket. 2604 * Atomically get the log space required for a log ticket.
2483 * 2605 *
2484 * Once a ticket gets put onto the reserveq, it will only return after 2606 * Once a ticket gets put onto the reserveq, it will only return after the
2485 * the needed reservation is satisfied. 2607 * needed reservation is satisfied.
2486 * 2608 *
2487 * This function is structured so that it has a lock free fast path. This is 2609 * This function is structured so that it has a lock free fast path. This is
2488 * necessary because every new transaction reservation will come through this 2610 * necessary because every new transaction reservation will come through this
@@ -2490,113 +2612,53 @@ restart:
2490 * every pass. 2612 * every pass.
2491 * 2613 *
2492 * As tickets are only ever moved on and off the reserveq under the 2614 * As tickets are only ever moved on and off the reserveq under the
2493 * l_grant_reserve_lock, we only need to take that lock if we are going 2615 * l_grant_reserve_lock, we only need to take that lock if we are going to add
2494 * to add the ticket to the queue and sleep. We can avoid taking the lock if the 2616 * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
2495 * ticket was never added to the reserveq because the t_queue list head will be 2617 * was never added to the reserveq because the t_queue list head will be empty
2496 * empty and we hold the only reference to it so it can safely be checked 2618 * and we hold the only reference to it so it can safely be checked unlocked.
2497 * unlocked.
2498 */ 2619 */
2499STATIC int 2620STATIC int
2500xlog_grant_log_space(xlog_t *log, 2621xlog_grant_log_space(
2501 xlog_ticket_t *tic) 2622 struct log *log,
2623 struct xlog_ticket *tic)
2502{ 2624{
2503 int free_bytes; 2625 int free_bytes, need_bytes;
2504 int need_bytes; 2626 int error = 0;
2505 2627
2506#ifdef DEBUG 2628 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
2507 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2508 panic("grant Recovery problem");
2509#endif
2510 2629
2511 trace_xfs_log_grant_enter(log, tic); 2630 trace_xfs_log_grant_enter(log, tic);
2512 2631
2632 /*
2633 * If there are other waiters on the queue then give them a chance at
 2634 * logspace before us. Wake up the first waiters; if we do not wake
2635 * up all the waiters then go to sleep waiting for more free space,
2636 * otherwise try to get some space for this transaction.
2637 */
2513 need_bytes = tic->t_unit_res; 2638 need_bytes = tic->t_unit_res;
2514 if (tic->t_flags & XFS_LOG_PERM_RESERV) 2639 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2515 need_bytes *= tic->t_ocnt; 2640 need_bytes *= tic->t_ocnt;
2516
2517 /* something is already sleeping; insert new transaction at end */
2518 if (!list_empty_careful(&log->l_reserveq)) {
2519 spin_lock(&log->l_grant_reserve_lock);
2520 /* recheck the queue now we are locked */
2521 if (list_empty(&log->l_reserveq)) {
2522 spin_unlock(&log->l_grant_reserve_lock);
2523 goto redo;
2524 }
2525 list_add_tail(&tic->t_queue, &log->l_reserveq);
2526
2527 trace_xfs_log_grant_sleep1(log, tic);
2528
2529 /*
2530 * Gotta check this before going to sleep, while we're
2531 * holding the grant lock.
2532 */
2533 if (XLOG_FORCED_SHUTDOWN(log))
2534 goto error_return;
2535
2536 XFS_STATS_INC(xs_sleep_logspace);
2537 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2538
2539 /*
2540 * If we got an error, and the filesystem is shutting down,
2541 * we'll catch it down below. So just continue...
2542 */
2543 trace_xfs_log_grant_wake1(log, tic);
2544 }
2545
2546redo:
2547 if (XLOG_FORCED_SHUTDOWN(log))
2548 goto error_return_unlocked;
2549
2550 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); 2641 free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
2551 if (free_bytes < need_bytes) { 2642 if (!list_empty_careful(&log->l_reserveq)) {
2552 spin_lock(&log->l_grant_reserve_lock); 2643 spin_lock(&log->l_grant_reserve_lock);
2553 if (list_empty(&tic->t_queue)) 2644 if (!xlog_reserveq_wake(log, &free_bytes) ||
2554 list_add_tail(&tic->t_queue, &log->l_reserveq); 2645 free_bytes < need_bytes)
2555 2646 error = xlog_reserveq_wait(log, tic, need_bytes);
2556 trace_xfs_log_grant_sleep2(log, tic); 2647 spin_unlock(&log->l_grant_reserve_lock);
2557 2648 } else if (free_bytes < need_bytes) {
2558 if (XLOG_FORCED_SHUTDOWN(log))
2559 goto error_return;
2560
2561 xlog_grant_push_ail(log, need_bytes);
2562
2563 XFS_STATS_INC(xs_sleep_logspace);
2564 xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
2565
2566 trace_xfs_log_grant_wake2(log, tic);
2567 goto redo;
2568 }
2569
2570 if (!list_empty(&tic->t_queue)) {
2571 spin_lock(&log->l_grant_reserve_lock); 2649 spin_lock(&log->l_grant_reserve_lock);
2572 list_del_init(&tic->t_queue); 2650 error = xlog_reserveq_wait(log, tic, need_bytes);
2573 spin_unlock(&log->l_grant_reserve_lock); 2651 spin_unlock(&log->l_grant_reserve_lock);
2574 } 2652 }
2653 if (error)
2654 return error;
2575 2655
2576 /* we've got enough space */
2577 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); 2656 xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
2578 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); 2657 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2579 trace_xfs_log_grant_exit(log, tic); 2658 trace_xfs_log_grant_exit(log, tic);
2580 xlog_verify_grant_tail(log); 2659 xlog_verify_grant_tail(log);
2581 return 0; 2660 return 0;
2582 2661}
2583error_return_unlocked:
2584 spin_lock(&log->l_grant_reserve_lock);
2585error_return:
2586 list_del_init(&tic->t_queue);
2587 spin_unlock(&log->l_grant_reserve_lock);
2588 trace_xfs_log_grant_error(log, tic);
2589
2590 /*
2591 * If we are failing, make sure the ticket doesn't have any
2592 * current reservations. We don't want to add this back when
2593 * the ticket/transaction gets cancelled.
2594 */
2595 tic->t_curr_res = 0;
2596 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2597 return XFS_ERROR(EIO);
2598} /* xlog_grant_log_space */
2599
2600 2662
2601/* 2663/*
2602 * Replenish the byte reservation required by moving the grant write head. 2664 * Replenish the byte reservation required by moving the grant write head.
@@ -2605,10 +2667,12 @@ error_return:
2605 * free fast path. 2667 * free fast path.
2606 */ 2668 */
2607STATIC int 2669STATIC int
2608xlog_regrant_write_log_space(xlog_t *log, 2670xlog_regrant_write_log_space(
2609 xlog_ticket_t *tic) 2671 struct log *log,
2672 struct xlog_ticket *tic)
2610{ 2673{
2611 int free_bytes, need_bytes; 2674 int free_bytes, need_bytes;
2675 int error = 0;
2612 2676
2613 tic->t_curr_res = tic->t_unit_res; 2677 tic->t_curr_res = tic->t_unit_res;
2614 xlog_tic_reset_res(tic); 2678 xlog_tic_reset_res(tic);
@@ -2616,104 +2680,38 @@ xlog_regrant_write_log_space(xlog_t *log,
2616 if (tic->t_cnt > 0) 2680 if (tic->t_cnt > 0)
2617 return 0; 2681 return 0;
2618 2682
2619#ifdef DEBUG 2683 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
2620 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2621 panic("regrant Recovery problem");
2622#endif
2623 2684
2624 trace_xfs_log_regrant_write_enter(log, tic); 2685 trace_xfs_log_regrant_write_enter(log, tic);
2625 if (XLOG_FORCED_SHUTDOWN(log))
2626 goto error_return_unlocked;
2627 2686
2628 /* If there are other waiters on the queue then give them a 2687 /*
2629 * chance at logspace before us. Wake up the first waiters, 2688 * If there are other waiters on the queue then give them a chance at
2630 * if we do not wake up all the waiters then go to sleep waiting 2689 * logspace before us. Wake up the first waiters, if we do not wake
2631 * for more free space, otherwise try to get some space for 2690 * up all the waiters then go to sleep waiting for more free space,
2632 * this transaction. 2691 * otherwise try to get some space for this transaction.
2633 */ 2692 */
2634 need_bytes = tic->t_unit_res; 2693 need_bytes = tic->t_unit_res;
2635 if (!list_empty_careful(&log->l_writeq)) {
2636 struct xlog_ticket *ntic;
2637
2638 spin_lock(&log->l_grant_write_lock);
2639 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2640 list_for_each_entry(ntic, &log->l_writeq, t_queue) {
2641 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2642
2643 if (free_bytes < ntic->t_unit_res)
2644 break;
2645 free_bytes -= ntic->t_unit_res;
2646 wake_up(&ntic->t_wait);
2647 }
2648
2649 if (ntic != list_first_entry(&log->l_writeq,
2650 struct xlog_ticket, t_queue)) {
2651 if (list_empty(&tic->t_queue))
2652 list_add_tail(&tic->t_queue, &log->l_writeq);
2653 trace_xfs_log_regrant_write_sleep1(log, tic);
2654
2655 xlog_grant_push_ail(log, need_bytes);
2656
2657 XFS_STATS_INC(xs_sleep_logspace);
2658 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2659 trace_xfs_log_regrant_write_wake1(log, tic);
2660 } else
2661 spin_unlock(&log->l_grant_write_lock);
2662 }
2663
2664redo:
2665 if (XLOG_FORCED_SHUTDOWN(log))
2666 goto error_return_unlocked;
2667
2668 free_bytes = xlog_space_left(log, &log->l_grant_write_head); 2694 free_bytes = xlog_space_left(log, &log->l_grant_write_head);
2669 if (free_bytes < need_bytes) { 2695 if (!list_empty_careful(&log->l_writeq)) {
2670 spin_lock(&log->l_grant_write_lock); 2696 spin_lock(&log->l_grant_write_lock);
2671 if (list_empty(&tic->t_queue)) 2697 if (!xlog_writeq_wake(log, &free_bytes) ||
2672 list_add_tail(&tic->t_queue, &log->l_writeq); 2698 free_bytes < need_bytes)
2673 2699 error = xlog_writeq_wait(log, tic, need_bytes);
2674 if (XLOG_FORCED_SHUTDOWN(log)) 2700 spin_unlock(&log->l_grant_write_lock);
2675 goto error_return; 2701 } else if (free_bytes < need_bytes) {
2676
2677 xlog_grant_push_ail(log, need_bytes);
2678
2679 XFS_STATS_INC(xs_sleep_logspace);
2680 trace_xfs_log_regrant_write_sleep2(log, tic);
2681 xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
2682
2683 trace_xfs_log_regrant_write_wake2(log, tic);
2684 goto redo;
2685 }
2686
2687 if (!list_empty(&tic->t_queue)) {
2688 spin_lock(&log->l_grant_write_lock); 2702 spin_lock(&log->l_grant_write_lock);
2689 list_del_init(&tic->t_queue); 2703 error = xlog_writeq_wait(log, tic, need_bytes);
2690 spin_unlock(&log->l_grant_write_lock); 2704 spin_unlock(&log->l_grant_write_lock);
2691 } 2705 }
2692 2706
2693 /* we've got enough space */ 2707 if (error)
2708 return error;
2709
2694 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); 2710 xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
2695 trace_xfs_log_regrant_write_exit(log, tic); 2711 trace_xfs_log_regrant_write_exit(log, tic);
2696 xlog_verify_grant_tail(log); 2712 xlog_verify_grant_tail(log);
2697 return 0; 2713 return 0;
2698 2714}
2699
2700 error_return_unlocked:
2701 spin_lock(&log->l_grant_write_lock);
2702 error_return:
2703 list_del_init(&tic->t_queue);
2704 spin_unlock(&log->l_grant_write_lock);
2705 trace_xfs_log_regrant_write_error(log, tic);
2706
2707 /*
2708 * If we are failing, make sure the ticket doesn't have any
2709 * current reservations. We don't want to add this back when
2710 * the ticket/transaction gets cancelled.
2711 */
2712 tic->t_curr_res = 0;
2713 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2714 return XFS_ERROR(EIO);
2715} /* xlog_regrant_write_log_space */
2716
2717 2715
2718/* The first cnt-1 times through here we don't need to 2716/* The first cnt-1 times through here we don't need to
2719 * move the grant write head because the permanent 2717 * move the grant write head because the permanent
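
The rewritten xlog_regrant_write_log_space() above pushes all queueing and sleeping into two new helpers, xlog_writeq_wake() and xlog_writeq_wait(), which this patch adds earlier in xfs_log.c and which are not visible in the hunks shown here. A minimal sketch of their likely shape, reconstructed from the old inline sleep/wake code being removed (illustrative only, not the verbatim patch):

/* Sketch: wake queued write-grant waiters for as long as space remains. */
STATIC bool
xlog_writeq_wake(
	struct log		*log,
	int			*free_bytes)
{
	struct xlog_ticket	*tic;

	list_for_each_entry(tic, &log->l_writeq, t_queue) {
		ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);

		if (*free_bytes < tic->t_unit_res)
			return false;	/* ran out before waking everyone */
		*free_bytes -= tic->t_unit_res;

		trace_xfs_log_regrant_write_wake_up(log, tic);
		wake_up(&tic->t_wait);
	}
	return true;
}

/* Sketch: queue the ticket and sleep until enough write grant space frees up. */
STATIC int
xlog_writeq_wait(
	struct log		*log,
	struct xlog_ticket	*tic,
	int			need_bytes)
{
	list_add_tail(&tic->t_queue, &log->l_writeq);

	do {
		if (XLOG_FORCED_SHUTDOWN(log))
			goto shutdown;
		xlog_grant_push_ail(log, need_bytes);

		XFS_STATS_INC(xs_sleep_logspace);
		trace_xfs_log_regrant_write_sleep(log, tic);

		/* xlog_wait() drops l_grant_write_lock while sleeping ... */
		xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
		trace_xfs_log_regrant_write_wake(log, tic);

		/* ... so it is re-taken before rechecking the grant head. */
		spin_lock(&log->l_grant_write_lock);
		if (XLOG_FORCED_SHUTDOWN(log))
			goto shutdown;
	} while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes);

	list_del_init(&tic->t_queue);
	return 0;
shutdown:
	list_del_init(&tic->t_queue);
	return XFS_ERROR(EIO);
}

With the helpers factored out, the only failure the caller has to handle is the error value propagated back from xlog_writeq_wait(), which is why the single "if (error) return error;" check replaces the old error_return labels.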
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 78c9039994a..3f7bf451c03 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -137,7 +137,7 @@ struct xfs_trans;
 void	xfs_log_item_init(struct xfs_mount	*mp,
 			struct xfs_log_item	*item,
 			int			type,
-			struct xfs_item_ops	*ops);
+			const struct xfs_item_ops *ops);
 
 xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
 			struct xlog_ticket *ticket,
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5cff443f6cd..0bbb1a41998 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -674,7 +674,8 @@ xfs_qm_dqattach_one(
 	 * disk and we didn't ask it to allocate;
 	 * ESRCH if quotas got turned off suddenly.
 	 */
-	error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
+	error = xfs_qm_dqget(ip->i_mount, ip, id, type,
+			     doalloc | XFS_QMOPT_DOWARN, &dqp);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index aa3dc1a4d53..be5c51d8f75 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -770,6 +770,17 @@ restart:
 	if (!xfs_iflock_nowait(ip)) {
 		if (!(sync_mode & SYNC_WAIT))
 			goto out;
+
+		/*
+		 * If we only have a single dirty inode in a cluster there is
+		 * a fair chance that the AIL push may have pushed it into
+		 * the buffer, but xfsbufd won't touch it until 30 seconds
+		 * from now, and thus we will lock up here.
+		 *
+		 * Promote the inode buffer to the front of the delwri list
+		 * and wake up xfsbufd now.
+		 */
+		xfs_promote_inode(ip);
 		xfs_iflock(ip);
 	}
 
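
xfs_promote_inode() itself is added by this patch in a hunk not shown here. Going by the comment above, it looks up the inode's backing cluster buffer and, if that buffer sits on the delayed-write list, moves it to the front and kicks xfsbufd. A rough sketch of that behaviour (an approximation for illustration, not the exact implementation from the patch):

STATIC void
xfs_promote_inode(
	struct xfs_inode	*ip)
{
	struct xfs_buf		*bp;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));

	/* Find the in-core inode cluster buffer without blocking. */
	bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno,
			ip->i_imap.im_len, XBF_TRYLOCK);
	if (!bp)
		return;

	if (XFS_BUF_ISDELAYWRITE(bp)) {
		xfs_buf_delwri_promote(bp);	/* head of the delwri list */
		wake_up_process(ip->i_mount->m_ddev_targp->bt_task);
	}
	xfs_buf_relse(bp);
}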
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f1d2802b2f0..49403579887 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -834,18 +834,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
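
After this consolidation each grant path carries a single sleep/wake trace pair instead of two numbered variants; for instance, the write-regrant wait loop (see the xfs_log.c changes above) now emits one pair along the lines of:

	XFS_STATS_INC(xs_sleep_logspace);
	trace_xfs_log_regrant_write_sleep(log, tic);	/* was _sleep1 / _sleep2 */
	xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
	trace_xfs_log_regrant_write_wake(log, tic);	/* was _wake1 / _wake2 */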
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 603f3eb5204..3ae713c0abd 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -326,7 +326,7 @@ typedef struct xfs_log_item {
 					      struct xfs_log_item *);
 							/* buffer item iodone */
 							/* callback func */
-	struct xfs_item_ops		*li_ops;	/* function list */
+	const struct xfs_item_ops	*li_ops;	/* function list */
 
 						/* delayed logging */
 	struct list_head		li_cil;		/* CIL pointers */
@@ -341,7 +341,7 @@ typedef struct xfs_log_item {
 	{ XFS_LI_IN_AIL,	"IN_AIL" }, \
 	{ XFS_LI_ABORTED,	"ABORTED" }
 
-typedef struct xfs_item_ops {
+struct xfs_item_ops {
 	uint (*iop_size)(xfs_log_item_t *);
 	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
 	void (*iop_pin)(xfs_log_item_t *);
@@ -352,7 +352,7 @@ typedef struct xfs_item_ops {
 	void (*iop_push)(xfs_log_item_t *);
 	bool (*iop_pushbuf)(xfs_log_item_t *);
 	void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
-} xfs_item_ops_t;
+};
 
 #define IOP_SIZE(ip)	(*(ip)->li_ops->iop_size)(ip)
 #define IOP_FORMAT(ip,vp)	(*(ip)->li_ops->iop_format)(ip, vp)
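
Because xfs_log_item_init() now takes a const pointer (see the xfs_log.h hunk above) and the typedef is gone, each log item type can keep its ops vector in rodata. A hypothetical item type would now be wired up roughly like this (the xfs_foo_* names and XFS_LI_FOO are placeholders, not real XFS symbols):

static uint xfs_foo_item_size(xfs_log_item_t *lip);
static void xfs_foo_item_format(xfs_log_item_t *lip, struct xfs_log_iovec *vecp);

/* const: the function table can never change at run time */
static const struct xfs_item_ops xfs_foo_item_ops = {
	.iop_size	= xfs_foo_item_size,
	.iop_format	= xfs_foo_item_format,
	/* ... remaining iop_* callbacks ... */
};

	/* in the item constructor: */
	xfs_log_item_init(mp, &foo->foo_item, XFS_LI_FOO, &xfs_foo_item_ops);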
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4ecf2a54906..ce9268a2f56 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -112,7 +112,7 @@ xfs_readlink(
 	char		*link)
 {
 	xfs_mount_t	*mp = ip->i_mount;
-	int		pathlen;
+	xfs_fsize_t	pathlen;
 	int		error = 0;
 
 	trace_xfs_readlink(ip);
@@ -122,13 +122,19 @@ xfs_readlink(
 
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-	ASSERT(S_ISLNK(ip->i_d.di_mode));
-	ASSERT(ip->i_d.di_size <= MAXPATHLEN);
-
 	pathlen = ip->i_d.di_size;
 	if (!pathlen)
 		goto out;
 
+	if (pathlen < 0 || pathlen > MAXPATHLEN) {
+		xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
+			 __func__, (unsigned long long) ip->i_ino,
+			 (long long) pathlen);
+		ASSERT(0);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+
 	if (ip->i_df.if_flags & XFS_IFINLINE) {
 		memcpy(link, ip->i_df.if_u1.if_data, pathlen);
 		link[pathlen] = '\0';
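
The new bounds check matters because xfs_readlink() callers size the destination buffer from MAXPATHLEN, not from the on-disk di_size, so a corrupted symlink length would otherwise let the memcpy() above (or xfs_readlink_bmap() for out-of-line symlinks) run past the end of that buffer. The calling pattern is roughly the following; an approximate sketch of the in-kernel caller, not a verbatim copy:

	char	*link;
	int	error;

	/* Callers allocate a fixed MAXPATHLEN-sized buffer ... */
	link = kmalloc(MAXPATHLEN + 1, GFP_KERNEL);
	if (!link)
		return -ENOMEM;

	/*
	 * ... so xfs_readlink() must never copy more than MAXPATHLEN bytes.
	 * (XFS returns positive errnos internally, hence the negation.)
	 */
	error = -xfs_readlink(ip, link);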